Merge branch 'bugfixes20260103' into 'master'

corilo · corilo · commit a4e1b8a044c3 · 2026-03-16T21:25:07.000Z
Various bug fixes for docker, percentage_assigned, db_jobs, SNR recalibration

See merge request mass-spectrometry/corems!196
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 3.9.3
+current_version = 3.10.0
 commit = False
 tag = False
 
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ CoreMS aims to provide
 
 ## Current Version
 
- `3.9.3`
+ `3.10.0`
 
 ***
 
@@ -335,7 +335,7 @@ UML (unified modeling language) diagrams for Direct Infusion FT-MS and GC-MS cla
 
 If you use CoreMS in your work, please use the following citation:
 
-Version [3.9.3 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.9.3), archived on Zenodo:  
+Version [3.10.0 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.10.0), archived on Zenodo:  
 
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14009575.svg)](https://doi.org/10.5281/zenodo.14009575)
 
diff --git a/corems/__init__.py b/corems/__init__.py
@@ -1,5 +1,5 @@
 __author__ = "Yuri E. Corilo"
-__version__ = "3.9.3"
+__version__ = "3.10.0"
 import time
 import os
 import sys
diff --git a/corems/encapsulation/factory/processingSetting.py b/corems/encapsulation/factory/processingSetting.py
@@ -893,7 +893,7 @@ class MolecularFormulaSearchSettings:
     url_database : str, optional
         URL for the database. Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
     db_jobs : int, optional
-        Number of jobs to use for database queries. Default is 3.
+        Number of jobs to use for database queries. Default is 1. Can be increased to 3 when the Python environment supports it.
     db_chunk_size : int, optional
         Chunk size to use for database queries. Default is 300.
     ion_charge : int, optional
@@ -981,7 +981,7 @@ class MolecularFormulaSearchSettings:
         "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
     )
 
-    db_jobs: int = 3
+    db_jobs: int = 1
 
     db_chunk_size: int = 300
 
diff --git a/corems/mass_spectrum/calc/Calibration.py b/corems/mass_spectrum/calc/Calibration.py
@@ -210,6 +210,8 @@ def find_calibration_points(
                 Some software does this the other way around and value signs must be inverted for that to work.
         calib_snr_threshold : float, optional
             snr threshold for finding calibration masses in the spectrum. The default is 5.
+            If SNR data is unavailable, peaks are instead kept when their abundance is at or above the
+            intensity percentile given by: percentile = max(5, 100 - calib_snr_threshold)
 
         Returns
         -------
@@ -220,15 +222,54 @@ def find_calibration_points(
 
         """
 
+        # Check if SNR data is available by testing the first peak
+        use_snr = False
+        if len(self.mass_spectrum.mspeaks) > 0:
+            first_peak = self.mass_spectrum.mspeaks[0]
+            if (hasattr(first_peak, 'signal_to_noise') and 
+                first_peak.signal_to_noise is not None and 
+                not np.isnan(first_peak.signal_to_noise) and
+                first_peak.signal_to_noise > 0):
+                use_snr = True
+
         # This approach is much more efficient and expedient than the original implementation.
         peaks_mz = []
-        for x in self.mass_spectrum.mspeaks:
-            if x.signal_to_noise > calib_snr_threshold:
+        peaks_intensity = []
+        
+        if use_snr:
+            # Use SNR filtering
+            for x in self.mass_spectrum.mspeaks:
+                if x.signal_to_noise > calib_snr_threshold:
+                    if self.mzsegment:
+                        if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
+                            peaks_mz.append(x.mz_exp)
+                    else:
+                        peaks_mz.append(x.mz_exp)
+        else:
+            # Fallback to intensity percentile filtering
+            intensity_percentile = max(5, 100 - calib_snr_threshold)
+            warnings.warn(
+                f"SNR data unavailable for calibration. Using intensity-based filtering instead. "
+                f"SNR threshold of {calib_snr_threshold} corresponds to intensity percentile >= {intensity_percentile}%."
+            )
+            
+            # Collect all peaks and their intensities
+            all_peaks_data = []
+            for x in self.mass_spectrum.mspeaks:
                 if self.mzsegment:
                     if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
-                        peaks_mz.append(x.mz_exp)
+                        all_peaks_data.append((x.mz_exp, x.abundance))
                 else:
-                    peaks_mz.append(x.mz_exp)
+                    all_peaks_data.append((x.mz_exp, x.abundance))
+            
+            if all_peaks_data:
+                peaks_mz_list, intensities = zip(*all_peaks_data)
+                intensity_threshold = np.percentile(intensities, intensity_percentile)
+                
+                for mz, intensity in all_peaks_data:
+                    if intensity >= intensity_threshold:
+                        peaks_mz.append(mz)
+        
         peaks_mz = np.asarray(peaks_mz)
 
         if calibration_ref_match_method == "legacy":
@@ -549,7 +590,7 @@ def run(self):
         This function runs the calibration routine.
 
         """
-        calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold
+        calib_snr_threshold = self.mass_spectrum.settings.calib_sn_threshold
         max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error
         min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error
         calib_pol_order = self.mass_spectrum.settings.calib_pol_order
@@ -570,7 +611,7 @@ def run(self):
         cal_peaks_mz, cal_refs_mz = self.find_calibration_points(
             df_ref,
             calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error),
-            calib_snr_threshold=calib_ppm_error_threshold,
+            calib_snr_threshold=calib_snr_threshold,
             calibration_ref_match_method=calibration_ref_match_method,
             calibration_ref_match_tolerance=calibration_ref_match_tolerance,
             calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit,
diff --git a/corems/mass_spectrum/calc/MassSpectrumCalc.py b/corems/mass_spectrum/calc/MassSpectrumCalc.py
@@ -38,16 +38,35 @@ class MassSpecCalc(PeakPicking, NoiseThresholdCalc):
         Calculate the weight average molecular weight
     """
 
-    def percentile_assigned(self, report_error: bool = False, mute_output: bool = False):
+    def percentage_assigned(self, report_error: bool = False, mute_output: bool = False):
         """Percentage of peaks which are assigned
 
+        Calculates the percentage and relative abundance of assigned peaks in the spectrum.
+        Includes protection against division by zero with explicit handling of edge cases.
+
         Parameters
         -----------
         report_error: bool, optional
-            Report the error of the assigned peaks. Default is False.
+            Report the RMS error of the assigned peaks. Default is False.
         mute_output: bool, optional
             Override the verbose setting. Default is False.
             If True, the function will silence results
+
+        Returns
+        -------
+        tuple
+            If report_error is False:
+                (assigned_count, unassigned_count, total_percent, total_relative_abundance)
+            If report_error is True:
+                (assigned_count, unassigned_count, total_percent, total_relative_abundance, rms_error)
+                where rms_error is None if no assigned peaks exist
+        
+        Notes
+        -----
+        Edge cases are handled with explicit reporting:
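+        - If no peaks detected: returns (0, 0, 0.0, 0.0[, None]); a message is printed when verbose and not muted
+        - If total abundance is zero: returns (assigned_count, unassigned_count, 0.0, 0.0[, None]); a message is printed when verbose and not muted
+        - If report_error is True and no peaks are assigned: rms_error is None; when verbose and not muted, the printed message says so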
         """
         verbose = self.parameters.mass_spectrum.verbose_processing
         assign_abun = 0
@@ -67,15 +86,45 @@ def percentile_assigned(self, report_error: bool = False, mute_output: bool = Fa
                 j += 1
                 not_assign_abun += mspeak.abundance
 
-        total_percent = (i / (i + j)) * 100
-        total_relative_abundance = (assign_abun / (not_assign_abun + assign_abun)) * 100
+        # Protect against division by zero
+        total_peaks = i + j
+        total_abundance = assign_abun + not_assign_abun
+        
+        # Handle edge cases
+        if total_peaks == 0:
+            if verbose and not mute_output:
+                print("No peaks detected in spectrum")
+            if report_error:
+                return i, j, 0.0, 0.0, None
+            else:
+                return i, j, 0.0, 0.0
+        
+        if total_abundance == 0:
+            if verbose and not mute_output:
+                print("No abundance data detected in spectrum")
+            if report_error:
+                return i, j, 0.0, 0.0, None
+            else:
+                return i, j, 0.0, 0.0
+        
+        total_percent = (i / total_peaks * 100) if total_peaks > 0 else 0.0
+        total_relative_abundance = (assign_abun / total_abundance * 100) if total_abundance > 0 else 0.0
+        
         if report_error:
-            rms_error = sqrt(mean(array(error) ** 2))
+            rms_error = None
+            if i > 0:
+                rms_error = sqrt(mean(array(error) ** 2))
             if verbose and not mute_output:
-                print(
-                    "%i assigned peaks and %i unassigned peaks, total  = %.2f %%, relative abundance = %.2f %%, RMS error (best candidate) (ppm) = %.3f"
-                    % (i, j, total_percent, total_relative_abundance, rms_error)
-                )
+                if i == 0:
+                    print(
+                        "No assigned peaks detected - cannot calculate RMS error. %i unassigned peaks, total = %.2f %%, relative abundance = %.2f %%"
+                        % (j, total_percent, total_relative_abundance)
+                    )
+                else:
+                    print(
+                        "%i assigned peaks and %i unassigned peaks, total  = %.2f %%, relative abundance = %.2f %%, RMS error (best candidate) (ppm) = %.3f"
+                        % (i, j, total_percent, total_relative_abundance, rms_error)
+                    )
             return i, j, total_percent, total_relative_abundance, rms_error
 
         else:
@@ -91,6 +140,33 @@ def percentile_assigned(self, report_error: bool = False, mute_output: bool = Fa
                 )
             return i, j, total_percent, total_relative_abundance
 
+    def percentile_assigned(self, report_error: bool = False, mute_output: bool = False):
+        """Deprecated: Use percentage_assigned() instead.
+
+        This method is deprecated and will be removed in a future version.
+        The function returns a percentage, not a percentile, so the name has been corrected.
+
+        Parameters
+        -----------
+        report_error: bool, optional
+            Report the RMS error of the assigned peaks. Default is False.
+        mute_output: bool, optional
+            Override the verbose setting. Default is False.
+
+        Returns
+        -------
+        tuple
+            Refer to percentage_assigned() for return value details.
+        """
+        import warnings
+        warnings.warn(
+            "percentile_assigned() is deprecated and will be removed in a future version. "
+            "Use percentage_assigned() instead, as the function returns a percentage, not a percentile.",
+            DeprecationWarning,
+            stacklevel=2
+        )
+        return self.percentage_assigned(report_error=report_error, mute_output=mute_output)
+
     def resolving_power_calc(self, B: float, T: float):
         """Calculate the theoretical resolving power
 
diff --git a/corems/molecular_id/factory/MolecularLookupTable.py b/corems/molecular_id/factory/MolecularLookupTable.py
@@ -432,11 +432,12 @@ def runworker(self, molecular_search_settings, **kwargs):
             # each chunk takes ~600Mb of memory, so if using 8 processes the total free memory needs to be 5GB
             if settings.db_jobs > 1:
                 list_insert_chunks = list(chunks(all_results, self.sql_db.chunks_count))
-                print(
-                    "Started database insert using {} iterations for a total of {} rows".format(
-                        len(list_insert_chunks), len(all_results)
+                if verbose:
+                    print(
+                        "Started database insert using {} iterations for a total of {} rows".format(
+                            len(list_insert_chunks), len(all_results)
+                        )
                     )
-                )
                 worker_args = [
                     (chunk, settings.url_database) for chunk in list_insert_chunks
                 ]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,15 +1,22 @@
-version: '3.1'
-
 services:
   molformdb:
-    image: postgres
+    image: postgres:18
     restart: always
-    volumes:
-      - db-volume:/var/lib/postgresql/data
-    ports:
-      - 5432:5432
     env_file:
       - ./.env
+    ports:
+      - "5432:5432"
+    volumes:
+      # mount the PARENT; v18+ will store under /var/lib/postgresql/18/data
+      - db-volume:/var/lib/postgresql
+    environment:
+      # optional but makes layout explicit
+      PGDATA: /var/lib/postgresql/18/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
 
 volumes:
   db-volume:
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
 # This call to setup() does all the work
 setup(
     name="CoreMS",
-    version="3.9.3",
+    version="3.10.0",
     description="Mass Spectrometry Framework for Small Molecules Analysis",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/tests/test_classification.py b/tests/test_classification.py
@@ -17,12 +17,12 @@ def test_heteroatoms_classification(mass_spectrum_ftms, postgres_database):
     mass_spectrum_ftms.molecular_search_settings.usedAtoms = usedAtoms
 
     # Check that there are not assigned peaks
-    assert mass_spectrum_ftms.percentile_assigned()[2] == 0
+    assert mass_spectrum_ftms.percentage_assigned()[2] == 0
     
     SearchMolecularFormulas(mass_spectrum_ftms).run_worker_mass_spectrum()
     
     # Check if search was successful
-    assert mass_spectrum_ftms.percentile_assigned()[2] > 0
+    assert mass_spectrum_ftms.percentage_assigned()[2] > 0
 
     mass_spectrum_by_classes = HeteroatomsClassification(mass_spectrum_ftms)
 
diff --git a/tests/test_molecular_formula_search.py b/tests/test_molecular_formula_search.py
@@ -164,7 +164,7 @@ def test_priorityAssignment(mass_spectrum_ftms, postgres_database):
     # Run the molecular formula search on the mass spectrum object and check the percentage of assigned peaks
     assignOx = OxygenPriorityAssignment(mass_spectrum_ftms)
     assignOx.run()
-    assert mass_spectrum_ftms.percentile_assigned()[0] > 15
+    assert mass_spectrum_ftms.percentage_assigned()[0] > 15
 
     # Test the HeteroatomsClassification class
     mass_spectrum_by_classes = HeteroatomsClassification(mass_spectrum_ftms)