Skip to content

Commit a4e1b8a

Browse files
committed
Merge branch 'bugfixes20260103' into 'master'
Various bug fixes for docker, percentage_assigned, db_jobs, SNR recalibration See merge request mass-spectrometry/corems!196
2 parents a5227ed + ad32d47 commit a4e1b8a

11 files changed

Lines changed: 161 additions & 36 deletions

File tree

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 3.9.3
2+
current_version = 3.10.0
33
commit = False
44
tag = False
55

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ CoreMS aims to provide
4848

4949
## Current Version
5050

51-
`3.9.3`
51+
`3.10.0`
5252

5353
***
5454

@@ -335,7 +335,7 @@ UML (unified modeling language) diagrams for Direct Infusion FT-MS and GC-MS cla
335335
336336
If you use CoreMS in your work, please use the following citation:
337337
338-
Version [3.9.3 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.9.3), archived on Zenodo:
338+
Version [3.10.0 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.10.0), archived on Zenodo:
339339
340340
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14009575.svg)](https://doi.org/10.5281/zenodo.14009575)
341341

corems/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
__author__ = "Yuri E. Corilo"
2-
__version__ = "3.9.3"
2+
__version__ = "3.10.0"
33
import time
44
import os
55
import sys

corems/encapsulation/factory/processingSetting.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,7 @@ class MolecularFormulaSearchSettings:
893893
url_database : str, optional
894894
URL for the database. Default is 'postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp'.
895895
db_jobs : int, optional
896-
Number of jobs to use for database queries. Default is 3.
896+
Number of jobs to use for database queries. Default is 1. Can increase to 3 when python environment supports it.
897897
db_chunk_size : int, optional
898898
Chunk size to use for database queries. Default is 300.
899899
ion_charge : int, optional
@@ -981,7 +981,7 @@ class MolecularFormulaSearchSettings:
981981
"postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
982982
)
983983

984-
db_jobs: int = 3
984+
db_jobs: int = 1
985985

986986
db_chunk_size: int = 300
987987

corems/mass_spectrum/calc/Calibration.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ def find_calibration_points(
210210
Some software does this the other way around and value signs must be inverted for that to work.
211211
calib_snr_threshold : float, optional
212212
snr threshold for finding calibration masses in the spectrum. The default is 5.
213+
If SNR data is unavailable, peaks are filtered by intensity percentile using the formula:
214+
percentile = max(5, 100 - calib_snr_threshold)
213215
214216
Returns
215217
-------
@@ -220,15 +222,54 @@ def find_calibration_points(
220222
221223
"""
222224

225+
# Check if SNR data is available by testing the first peak
226+
use_snr = False
227+
if len(self.mass_spectrum.mspeaks) > 0:
228+
first_peak = self.mass_spectrum.mspeaks[0]
229+
if (hasattr(first_peak, 'signal_to_noise') and
230+
first_peak.signal_to_noise is not None and
231+
not np.isnan(first_peak.signal_to_noise) and
232+
first_peak.signal_to_noise > 0):
233+
use_snr = True
234+
223235
# This approach is much more efficient and expedient than the original implementation.
224236
peaks_mz = []
225-
for x in self.mass_spectrum.mspeaks:
226-
if x.signal_to_noise > calib_snr_threshold:
237+
peaks_intensity = []
238+
239+
if use_snr:
240+
# Use SNR filtering
241+
for x in self.mass_spectrum.mspeaks:
242+
if x.signal_to_noise > calib_snr_threshold:
243+
if self.mzsegment:
244+
if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
245+
peaks_mz.append(x.mz_exp)
246+
else:
247+
peaks_mz.append(x.mz_exp)
248+
else:
249+
# Fallback to intensity percentile filtering
250+
intensity_percentile = max(5, 100 - calib_snr_threshold)
251+
warnings.warn(
252+
f"SNR data unavailable for calibration. Using intensity-based filtering instead. "
253+
f"SNR threshold of {calib_snr_threshold} corresponds to intensity percentile >= {intensity_percentile}%."
254+
)
255+
256+
# Collect all peaks and their intensities
257+
all_peaks_data = []
258+
for x in self.mass_spectrum.mspeaks:
227259
if self.mzsegment:
228260
if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
229-
peaks_mz.append(x.mz_exp)
261+
all_peaks_data.append((x.mz_exp, x.abundance))
230262
else:
231-
peaks_mz.append(x.mz_exp)
263+
all_peaks_data.append((x.mz_exp, x.abundance))
264+
265+
if all_peaks_data:
266+
peaks_mz_list, intensities = zip(*all_peaks_data)
267+
intensity_threshold = np.percentile(intensities, intensity_percentile)
268+
269+
for mz, intensity in all_peaks_data:
270+
if intensity >= intensity_threshold:
271+
peaks_mz.append(mz)
272+
232273
peaks_mz = np.asarray(peaks_mz)
233274

234275
if calibration_ref_match_method == "legacy":
@@ -549,7 +590,7 @@ def run(self):
549590
This function runs the calibration routine.
550591
551592
"""
552-
calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold
593+
calib_snr_threshold = self.mass_spectrum.settings.calib_sn_threshold
553594
max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error
554595
min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error
555596
calib_pol_order = self.mass_spectrum.settings.calib_pol_order
@@ -570,7 +611,7 @@ def run(self):
570611
cal_peaks_mz, cal_refs_mz = self.find_calibration_points(
571612
df_ref,
572613
calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error),
573-
calib_snr_threshold=calib_ppm_error_threshold,
614+
calib_snr_threshold=calib_snr_threshold,
574615
calibration_ref_match_method=calibration_ref_match_method,
575616
calibration_ref_match_tolerance=calibration_ref_match_tolerance,
576617
calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit,

corems/mass_spectrum/calc/MassSpectrumCalc.py

Lines changed: 85 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,35 @@ class MassSpecCalc(PeakPicking, NoiseThresholdCalc):
3838
Calculate the weight average molecular weight
3939
"""
4040

41-
def percentile_assigned(self, report_error: bool = False, mute_output: bool = False):
41+
def percentage_assigned(self, report_error: bool = False, mute_output: bool = False):
4242
"""Percentage of peaks which are assigned
4343
44+
Calculates the percentage and relative abundance of assigned peaks in the spectrum.
45+
Includes protection against division by zero with explicit handling of edge cases.
46+
4447
Parameters
4548
-----------
4649
report_error: bool, optional
47-
Report the error of the assigned peaks. Default is False.
50+
Report the RMS error of the assigned peaks. Default is False.
4851
mute_output: bool, optional
4952
Override the verbose setting. Default is False.
5053
If True, the function will silence results
54+
55+
Returns
56+
-------
57+
tuple
58+
If report_error is False:
59+
(assigned_count, unassigned_count, total_percent, total_relative_abundance)
60+
If report_error is True:
61+
(assigned_count, unassigned_count, total_percent, total_relative_abundance, rms_error)
62+
where rms_error is None if no assigned peaks exist
63+
64+
Notes
65+
-----
66+
Edge cases are handled with explicit reporting:
67+
- If no peaks detected: returns (0, 0, 0.0, 0.0[, None]) with message
68+
- If no abundance data: returns (i, j, 0.0, 0.0[, None]) with message
69+
- If no assigned peaks but peaks exist: returns with rms_error=None and explanatory message
5170
"""
5271
verbose = self.parameters.mass_spectrum.verbose_processing
5372
assign_abun = 0
@@ -67,15 +86,45 @@ def percentile_assigned(self, report_error: bool = False, mute_output: bool = Fa
6786
j += 1
6887
not_assign_abun += mspeak.abundance
6988

70-
total_percent = (i / (i + j)) * 100
71-
total_relative_abundance = (assign_abun / (not_assign_abun + assign_abun)) * 100
89+
# Protect against division by zero
90+
total_peaks = i + j
91+
total_abundance = assign_abun + not_assign_abun
92+
93+
# Handle edge cases
94+
if total_peaks == 0:
95+
if verbose and not mute_output:
96+
print("No peaks detected in spectrum")
97+
if report_error:
98+
return i, j, 0.0, 0.0, None
99+
else:
100+
return i, j, 0.0, 0.0
101+
102+
if total_abundance == 0:
103+
if verbose and not mute_output:
104+
print("No abundance data detected in spectrum")
105+
if report_error:
106+
return i, j, 0.0, 0.0, None
107+
else:
108+
return i, j, 0.0, 0.0
109+
110+
total_percent = (i / total_peaks * 100) if total_peaks > 0 else 0.0
111+
total_relative_abundance = (assign_abun / total_abundance * 100) if total_abundance > 0 else 0.0
112+
72113
if report_error:
73-
rms_error = sqrt(mean(array(error) ** 2))
114+
rms_error = None
115+
if i > 0:
116+
rms_error = sqrt(mean(array(error) ** 2))
74117
if verbose and not mute_output:
75-
print(
76-
"%i assigned peaks and %i unassigned peaks, total = %.2f %%, relative abundance = %.2f %%, RMS error (best candidate) (ppm) = %.3f"
77-
% (i, j, total_percent, total_relative_abundance, rms_error)
78-
)
118+
if i == 0:
119+
print(
120+
"No assigned peaks detected - cannot calculate RMS error. %i unassigned peaks, total = %.2f %%, relative abundance = %.2f %%"
121+
% (j, total_percent, total_relative_abundance)
122+
)
123+
else:
124+
print(
125+
"%i assigned peaks and %i unassigned peaks, total = %.2f %%, relative abundance = %.2f %%, RMS error (best candidate) (ppm) = %.3f"
126+
% (i, j, total_percent, total_relative_abundance, rms_error)
127+
)
79128
return i, j, total_percent, total_relative_abundance, rms_error
80129

81130
else:
@@ -91,6 +140,33 @@ def percentile_assigned(self, report_error: bool = False, mute_output: bool = Fa
91140
)
92141
return i, j, total_percent, total_relative_abundance
93142

143+
def percentile_assigned(self, report_error: bool = False, mute_output: bool = False):
144+
"""Deprecated: Use percentage_assigned() instead.
145+
146+
This method is deprecated and will be removed in a future version.
147+
The function returns a percentage, not a percentile, so the name has been corrected.
148+
149+
Parameters
150+
-----------
151+
report_error: bool, optional
152+
Report the error of the assigned peaks. Default is False.
153+
mute_output: bool, optional
154+
Override the verbose setting. Default is False.
155+
156+
Returns
157+
-------
158+
tuple
159+
Refer to percentage_assigned() for return value details.
160+
"""
161+
import warnings
162+
warnings.warn(
163+
"percentile_assigned() is deprecated and will be removed in a future version. "
164+
"Use percentage_assigned() instead, as the function returns a percentage, not a percentile.",
165+
DeprecationWarning,
166+
stacklevel=2
167+
)
168+
return self.percentage_assigned(report_error=report_error, mute_output=mute_output)
169+
94170
def resolving_power_calc(self, B: float, T: float):
95171
"""Calculate the theoretical resolving power
96172

corems/molecular_id/factory/MolecularLookupTable.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -432,11 +432,12 @@ def runworker(self, molecular_search_settings, **kwargs):
432432
# each chunk takes ~600Mb of memory, so if using 8 processes the total free memory needs to be 5GB
433433
if settings.db_jobs > 1:
434434
list_insert_chunks = list(chunks(all_results, self.sql_db.chunks_count))
435-
print(
436-
"Started database insert using {} iterations for a total of {} rows".format(
437-
len(list_insert_chunks), len(all_results)
435+
if verbose:
436+
print(
437+
"Started database insert using {} iterations for a total of {} rows".format(
438+
len(list_insert_chunks), len(all_results)
439+
)
438440
)
439-
)
440441
worker_args = [
441442
(chunk, settings.url_database) for chunk in list_insert_chunks
442443
]

docker-compose.yml

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,22 @@
1-
version: '3.1'
2-
31
services:
42
molformdb:
5-
image: postgres
3+
image: postgres:18
64
restart: always
7-
volumes:
8-
- db-volume:/var/lib/postgresql/data
9-
ports:
10-
- 5432:5432
115
env_file:
126
- ./.env
7+
ports:
8+
- "5432:5432"
9+
volumes:
10+
# mount the PARENT; v18+ will store under /var/lib/postgresql/18/data
11+
- db-volume:/var/lib/postgresql
12+
environment:
13+
# optional but makes layout explicit
14+
PGDATA: /var/lib/postgresql/18/data
15+
healthcheck:
16+
test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
17+
interval: 10s
18+
timeout: 5s
19+
retries: 5
1320

1421
volumes:
1522
db-volume:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# This call to setup() does all the work
1515
setup(
1616
name="CoreMS",
17-
version="3.9.3",
17+
version="3.10.0",
1818
description="Mass Spectrometry Framework for Small Molecules Analysis",
1919
long_description=long_description,
2020
long_description_content_type="text/markdown",

tests/test_classification.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@ def test_heteroatoms_classification(mass_spectrum_ftms, postgres_database):
1717
mass_spectrum_ftms.molecular_search_settings.usedAtoms = usedAtoms
1818

1919
# Check that there are not assigned peaks
20-
assert mass_spectrum_ftms.percentile_assigned()[2] == 0
20+
assert mass_spectrum_ftms.percentage_assigned()[2] == 0
2121

2222
SearchMolecularFormulas(mass_spectrum_ftms).run_worker_mass_spectrum()
2323

2424
# Check if search was successful
25-
assert mass_spectrum_ftms.percentile_assigned()[2] > 0
25+
assert mass_spectrum_ftms.percentage_assigned()[2] > 0
2626

2727
mass_spectrum_by_classes = HeteroatomsClassification(mass_spectrum_ftms)
2828

0 commit comments

Comments
 (0)