From fdceebac3d306a0630ba992343b3fb2c9f9a0286 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sat, 24 Feb 2024 15:48:00 +0100 Subject: [PATCH 01/54] initial commit --- ms2rescore/feature_generators/ms2.py | 161 +++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 ms2rescore/feature_generators/ms2.py diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py new file mode 100644 index 00000000..8e144574 --- /dev/null +++ b/ms2rescore/feature_generators/ms2.py @@ -0,0 +1,161 @@ +""" +MS2-based feature generator. + +""" + +import logging +import re +from typing import List, Optional, Union +from itertools import chain +from collections import defaultdict + + +import numpy as np +from psm_utils import PSMList +from pyteomics.mass import calculate_mass + +from ms2rescore.feature_generators.base import FeatureGeneratorBase +from ms2rescore.utils import infer_spectrum_path + +logger = logging.getLogger(__name__) + + +class MS2FeatureGenerator(FeatureGeneratorBase): + """DeepLC retention time-based feature generator.""" + + def __init__( + self, + *args, + spectrum_path: Optional[str] = None, + spectrum_id_pattern: str = "(.*)", + **kwargs, + ) -> None: + """ + Generate MS2-based features for rescoring. + + DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to + disable retraining. + + Parameters + ---------- + + Attributes + ---------- + feature_names: list[str] + Names of the features that will be added to the PSMs. + + """ + super().__init__(*args, **kwargs) + self.spectrum_path = spectrum_path + self.spectrum_id_pattern = spectrum_id_pattern + + @property + def feature_names(self) -> List[str]: + return [ + "ln_explained_intensity", + "ln_total_intensity", + "ln_explained_intensity_ratio", + "ln_explained_b_ion_ratio", + "ln_explained_y_ion_ratio", + "longest_b_ion_sequence", + "longest_y_ion_sequence", + "matched_b_ions", + "matched_b_ions_pct", + "matched_y_ions", + "matched_y_ions_pct", + "matched_ions_pct", + ] + + def add_features(self, psm_list: PSMList) -> None: + """Add DeepLC-derived features to PSMs.""" + + logger.info("Adding MS2-derived features to PSMs.") + psm_dict = psm_list.get_psm_dict() + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + + for runs in psm_dict.values(): + for run, psms in runs.items(): + logger.info( + f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..." 
+ ) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + spectrum_filename = infer_spectrum_path(self.spectrum_path, run) + logger.debug(f"Using spectrum file `{spectrum_filename}`") + self._add_features_run(psm_list_run, spectrum_filename) + current_run += 1 + + def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None: + pass + + +def longest_sequence_of_trues(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + +def extract_spectrum_features(annotated_spectrum, peptidoform): + features = defaultdict(list) + b_ions_matched = [False] * (len(peptidoform.sequence)) + y_ions_matched = [False] * (len(peptidoform.sequence)) + + pseudo_count = 0.00001 + + for annotated_peak in annotated_spectrum.spectrum: + features["total_intensity"].append(annotated_peak.intensity) + + if annotated_peak.annotation: + features["matched_intensity"].append(annotated_peak.intensity) + for matched_ion in annotated_peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(annotated_peak.intensity) + y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(annotated_peak.intensity) + b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + + return { + "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), + "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity_ratio": np.log( + np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) + + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched), + "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + +# TODO: keep this here? 
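# Illustrative sketch (hypothetical values): how the longest-ion-series helper and the
# pseudo-count log-ratio features above behave on a toy annotation result.
import numpy as np

def longest_sequence_of_trues(lst):
    max_sequence = current_sequence = 0
    for value in lst:
        current_sequence = current_sequence + 1 if value else 0
        max_sequence = max(max_sequence, current_sequence)
    return max_sequence

b_ions_matched = [True, True, False, True]    # hypothetical b-ion coverage of a 4-residue peptide
matched_intensity = [120.0, 80.0, 45.0]       # intensities of annotated peaks
total_intensity = [120.0, 80.0, 45.0, 300.0]  # intensities of all peaks in the spectrum
pseudo_count = 0.00001

print(longest_sequence_of_trues(b_ions_matched))                              # 2
print(np.log(sum(matched_intensity) / sum(total_intensity) + pseudo_count))   # ln explained-intensity ratio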
+def modification_evidence(): + return + + +# TODO: move to basic feature generator +def extract_psm_features(psm): + return { + "peptide_len": len(psm.peptidoform.sequence), + "expmass": (psm.precursor_mz * psm.get_precursor_charge()) + - (psm.get_precursor_charge() * 1.007276), # TODO: replace by constant + "calcmass": calculate_mass(psm.peptidoform.composition), + } From 5374ed83aa56c075bcd2cf2da1004e049d13ee9d Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sun, 25 Feb 2024 16:44:54 +0100 Subject: [PATCH 02/54] finalize ms2 feature generation --- ms2rescore/core.py | 25 +-- ms2rescore/exceptions.py | 6 + ms2rescore/feature_generators/__init__.py | 2 + ms2rescore/feature_generators/ms2.py | 218 +++++++++++++++------- 4 files changed, 171 insertions(+), 80 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index e5ae26f1..f8ff38d7 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -54,18 +54,18 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # TODO: avoid hard coding feature generators in some way - rt_required = ("deeplc" in config["feature_generators"]) and ( - None in psm_list["retention_time"] - ) - im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( - None in psm_list["ion_mobility"] - ) - logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - if rt_required or im_required: - logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # # TODO: avoid hard coding feature generators in some way + # rt_required = ("deeplc" in config["feature_generators"]) and ( + # None in psm_list["retention_time"] + # ) + # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( + # None in psm_list["ion_mobility"] + # ) + # logger.debug(f"RT required: {rt_required}, IM required: {im_required}") + + # if rt_required or im_required: + # logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + # fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -82,6 +82,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) + exit() # Filter out psms that do not have all added features all_feature_names = {f for fgen in feature_names.values() for f in fgen} diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index f8d1fa01..9b429a30 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -23,3 +23,9 @@ class ModificationParsingError(IDFileParsingError): """Identification file parsing error.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error parsing spectrum file.""" + + pass diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 52440a5f..7a10fd4c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -8,6 +8,7 @@ from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator from ms2rescore.feature_generators.im2deep 
import IM2DeepFeatureGenerator +from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -16,4 +17,5 @@ "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, + "ms2": MS2FeatureGenerator, } diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 8e144574..013bcb44 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -12,13 +12,27 @@ import numpy as np from psm_utils import PSMList -from pyteomics.mass import calculate_mass +from pyteomics import mass, mzml, mgf +from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path +from ms2rescore.exceptions import ParseSpectrumError logger = logging.getLogger(__name__) +FRAGMENTATION_MODELS = { + "cidhcd": FragmentationModel.CidHcd, + "etcid": FragmentationModel.Etcid, + "etd": FragmentationModel.Etd, + "ethcd": FragmentationModel.Ethcd, + "all": FragmentationModel.All, +} +MASS_MODES = { + "average": MassMode.Average, + "monoisotopic": MassMode.Monoisotopic, +} + class MS2FeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -28,14 +42,13 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", + fragmentationmodel: str = "All", + massmode: str = "Monoisotopic", **kwargs, ) -> None: """ Generate MS2-based features for rescoring. - DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to - disable retraining. - Parameters ---------- @@ -48,6 +61,8 @@ def __init__( super().__init__(*args, **kwargs) self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern + self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()] + self.mass_mode = MASS_MODES[massmode.lower()] @property def feature_names(self) -> List[str]: @@ -67,7 +82,7 @@ def feature_names(self) -> List[str]: ] def add_features(self, psm_list: PSMList) -> None: - """Add DeepLC-derived features to PSMs.""" + """Add MS2-derived features to PSMs.""" logger.info("Adding MS2-derived features to PSMs.") psm_dict = psm_list.get_psm_dict() @@ -82,68 +97,128 @@ def add_features(self, psm_list: PSMList) -> None: psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_filename = infer_spectrum_path(self.spectrum_path, run) logger.debug(f"Using spectrum file `{spectrum_filename}`") - self._add_features_run(psm_list_run, spectrum_filename) - current_run += 1 - - def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None: - pass - - -def longest_sequence_of_trues(lst): - max_sequence = 0 - current_sequence = 0 - - for value in lst: - current_sequence = current_sequence + 1 if value else 0 - max_sequence = max(max_sequence, current_sequence) - - return max_sequence - -def extract_spectrum_features(annotated_spectrum, peptidoform): - features = defaultdict(list) - b_ions_matched = [False] * (len(peptidoform.sequence)) - y_ions_matched = [False] * (len(peptidoform.sequence)) - - pseudo_count = 0.00001 - - for annotated_peak in annotated_spectrum.spectrum: - features["total_intensity"].append(annotated_peak.intensity) - - if annotated_peak.annotation: - features["matched_intensity"].append(annotated_peak.intensity) - for matched_ion in annotated_peak.annotation: - if "y" in 
matched_ion.ion: - features["y_ion_matched"].append(annotated_peak.intensity) - y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True - elif "b" in matched_ion.ion: - features["b_ion_matched"].append(annotated_peak.intensity) - b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename) + self._calculate_features(psm_list_run, annotated_spectra) + current_run += 1 - return { - "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), - "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), - "ln_explained_intensity_ratio": np.log( - np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) - + pseudo_count - ), - "ln_explained_b_ion_ratio": np.log( - np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count - ), - "ln_explained_y_ion_ratio": np.log( - np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count - ), - "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched), - "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched), - "matched_b_ions": sum(b_ions_matched), - "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), - "matched_y_ions": sum(y_ions_matched), - "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), - "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) - / (len(b_ions_matched) + len(y_ions_matched)), - } + def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None: + """Calculate features from annotated spectra and add them to the PSMs.""" + for psm, spectrum in zip(psm_list, annotated_spectra): + psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform)) + + def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: + """Retrieve annotated spectra for all psms.""" + + annotated_spectra = [] + spectrum_reader = self._read_spectrum_file(spectrum_file) + + spectrum_id_pattern = re.compile( + self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" + ) + logger.info(spectrum_id_pattern) + logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys()) + + try: + mapper = { + spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id + for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys() + } + except AttributeError: + raise ParseSpectrumError( + "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." 
+ ) + + for psm in psm_list: + pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id]) + annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum) + annotated_spectra.append(annotated_spectrum) + + return annotated_spectra + + @staticmethod + def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: + + if spectrum_filepath.suffix == ".mzML": + return mzml.PreIndexedMzML(str(spectrum_filepath)) + elif spectrum_filepath.suffix == ".mgf": + return mgf.IndexedMGF(str(spectrum_filepath)) + + @staticmethod + def _longest_ion_sequence(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + def extract_spectrum_features(self, annotated_spectrum, peptidoform): + features = defaultdict(list) + b_ions_matched = [False] * (len(peptidoform.sequence)) + y_ions_matched = [False] * (len(peptidoform.sequence)) + + pseudo_count = 0.00001 + + for annotated_peak in annotated_spectrum.spectrum: + features["total_intensity"].append(annotated_peak.intensity) + + if annotated_peak.annotation: + features["matched_intensity"].append(annotated_peak.intensity) + for matched_ion in annotated_peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(annotated_peak.intensity) + y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(annotated_peak.intensity) + b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + + return { + "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), + "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity_ratio": np.log( + np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) + + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), + "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + def _annotate_spectrum(self, psm, spectrum): + + spectrum = RawSpectrum( + title=psm.spectrum_id, + num_scans=1, + rt=psm.retention_time, + precursor_charge=psm.get_precursor_charge(), + precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition), + mz_array=spectrum["m/z array"], + intensity_array=spectrum["intensity array"], + ) + + annotated_spectrum = spectrum.annotate( + peptide=LinearPeptide(psm.peptidoform.proforma), + model=self.fragmentationmodel, + mode=self.massmode, + ) + + return annotated_spectrum # TODO: keep this here? 
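# Illustrative sketch of how `spectrum_id_pattern` is meant to be used (pattern and IDs
# below are hypothetical): the pattern must contain a capturing group whose match equals
# the PSM's spectrum_id, so PSM IDs can be mapped back to native spectrum IDs for lookup.
import re

spectrum_id_pattern = re.compile(r".*scan=(\d+)")  # hypothetical pattern
native_ids = ["controllerType=0 controllerNumber=1 scan=1001"]

mapper = {spectrum_id_pattern.search(native_id).group(1): native_id for native_id in native_ids}
print(mapper["1001"])  # full native ID, ready to pass to spectrum_reader.get_by_id()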
@@ -153,9 +228,16 @@ def modification_evidence(): # TODO: move to basic feature generator def extract_psm_features(psm): + + expmass = (psm.precursor_mz * psm.get_precursor_charge()) - ( + psm.get_precursor_charge() * 1.007276 + ) # TODO: replace by constant + calcmass = mass.calculate_mass(psm.peptidoform.composition) + delta_mass = expmass - calcmass + return { "peptide_len": len(psm.peptidoform.sequence), - "expmass": (psm.precursor_mz * psm.get_precursor_charge()) - - (psm.get_precursor_charge() * 1.007276), # TODO: replace by constant - "calcmass": calculate_mass(psm.peptidoform.composition), + "expmass": expmass, + "calcmass": calcmass, + "delta_mass": delta_mass, } From 60207a390f08cba34730889543a356fd1cb9b06b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sun, 25 Feb 2024 16:45:09 +0100 Subject: [PATCH 03/54] add rustyms --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index be5295ba..beb6b440 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dependencies = [ [project.optional-dependencies] ionmob = ["ionmob>=0.2", "tensorflow"] +ms2 = ["rustyms"] dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"] docs = [ "sphinx", From ae39844a7b46b9e4622c5f66458aae5146e3776b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 11:04:37 +0100 Subject: [PATCH 04/54] remove exit statement fixed IM required value --- ms2rescore/core.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index f8ff38d7..6e0e547c 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -54,18 +54,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # # TODO: avoid hard coding feature generators in some way - # rt_required = ("deeplc" in config["feature_generators"]) and ( - # None in psm_list["retention_time"] - # ) - # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( - # None in psm_list["ion_mobility"] - # ) - # logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - # if rt_required or im_required: - # logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - # fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # TODO: avoid hard coding feature generators in some way + rt_required = ("deeplc" in config["feature_generators"]) and ( + None in psm_list["retention_time"] + ) + im_required = ( + "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] + ) and (None in psm_list["ion_mobility"]) + + logger.info(f"RT required: {rt_required}, IM required: {im_required}") + + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -82,7 +83,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) - exit() # Filter out psms that do not have all added features all_feature_names = {f for fgen in feature_names.values() for f in fgen} From 
9b98c4df3b13d0ab935172a5cfeca8bdbb6159f4 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:05:23 +0100 Subject: [PATCH 05/54] change logger.info to debug --- ms2rescore/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 6e0e547c..032b1463 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -11,10 +11,12 @@ from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator +from ms2rescore.utils import profile logger = logging.getLogger(__name__) +@profile def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MS²Rescore workflow with passed configuration. @@ -62,7 +64,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] ) and (None in psm_list["ion_mobility"]) - logger.info(f"RT required: {rt_required}, IM required: {im_required}") + logger.debug(f"RT required: {rt_required}, IM required: {im_required}") if rt_required or im_required: logger.info("Parsing missing retention time and/or ion mobility values from spectra...") From 5e45756726582fa54e63f2d572c400332bcaa683 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:06:34 +0100 Subject: [PATCH 06/54] added profile decorator to get timings for functions --- ms2rescore/utils.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 70417b56..982664b5 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,6 +4,9 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import cProfile +import pstats +import io from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -76,3 +79,22 @@ def infer_spectrum_path( ) return Path(resolved_path) + + +def profile(fnc): + """A decorator that uses cProfile to profile a function""" + + def inner(*args, **kwargs): + + pr = cProfile.Profile() + pr.enable() + retval = fnc(*args, **kwargs) + pr.disable() + s = io.StringIO() + sortby = "time" + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + logger.info(s.getvalue()) + return retval + + return inner From 304777cfa171cf5b7abf5cd14640e971f6510167 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:13 +0100 Subject: [PATCH 07/54] removed profile as standard rescore debug statement --- ms2rescore/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 032b1463..d0a52a4e 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -11,12 +11,10 @@ from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.utils import profile logger = logging.getLogger(__name__) -@profile def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MS²Rescore workflow with passed configuration. 
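# Minimal usage sketch for the cProfile-based `profile` decorator added to
# ms2rescore/utils.py in the patch above; the decorated function and its workload are
# hypothetical. Each call logs a pstats report sorted by "time", provided logging is
# configured at INFO level.
from ms2rescore.utils import profile

@profile
def slow_step(n: int) -> int:
    return sum(i * i for i in range(n))

slow_step(1_000_000)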
From 95ee4755325cd85990473cf15313c92e8c2f567a Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:21 +0100 Subject: [PATCH 08/54] added new basic features --- ms2rescore/feature_generators/basic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index bc2222ee..df005167 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None: charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] + peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list]) has_charge = None not in charge_states has_mz = None not in precursor_mzs and has_charge @@ -74,6 +75,14 @@ def add_features(self, psm_list: PSMList) -> None: if has_score: self._feature_names.append("search_engine_score") + if has_mz and has_charge: + experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) + theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) + mass_error = experimental_mass - theoretical_mass + self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"]) + + self._feature_names.append("pep_len") + for i, psm in enumerate(psm_list): psm.rescoring_features.update( dict( @@ -81,6 +90,10 @@ def add_features(self, psm_list: PSMList) -> None: **charge_one_hot[i] if has_charge else {}, **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {}, **{"search_engine_score": scores[i]} if has_score else {}, + **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {}, + **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {}, + **{"mass_error": mass_error[i]} if has_mz and has_charge else {}, + **{"pep_len": peptide_lengths[i]}, ) ) From 73f4573331a82ea3984457333d3c701736f4d86d Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:47 +0100 Subject: [PATCH 09/54] fixes for ms2 feature generator, removed multiprocessing --- ms2rescore/feature_generators/ms2.py | 149 +++++++++++++++------------ 1 file changed, 81 insertions(+), 68 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 013bcb44..997accb2 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,6 +4,7 @@ """ import logging +import multiprocessing import re from typing import List, Optional, Union from itertools import chain @@ -14,6 +15,7 @@ from psm_utils import PSMList from pyteomics import mass, mzml, mgf from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode +from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path @@ -42,8 +44,9 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", - fragmentationmodel: str = "All", - massmode: str = "Monoisotopic", + fragmentation_model: str = "All", + mass_mode: str = "Monoisotopic", + processes: int = 1, **kwargs, ) -> None: """ @@ -61,8 +64,9 @@ def __init__( super().__init__(*args, **kwargs) self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern - self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()] - self.mass_mode = MASS_MODES[massmode.lower()] + self.fragmentation_model = 
FRAGMENTATION_MODELS[fragmentation_model.lower()] + self.mass_mode = MASS_MODES[mass_mode.lower()] + self.processes = processes @property def feature_names(self) -> List[str]: @@ -98,26 +102,17 @@ def add_features(self, psm_list: PSMList) -> None: spectrum_filename = infer_spectrum_path(self.spectrum_path, run) logger.debug(f"Using spectrum file `{spectrum_filename}`") - annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename) - self._calculate_features(psm_list_run, annotated_spectra) + self._calculate_features(psm_list_run, spectrum_filename) current_run += 1 - def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None: - """Calculate features from annotated spectra and add them to the PSMs.""" - for psm, spectrum in zip(psm_list, annotated_spectra): - psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform)) - - def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: + def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: """Retrieve annotated spectra for all psms.""" - annotated_spectra = [] spectrum_reader = self._read_spectrum_file(spectrum_file) spectrum_id_pattern = re.compile( self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" ) - logger.info(spectrum_id_pattern) - logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys()) try: mapper = { @@ -128,20 +123,43 @@ def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: raise ParseSpectrumError( "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." ) - - for psm in psm_list: - pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id]) - annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum) - annotated_spectra.append(annotated_spectrum) - - return annotated_spectra + annotated_spectra = [ + self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id])) + for psm in psm_list + ] + for psm, annotated_spectrum in zip(psm_list, annotated_spectra): + psm.rescoring_features.update( + self._calculate_spectrum_features(psm, annotated_spectrum) + ) + # with multiprocessing.Pool(self.processes) as pool: + # counts_failed = 0 + # for psm, features in zip( + # psm_list, + # track( + # pool.imap( + # self._calculate_spectrum_features_wrapper, + # zip(psm_list, annotated_spectra), + # chunksize=1000, + # ), + # total=len(psm_list), + # description="Calculating MS2 features...", + # transient=True, + # ), + # ): + # if features: + # psm.rescoring_features.update(features) + + # else: + # counts_failed += 1 + # if counts_failed > 0: + # logger.warning(f"Failed to calculate features for {counts_failed} PSMs") @staticmethod def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: - if spectrum_filepath.suffix == ".mzML": + if spectrum_filepath.suffix.lower() == ".mzml": return mzml.PreIndexedMzML(str(spectrum_filepath)) - elif spectrum_filepath.suffix == ".mgf": + elif spectrum_filepath.suffix.lower() == ".mgf": return mgf.IndexedMGF(str(spectrum_filepath)) @staticmethod @@ -155,40 +173,48 @@ def _longest_ion_sequence(lst): return max_sequence - def extract_spectrum_features(self, annotated_spectrum, peptidoform): + def _calculate_spectrum_features(self, psm, annotated_spectrum): + features = defaultdict(list) - b_ions_matched = [False] * (len(peptidoform.sequence)) - y_ions_matched = [False] * (len(peptidoform.sequence)) + b_ions_matched = [False] * 
(len(psm.peptidoform.sequence)) + y_ions_matched = [False] * (len(psm.peptidoform.sequence)) pseudo_count = 0.00001 + ion_fragment_regex = re.compile(r"\d+") - for annotated_peak in annotated_spectrum.spectrum: - features["total_intensity"].append(annotated_peak.intensity) + for peak in annotated_spectrum: + features["total_intensity"].append(peak.intensity) - if annotated_peak.annotation: - features["matched_intensity"].append(annotated_peak.intensity) - for matched_ion in annotated_peak.annotation: + if peak.annotation: + features["matched_intensity"].append(peak.intensity) + for matched_ion in peak.annotation: if "y" in matched_ion.ion: - features["y_ion_matched"].append(annotated_peak.intensity) - y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + features["y_ion_matched"].append(peak.intensity) + y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) elif "b" in matched_ion.ion: - features["b_ion_matched"].append(annotated_peak.intensity) - b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + features["b_ion_matched"].append(peak.intensity) + b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) + + total_intensity_sum = np.sum(features["total_intensity"]) + matched_intensity_sum = np.sum(features["matched_intensity"]) + b_ion_matched_sum = np.sum(features["b_ion_matched"]) + y_ion_matched_sum = np.sum(features["y_ion_matched"]) return { - "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), - "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), + "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), "ln_explained_intensity_ratio": np.log( - np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) - + pseudo_count + matched_intensity_sum / total_intensity_sum + pseudo_count ), "ln_explained_b_ion_ratio": np.log( - np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count + b_ion_matched_sum / matched_intensity_sum + pseudo_count ), "ln_explained_y_ion_ratio": np.log( - np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count + y_ion_matched_sum / matched_intensity_sum + pseudo_count ), "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), @@ -200,44 +226,31 @@ def extract_spectrum_features(self, annotated_spectrum, peptidoform): / (len(b_ions_matched) + len(y_ions_matched)), } - def _annotate_spectrum(self, psm, spectrum): + def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple): + psm, spectrum = psm_spectrum_tuple + return self._calculate_spectrum_features(psm, spectrum) + + def _annotate_spectrum(self, psm, pyteomics_spectrum): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition), - mz_array=spectrum["m/z array"], - intensity_array=spectrum["intensity array"], + precursor_mass=mass.calculate_mass(psm.peptidoform.composition), + mz_array=pyteomics_spectrum["m/z array"], + intensity_array=pyteomics_spectrum["intensity array"], ) annotated_spectrum = spectrum.annotate( peptide=LinearPeptide(psm.peptidoform.proforma), - model=self.fragmentationmodel, - mode=self.massmode, + model=self.fragmentation_model, + 
mode=self.mass_mode, ) - return annotated_spectrum + return annotated_spectrum.spectrum # TODO: keep this here? def modification_evidence(): return - - -# TODO: move to basic feature generator -def extract_psm_features(psm): - - expmass = (psm.precursor_mz * psm.get_precursor_charge()) - ( - psm.get_precursor_charge() * 1.007276 - ) # TODO: replace by constant - calcmass = mass.calculate_mass(psm.peptidoform.composition) - delta_mass = expmass - calcmass - - return { - "peptide_len": len(psm.peptidoform.sequence), - "expmass": expmass, - "calcmass": calcmass, - "delta_mass": delta_mass, - } From 947233ee0039757676ea5b62daaacf2e1059089f Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 28 Feb 2024 11:12:32 +0100 Subject: [PATCH 10/54] return empty list on parsing error with rustyms, removed multiprocessing --- ms2rescore/feature_generators/ms2.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 997accb2..b292d51a 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,7 +4,6 @@ """ import logging -import multiprocessing import re from typing import List, Optional, Union from itertools import chain @@ -15,7 +14,6 @@ from psm_utils import PSMList from pyteomics import mass, mzml, mgf from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode -from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path @@ -62,6 +60,7 @@ def __init__( """ super().__init__(*args, **kwargs) + self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] @@ -100,7 +99,6 @@ def add_features(self, psm_list: PSMList) -> None: ) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") self._calculate_features(psm_list_run, spectrum_filename) current_run += 1 @@ -175,6 +173,9 @@ def _longest_ion_sequence(lst): def _calculate_spectrum_features(self, psm, annotated_spectrum): + if not annotated_spectrum: + return {} + features = defaultdict(list) b_ions_matched = [False] * (len(psm.peptidoform.sequence)) y_ions_matched = [False] * (len(psm.peptidoform.sequence)) @@ -241,12 +242,14 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): mz_array=pyteomics_spectrum["m/z array"], intensity_array=pyteomics_spectrum["intensity array"], ) - - annotated_spectrum = spectrum.annotate( - peptide=LinearPeptide(psm.peptidoform.proforma), - model=self.fragmentation_model, - mode=self.mass_mode, - ) + try: + annotated_spectrum = spectrum.annotate( + peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]), + model=self.fragmentation_model, + mode=self.mass_mode, + ) + except: # noqa E722 + return [] return annotated_spectrum.spectrum From 24ce565542fb42cea534b81b53093435dba2a36c Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 15 Mar 2024 09:47:02 +0100 Subject: [PATCH 11/54] add deeplc_calibration psm set --- ms2rescore/feature_generators/deeplc.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 30bea716..207f8ba1 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ 
b/ms2rescore/feature_generators/deeplc.py @@ -25,6 +25,7 @@ import numpy as np from psm_utils import PSMList +from psm_utils.io import read_file from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -40,6 +41,7 @@ def __init__( *args, lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, + calibration_set: Union[str, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -71,6 +73,7 @@ def __init__( self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size + self.calibration_set = calibration_set self.processes = processes self.deeplc_kwargs = kwargs or {} @@ -120,7 +123,9 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - + if self.calibration_set: + logger.info("Loading calibration set...") + self.calibration_set = read_file(self.calibration_set, filetype="tsv") for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -138,13 +143,19 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - psm_list_calibration = self._get_calibration_psms(psm_list_run) + if self.calibration_set: + psm_list_calibration = self.calibration_set[ + self.calibration_set["run"] == run + ] + else: + psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, From 33c38b04e304e6b865714adb26b813c647911628 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 17 Apr 2024 17:10:38 +0200 Subject: [PATCH 12/54] remove unused import --- ms2rescore/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 28249c51..3515893a 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,9 +4,6 @@ from glob import glob from pathlib import Path from typing import Optional, Union -import cProfile -import pstats -import io from ms2rescore.exceptions import MS2RescoreConfigurationError From 11fdc51ad70c225393553f5964f7485a60b7ae19 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 21 Aug 2024 13:25:10 +0200 Subject: [PATCH 13/54] integrate mumble into ms2branch --- ms2rescore/exceptions.py | 6 +++ ms2rescore/package_data/config_default.json | 1 + ms2rescore/package_data/config_schema.json | 42 +++++++++++++++++++++ ms2rescore/parse_psms.py | 14 +++++++ pyproject.toml | 1 + 5 files changed, 64 insertions(+) diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index ba5492a2..68b1b43d 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError): """Error while rescoring PSMs.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 733ea707..126fde8b 100644 --- 
a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -12,6 +12,7 @@ }, "maxquant": {} }, + "psm_generator": {}, "rescoring_engine": { "mokapot": { "train_fdr": 0.01, diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index ab77f3b7..40471714 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -62,6 +62,18 @@ "mokapot": {} } }, + "psm_generator": { + "description": "PSM generator and their configuration.", + "type": "object", + "minProperties": 0, + "maxProperties": 1, + "patternProperties": { + ".*": { "$ref": "#/definitions/psm_generator" }, + "mumble": { + "$ref": "#/definitions/mumble" + } + } + }, "config_file": { "description": "Path to configuration file", "oneOf": [{ "type": "string" }, { "type": "null" }] @@ -179,6 +191,7 @@ "type": "boolean", "default": false } + } } }, @@ -193,6 +206,11 @@ "type": "object", "additionalProperties": true }, + "psm_generator": { + "description": "Additional PSM feature generator configuration", + "type": "object", + "additionalProperties": true + }, "basic": { "$ref": "#/definitions/feature_generator", "description": "Basic feature generator configuration", @@ -273,6 +291,30 @@ } } }, + "mumble": { + "$ref": "#/definitions/psm_generator", + "description": "Mumble PSM generator configuration using Mubmle", + "type": "object", + "additionalProperties": true, + "properties": { + "aa_combinations": { + "description": "Additional amino acid combinations to consider as mass shift", + "type": "integer", + "default": 0 + }, + "fasta_file": { + "description": "Maximum number of modifications per peptide", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": false + }, + "mass_error": { + "description": "mass error in Da", + "type": "number", + "default": 0.2 + } + } + }, + "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 6a3c6065..ae28b519 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -5,6 +5,7 @@ import numpy as np import psm_utils.io from psm_utils import PSMList +from mumble import PSMHandler from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -90,6 +91,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids + # Addition of Modifications for mass shifts in the PSMs with Mumble + if "mumble" in config["psm_generator"]: + logger.debug("Applying modifications for mass shifts using Mumble...") + mumble_config = config["psm_generator"]["mumble"] + psm_handler = PSMHandler( + **mumble_config, # TODO how do we store config for mumble? 
+ ) + psm_list = psm_handler.add_modified_psms( + psm_list, generate_modified_decoys=True, keep_original=True + ) + if mumble_config["output_file"]: + psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"]) + return psm_list diff --git a/pyproject.toml b/pyproject.toml index f57ad800..7c1b0a9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", + "mumble" ] [project.optional-dependencies] From 883169a90cf7c57970dac1085ba54defbf1ffdb9 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 27 Sep 2024 09:58:27 +0200 Subject: [PATCH 14/54] temp removal of sage features before rescoring --- ms2rescore/parse_psms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index ae28b519..91c94168 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -95,6 +95,15 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if "mumble" in config["psm_generator"]: logger.debug("Applying modifications for mass shifts using Mumble...") mumble_config = config["psm_generator"]["mumble"] + for psm in psm_list: + psm["rescoring_features"].pop("expmass", None) + psm["rescoring_features"].pop("calcmass", None) + psm["rescoring_features"].pop("peptide_len", None) + psm["rescoring_features"].pop("matched_peaks", None) + psm["rescoring_features"].pop("longest_b", None) + psm["rescoring_features"].pop("longest_y", None) + psm["rescoring_features"].pop("longest_y_pct", None) + psm["rescoring_features"].pop("matched_intensity_pct", None) psm_handler = PSMHandler( **mumble_config, # TODO how do we store config for mumble? ) From da39ae81d052b4391ada17fd24bb5a856078e8a8 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 8 Nov 2024 15:41:26 +0100 Subject: [PATCH 15/54] remove psm_file features when rescoring with mumble --- ms2rescore/parse_psms.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 91c94168..309c1c96 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -95,15 +95,14 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if "mumble" in config["psm_generator"]: logger.debug("Applying modifications for mass shifts using Mumble...") mumble_config = config["psm_generator"]["mumble"] - for psm in psm_list: - psm["rescoring_features"].pop("expmass", None) - psm["rescoring_features"].pop("calcmass", None) - psm["rescoring_features"].pop("peptide_len", None) - psm["rescoring_features"].pop("matched_peaks", None) - psm["rescoring_features"].pop("longest_b", None) - psm["rescoring_features"].pop("longest_y", None) - psm["rescoring_features"].pop("longest_y_pct", None) - psm["rescoring_features"].pop("matched_intensity_pct", None) + + # Check if psm_list[0].rescoring_features is empty or not + if psm_list[0].rescoring_features: + logger.debug("Removing psm_file rescoring features from PSMs...") + # psm_list.remove_rescoring_features() # TODO add this to psm_utils + for psm in psm_list: + psm.rescoring_features = {} + psm_handler = PSMHandler( **mumble_config, # TODO how do we store config for mumble? 
) From 37fff28ebc03fcf66f98be5a5c51ca7e3b68e97b Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 19 Nov 2024 16:10:43 +0100 Subject: [PATCH 16/54] linting --- ms2rescore/core.py | 5 ++++- ms2rescore/feature_generators/__init__.py | 4 ++-- ms2rescore/feature_generators/deeplc.py | 8 +++++--- ms2rescore/feature_generators/ionmob.py | 11 +++++++++-- ms2rescore/feature_generators/ms2pip.py | 5 ++++- ms2rescore/gui/__main__.py | 2 +- ms2rescore/parse_psms.py | 2 +- ms2rescore/rescoring_engines/percolator.py | 2 +- 8 files changed, 27 insertions(+), 12 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index a02ab4f6..48e125bc 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -14,7 +14,10 @@ from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence +from ms2rescore.rescoring_engines.mokapot import ( + add_peptide_confidence, + add_psm_confidence, +) logger = logging.getLogger(__name__) diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 7a10fd4c..9eff1b5c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -4,11 +4,11 @@ from ms2rescore.feature_generators.basic import BasicFeatureGenerator from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator +from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator -from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator -from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator +from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 206d6a21..e3113544 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -143,9 +143,11 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w", encoding="utf-8") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) if self.calibration_set: diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 7fa0c0a1..e9b51026 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -23,12 +23,19 @@ import tensorflow as tf from psm_utils import Peptidoform, PSMList -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import ( + FeatureGeneratorBase, + FeatureGeneratorException, +) try: from ionmob import __file__ as ionmob_file from ionmob.preprocess.data import to_tf_dataset_inference - from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, 
reduced_mobility_to_ccs + from ionmob.utilities.chemistry import ( + VARIANT_DICT, + calculate_mz, + reduced_mobility_to_ccs, + ) from ionmob.utilities.tokenization import tokenizer_from_json from ionmob.utilities.utility import get_ccs_shift except ImportError: diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 3882b3fc..34d427e2 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -37,7 +37,10 @@ from psm_utils import PSMList from rich.progress import track -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import ( + FeatureGeneratorBase, + FeatureGeneratorException, +) from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index 429e117f..57cee3c6 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,8 +1,8 @@ """Entrypoint for MS²Rescore GUI.""" +import contextlib import multiprocessing import os -import contextlib from ms2rescore.gui.app import app diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 309c1c96..cc61926f 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -4,8 +4,8 @@ import numpy as np import psm_utils.io -from psm_utils import PSMList from mumble import PSMHandler +from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index 192950bd..32d856c4 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -19,8 +19,8 @@ import logging import subprocess -from typing import Any, Dict, Optional from copy import deepcopy +from typing import Any, Dict, Optional import psm_utils From e8b59f3ca2c6f1fbaae4ec5cc6b531f25284fad7 Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 19 Nov 2024 16:10:56 +0100 Subject: [PATCH 17/54] add hyperscore calculation --- ms2rescore/feature_generators/ms2.py | 183 ++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 7 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index b292d51a..490f4f47 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -5,19 +5,19 @@ import logging import re -from typing import List, Optional, Union -from itertools import chain from collections import defaultdict - +from itertools import chain +from typing import List, Optional, Union import numpy as np -from psm_utils import PSMList -from pyteomics import mass, mzml, mgf -from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode +import pyopenms as oms +from psm_utils import PSM, Peptidoform, PSMList +from pyteomics import mass, mgf, mzml +from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path -from ms2rescore.exceptions import ParseSpectrumError logger = logging.getLogger(__name__) @@ -45,6 +45,7 @@ def __init__( fragmentation_model: str = "All", mass_mode: str = "Monoisotopic", processes: int = 1, + calculate_hyperscore: bool = True, # Allow optional ? 
**kwargs, ) -> None: """ @@ -66,6 +67,7 @@ def __init__( self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] self.mass_mode = MASS_MODES[mass_mode.lower()] self.processes = processes + self.calculate_hyperscore = calculate_hyperscore @property def feature_names(self) -> List[str]: @@ -82,6 +84,7 @@ def feature_names(self) -> List[str]: "matched_y_ions", "matched_y_ions_pct", "matched_ions_pct", + "hyperscore", ] def add_features(self, psm_list: PSMList) -> None: @@ -129,6 +132,22 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) + + if self.calculate_hyperscore: + # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) + mz_list, intensity_list = _annotated_spectrum_to_mzint( + annotated_spectrum=annotated_spectrum + ) + psm.rescoring_features.update( + { + "hyperscore": calculate_hyperscore( + psm=psm, observed_mz=mz_list, observed_intensity=intensity_list + ) + } + ) + + else: + psm.rescoring_features.update({"hyperscore": np.nan}) # with multiprocessing.Pool(self.processes) as pool: # counts_failed = 0 # for psm, features in zip( @@ -253,7 +272,157 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum + def _calculate_hyperscore(self, psm, spectrum): + pass + + def _calculate_fragmentation_features(self, psm, annotated_spectrum): + pass + # TODO: keep this here? def modification_evidence(): return + + +def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): + + mz_list = [] + intensity_list = [] + + for peak in annotated_spectrum: + + annotations = peak.annotation + for fragment in annotations: + ion = fragment.ion + if ion[0] not in ion_types: + continue + + mz_list.append(peak.experimental_mz) + intensity_list.append(peak.intensity) + break + + return mz_list, intensity_list + + +def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: + """ + Parse a peptidoform object to pyOpenMS compatible format. + + Parameter + --------- + Peptidoform: Peptidoform + Peptide string in Peptidoform format + + Returns + ------- + AASequence (pyOpenMS): + A peptide sequence in pyOpenMS format + int: + charge of the peptide + """ + + n_term = peptidoform.properties["n_term"] + peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + + for aa, mods in peptidoform.parsed_sequence: + peptide_oms_str += aa + if isinstance(mods, list): + peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + + c_term = peptidoform.properties["c_term"] + peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + + peptide_oms = oms.AASequence.fromString(peptide_oms_str) + + return peptide_oms + + +def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: + """ + Create a theoretical spectrum from a peptide sequence. + + Parameter + --------- + peptide: str + Peptide sequence in proforma format + engine: str + The engine to use to create theoretical spectrum. 
+ Can only be 'pyopenms' or 'spectrum-utils' (default) + + Return + ------ + MSSpectrum + Spectrum object in pyOpenMS format + """ + # Reformat peptide sequence in pyOpenMS format + peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) + + # Initialize the required objects to create the spectrum + spectrum = oms.MSSpectrum() + tsg = oms.TheoreticalSpectrumGenerator() + p = oms.Param() + + p.setValue("add_b_ions", "true") + p.setValue("add_metainfo", "true") + tsg.setParameters(param=p) + + # Create the theoretical spectrum + tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2) + return spectrum + + +def calculate_hyperscore( + psm: PSM, + observed_mz: List[float], + observed_intensity: List[float], + fragment_tol_mass=20, + fragment_tol_mode="ppm", +): + """ + Calculate the hyperscore as defined in the X!Tandem search engine. + + It is a metric of how good two spectra match with each other (matching peaks). + + Parameters + ---------- + psm: psm_utils.PSM + The PSM used to extract 'spectrum_id' (for MGF spectrum extraction) + and 'Peptidoform' (the peptide sequence) + observed_mz: List[float] + List of observed mz values with matching order as observed intensity + observed_intensity: List[float] + List of observed intensity values + fragment_tol_mass: int + The allowed tolerance to match peaks + fragment_tol_mode: str + 'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton. + Return + ------ + int + The hyperscore + """ + if fragment_tol_mode == "ppm": + fragment_mass_tolerance_unit_ppm = True + elif fragment_tol_mode == "Da": + fragment_mass_tolerance_unit_ppm = False + else: + raise Exception( + "fragment_tol_mode can only take 'Da' or 'ppm'. {} was provided.".format( + fragment_tol_mode + ) + ) + if len(observed_intensity) == 0: + logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") + return 0.0 + + theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) + observed_spectrum_oms = oms.MSSpectrum() + observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) + hyperscore = oms.HyperScore() + result = hyperscore.compute( + fragment_mass_tolerance=fragment_tol_mass, + fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm, + exp_spectrum=observed_spectrum_oms, + theo_spectrum=theoretical_spectrum, + ) + return result From c51cd346f438b2afc97c2b171ea73bcdc60a64eb Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Thu, 21 Nov 2024 17:21:04 +0100 Subject: [PATCH 18/54] calibration fixes --- ms2rescore/feature_generators/deeplc.py | 15 +++++---------- ms2rescore/feature_generators/im2deep.py | 3 ++- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index e3113544..29a249eb 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -123,9 +123,6 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - if self.calibration_set: - logger.info("Loading calibration set...") - self.calibration_set = read_file(self.calibration_set, filetype="tsv") for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -150,12 +147,7 @@ def add_features(self, psm_list: PSMList) -> None: ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = 
PSMList(psm_list=list(chain.from_iterable(psms.values()))) - if self.calibration_set: - psm_list_calibration = self.calibration_set[ - self.calibration_set["run"] == run - ] - else: - psm_list_calibration = self._get_calibration_psms(psm_list_run) + psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, @@ -204,7 +196,10 @@ def add_features(self, psm_list: PSMList) -> None: def _get_calibration_psms(self, psm_list: PSMList): """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[~psm_list["is_decoy"]] + psm_list_targets = psm_list[ + ~psm_list["is_decoy"] + & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]] + ] if self.calibration_set_size: n_psms = self._get_number_of_calibration_psms(psm_list_targets) indices = np.argsort(psm_list_targets["score"]) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 772f6855..2afa4fcd 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -158,7 +158,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p identified_psms = psm_list_df[ (psm_list_df["qvalue"] < 0.01) & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep + & (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep + & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]]) ] calibration_psms = identified_psms[ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) From 295e37f20d2e344f46bf5fc9d911f6955210f184 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Thu, 21 Nov 2024 17:21:16 +0100 Subject: [PATCH 19/54] changes for mumble implementation --- ms2rescore/feature_generators/ms2.py | 38 ---------------------------- ms2rescore/parse_psms.py | 8 ++---- pyproject.toml | 1 - 3 files changed, 2 insertions(+), 45 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 490f4f47..a3bd6d77 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -23,7 +23,6 @@ FRAGMENTATION_MODELS = { "cidhcd": FragmentationModel.CidHcd, - "etcid": FragmentationModel.Etcid, "etd": FragmentationModel.Etd, "ethcd": FragmentationModel.Ethcd, "all": FragmentationModel.All, @@ -148,28 +147,6 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: else: psm.rescoring_features.update({"hyperscore": np.nan}) - # with multiprocessing.Pool(self.processes) as pool: - # counts_failed = 0 - # for psm, features in zip( - # psm_list, - # track( - # pool.imap( - # self._calculate_spectrum_features_wrapper, - # zip(psm_list, annotated_spectra), - # chunksize=1000, - # ), - # total=len(psm_list), - # description="Calculating MS2 features...", - # transient=True, - # ), - # ): - # if features: - # psm.rescoring_features.update(features) - - # else: - # counts_failed += 1 - # if counts_failed > 0: - # logger.warning(f"Failed to calculate features for {counts_failed} PSMs") @staticmethod def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: @@ -246,10 +223,6 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): / (len(b_ions_matched) + len(y_ions_matched)), } - def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple): - 
psm, spectrum = psm_spectrum_tuple - return self._calculate_spectrum_features(psm, spectrum) - def _annotate_spectrum(self, psm, pyteomics_spectrum): spectrum = RawSpectrum( @@ -272,17 +245,6 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - def _calculate_hyperscore(self, psm, spectrum): - pass - - def _calculate_fragmentation_features(self, psm, annotated_spectrum): - pass - - -# TODO: keep this here? -def modification_evidence(): - return - def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index cc61926f..2f0ec3b0 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -104,13 +104,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: psm.rescoring_features = {} psm_handler = PSMHandler( - **mumble_config, # TODO how do we store config for mumble? + **mumble_config, ) - psm_list = psm_handler.add_modified_psms( - psm_list, generate_modified_decoys=True, keep_original=True - ) - if mumble_config["output_file"]: - psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"]) + psm_list = psm_handler.add_modified_psms(psm_list) return psm_list diff --git a/pyproject.toml b/pyproject.toml index 4285f1fa..985cc27e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", - "mumble" ] [project.optional-dependencies] From 909860d9c6f81490f9af3f6de364b606ddd7fe9c Mon Sep 17 00:00:00 2001 From: SamvPy Date: Fri, 22 Nov 2024 11:51:23 +0100 Subject: [PATCH 20/54] change openms peptide formatting --- ms2rescore/feature_generators/ms2.py | 121 ++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 23 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index a3bd6d77..eb049ed8 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -12,8 +12,11 @@ import numpy as np import pyopenms as oms from psm_utils import PSM, Peptidoform, PSMList +from psm_utils.peptidoform import PeptidoformException +from pyteomics.proforma import ProFormaError from pyteomics import mass, mgf, mzml from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from pathlib import Path from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -68,6 +71,7 @@ def __init__( self.processes = processes self.calculate_hyperscore = calculate_hyperscore + @property def feature_names(self) -> List[str]: return [ @@ -137,19 +141,20 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: mz_list, intensity_list = _annotated_spectrum_to_mzint( annotated_spectrum=annotated_spectrum ) + hyperscore = calculate_hyperscore( + psm=psm, observed_mz=mz_list, observed_intensity=intensity_list + ) + if hyperscore is None: + continue psm.rescoring_features.update( { - "hyperscore": calculate_hyperscore( - psm=psm, observed_mz=mz_list, observed_intensity=intensity_list - ) + "hyperscore": hyperscore } ) - else: - psm.rescoring_features.update({"hyperscore": np.nan}) @staticmethod - def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: + def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: if spectrum_filepath.suffix.lower() == ".mzml": return mzml.PreIndexedMzML(str(spectrum_filepath)) @@ 
-235,8 +240,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): intensity_array=pyteomics_spectrum["intensity array"], ) try: + linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( - peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]), + peptide=linear_peptide, model=self.fragmentation_model, mode=self.mass_mode, ) @@ -246,6 +252,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum +############################ +### HYPERSCORE FUNCTIONS ### +############################ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): mz_list = [] @@ -268,35 +277,87 @@ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: """ - Parse a peptidoform object to pyOpenMS compatible format. + Parse a peptide sequence in proforma format to pyOpenMS compatible format. + + Only supports UNIMOD format. Parameter --------- - Peptidoform: Peptidoform - Peptide string in Peptidoform format + peptide: str + Peptide string in proforma format Returns ------- AASequence (pyOpenMS): A peptide sequence in pyOpenMS format - int: - charge of the peptide """ + peptide = peptidoform.proforma + + # Reformat unimod modifications + pattern_unimod = r"\[UNIMOD:(\d+)\]" + + def replace_unimod(match): + return f"(UniMod:{match.group(1)})" + + peptide_oms_str = re.sub( + pattern=pattern_unimod, repl=replace_unimod, string=peptide + ) + + # Parse N-terminal modifications + if ")-" in peptide_oms_str: + peptide_oms_list = peptide_oms_str.split(")-") + nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] + nterm_modification += ")" + peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." + elif "]-" in peptide_oms_str: + peptide_oms_list = peptide_oms_str.split("]-") + nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] + nterm_modification += "]" + peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - n_term = peptidoform.properties["n_term"] - peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + # Split the charge from the peptide string + if "/" in peptide_oms_str: + peptide_oms_str, _ = peptide_oms_str.split("/") - for aa, mods in peptidoform.parsed_sequence: - peptide_oms_str += aa - if isinstance(mods, list): - peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + try: + peptide_oms = oms.AASequence.fromString(peptide_oms_str) + return peptide_oms - c_term = peptidoform.properties["c_term"] - peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + except: + return - peptide_oms = oms.AASequence.fromString(peptide_oms_str) - return peptide_oms +# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: +# """ +# Parse a peptidoform object to pyOpenMS compatible format. 
+ +# Parameter +# --------- +# Peptidoform: Peptidoform +# Peptide string in Peptidoform format + +# Returns +# ------- +# AASequence (pyOpenMS): +# A peptide sequence in pyOpenMS format +# int: +# charge of the peptide +# """ + +# n_term = peptidoform.properties["n_term"] +# peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + +# for aa, mods in peptidoform.parsed_sequence: +# peptide_oms_str += aa +# if isinstance(mods, list): +# peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + +# c_term = peptidoform.properties["c_term"] +# peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + +# peptide_oms = oms.AASequence.fromString(peptide_oms_str) + +# return peptide_oms def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: @@ -318,6 +379,8 @@ def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: """ # Reformat peptide sequence in pyOpenMS format peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) + if peptide_oms is None: + return # Initialize the required objects to create the spectrum spectrum = oms.MSSpectrum() @@ -375,9 +438,16 @@ def calculate_hyperscore( ) if len(observed_intensity) == 0: logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") - return 0.0 + return theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) + # This is mainly the cause of the modification not being allowed according to pyopenms + # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids + # according to the unimod database + if theoretical_spectrum is None: + logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}') + return + observed_spectrum_oms = oms.MSSpectrum() observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) hyperscore = oms.HyperScore() @@ -388,3 +458,8 @@ def calculate_hyperscore( theo_spectrum=theoretical_spectrum, ) return result + + +########################### +### FRAG SITE FUNCTIONS ### +########################### From c5902c21c0c5c72e71bb64cafc17c15c28b50191 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 22 Nov 2024 13:36:41 +0100 Subject: [PATCH 21/54] add mumble psm filtering functionality --- ms2rescore/core.py | 10 +++++++++- ms2rescore/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 48e125bc..f314bef6 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -18,6 +18,7 @@ add_peptide_confidence, add_psm_confidence, ) +from ms2rescore.utils import filter_mumble_psms logger = logging.getLogger(__name__) @@ -102,6 +103,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) psm_list = psm_list[psms_with_features] + if "mumble" in config["psm_generator"]: + # Remove PSMS that have a less matched ions than the original hit + psm_list = filter_mumble_psms(psm_list) + # Write feature names to file _write_feature_names(feature_names, output_file_root) @@ -251,7 +256,10 @@ def _write_feature_names(feature_names, output_file_root): def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) + (psm_list["qvalue"] <= 0.01) + & (psm_list["rank"] <= max_rank) + & (~psm_list["is_decoy"]) + & 
([metadata.get("original_psm", True) for metadata in psm_list["metadata"]]) ).sum() logger.info( f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 3515893a..33b1f0a1 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,8 +4,10 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import numpy as np from ms2rescore.exceptions import MS2RescoreConfigurationError +from psm_utils import PSMList logger = logging.getLogger(__name__) @@ -93,3 +95,32 @@ def _is_minitdf(spectrum_file: str) -> bool: files = set(Path(spectrum_file).glob("*ms2spectrum.bin")) files.update(Path(spectrum_file).glob("*ms2spectrum.parquet")) return len(files) >= 2 + + +def filter_mumble_psms(psm_list: PSMList) -> PSMList: + """ + Filter out PSMs with mumble in the peptide sequence. + """ + keep = [None] * len(psm_list) + original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) + spectrum_indices = np.array([psm.spectrum for psm in psm_list]) + runs = np.array([psm.run for psm in psm_list]) + if "matched_ions_pct" in psm_list[0].rescoring_features: + matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] + else: + return psm_list + for i, psm in enumerate(psm_list): + if isinstance(keep[i], bool): + continue + elif original_hit[i]: + keep[i] = True + else: + original_matched_ions_pct = np.logical_and.reduce( + [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run] + ) + if original_matched_ions_pct > matched_ions[i]: + keep[i] = False + else: + keep[i] = True + logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.") + return psm_list[keep] From 5ce55f501d4a52fe6714a904d6402b90d8450613 Mon Sep 17 00:00:00 2001 From: SamvPy Date: Fri, 22 Nov 2024 15:05:03 +0100 Subject: [PATCH 22/54] remove pyopenms dependency for hyperscore calculation --- ms2rescore/feature_generators/ms2.py | 296 +++++++-------------------- 1 file changed, 75 insertions(+), 221 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index eb049ed8..df12ca95 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -138,20 +138,14 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: if self.calculate_hyperscore: # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) 
- mz_list, intensity_list = _annotated_spectrum_to_mzint( - annotated_spectrum=annotated_spectrum + b, y = get_by_fragments(annotated_spectrum) + hs = calculate_hyperscore( + n_y=len(y), + n_b=len(b), + y_ion_intensities=y, + b_ion_intensities=b ) - hyperscore = calculate_hyperscore( - psm=psm, observed_mz=mz_list, observed_intensity=intensity_list - ) - if hyperscore is None: - continue - psm.rescoring_features.update( - { - "hyperscore": hyperscore - } - ) - + psm.rescoring_features.update({'hyperscore': hs}) @staticmethod def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: @@ -251,215 +245,75 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - -############################ -### HYPERSCORE FUNCTIONS ### -############################ -def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): - - mz_list = [] - intensity_list = [] - - for peak in annotated_spectrum: - - annotations = peak.annotation - for fragment in annotations: - ion = fragment.ion - if ion[0] not in ion_types: - continue - - mz_list.append(peak.experimental_mz) - intensity_list.append(peak.intensity) - break - - return mz_list, intensity_list - - -def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: - """ - Parse a peptide sequence in proforma format to pyOpenMS compatible format. - - Only supports UNIMOD format. - - Parameter - --------- - peptide: str - Peptide string in proforma format - - Returns - ------- - AASequence (pyOpenMS): - A peptide sequence in pyOpenMS format - """ - peptide = peptidoform.proforma - - # Reformat unimod modifications - pattern_unimod = r"\[UNIMOD:(\d+)\]" - - def replace_unimod(match): - return f"(UniMod:{match.group(1)})" - - peptide_oms_str = re.sub( - pattern=pattern_unimod, repl=replace_unimod, string=peptide - ) - - # Parse N-terminal modifications - if ")-" in peptide_oms_str: - peptide_oms_list = peptide_oms_str.split(")-") - nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] - nterm_modification += ")" - peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - elif "]-" in peptide_oms_str: - peptide_oms_list = peptide_oms_str.split("]-") - nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] - nterm_modification += "]" - peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - - # Split the charge from the peptide string - if "/" in peptide_oms_str: - peptide_oms_str, _ = peptide_oms_str.split("/") - - try: - peptide_oms = oms.AASequence.fromString(peptide_oms_str) - return peptide_oms - - except: - return - - -# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: -# """ -# Parse a peptidoform object to pyOpenMS compatible format. 
- -# Parameter -# --------- -# Peptidoform: Peptidoform -# Peptide string in Peptidoform format - -# Returns -# ------- -# AASequence (pyOpenMS): -# A peptide sequence in pyOpenMS format -# int: -# charge of the peptide -# """ - -# n_term = peptidoform.properties["n_term"] -# peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" - -# for aa, mods in peptidoform.parsed_sequence: -# peptide_oms_str += aa -# if isinstance(mods, list): -# peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" - -# c_term = peptidoform.properties["c_term"] -# peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" - -# peptide_oms = oms.AASequence.fromString(peptide_oms_str) - -# return peptide_oms - - -def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: + +def factorial(n): """ - Create a theoretical spectrum from a peptide sequence. - - Parameter - --------- - peptide: str - Peptide sequence in proforma format - engine: str - The engine to use to create theoretical spectrum. - Can only be 'pyopenms' or 'spectrum-utils' (default) - - Return - ------ - MSSpectrum - Spectrum object in pyOpenMS format + Compute factorial of n using a loop. + Parameters: + n (int): Non-negative integer. + Returns: + int: Factorial of n. """ - # Reformat peptide sequence in pyOpenMS format - peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) - if peptide_oms is None: - return - - # Initialize the required objects to create the spectrum - spectrum = oms.MSSpectrum() - tsg = oms.TheoreticalSpectrumGenerator() - p = oms.Param() - - p.setValue("add_b_ions", "true") - p.setValue("add_metainfo", "true") - tsg.setParameters(param=p) - - # Create the theoretical spectrum - tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2) - return spectrum - - -def calculate_hyperscore( - psm: PSM, - observed_mz: List[float], - observed_intensity: List[float], - fragment_tol_mass=20, - fragment_tol_mode="ppm", -): + result = 1 + for i in range(1, n + 1): + result *= i + return result + +def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): """ - Calculate the hyperscore as defined in the X!Tandem search engine. - - It is a metric of how good two spectra match with each other (matching peaks). - - Parameters - ---------- - psm: psm_utils.PSM - The PSM used to extract 'spectrum_id' (for MGF spectrum extraction) - and 'Peptidoform' (the peptide sequence) - observed_mz: List[float] - List of observed mz values with matching order as observed intensity - observed_intensity: List[float] - List of observed intensity values - fragment_tol_mass: int - The allowed tolerance to match peaks - fragment_tol_mode: str - 'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton. - Return - ------ - int - The hyperscore + Calculate the hyperscore for a peptide-spectrum match. + Parameters: + n_y (int): Number of matched y-ions. + n_b (int): Number of matched b-ions. + y_ion_intensities (list): Intensities of matched y-ions. + b_ion_intensities (list): Intensities of matched b-ions. + Returns: + float: Calculated hyperscore. """ - if fragment_tol_mode == "ppm": - fragment_mass_tolerance_unit_ppm = True - elif fragment_tol_mode == "Da": - fragment_mass_tolerance_unit_ppm = False - else: - raise Exception( - "fragment_tol_mode can only take 'Da' or 'ppm'. 
{} was provided.".format( - fragment_tol_mode - ) - ) - if len(observed_intensity) == 0: - logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") - return - - theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) - # This is mainly the cause of the modification not being allowed according to pyopenms - # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids - # according to the unimod database - if theoretical_spectrum is None: - logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}') - return - - observed_spectrum_oms = oms.MSSpectrum() - observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) - hyperscore = oms.HyperScore() - result = hyperscore.compute( - fragment_mass_tolerance=fragment_tol_mass, - fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm, - exp_spectrum=observed_spectrum_oms, - theo_spectrum=theoretical_spectrum, - ) - return result - - -########################### -### FRAG SITE FUNCTIONS ### -########################### + # Calculate the product of y-ion and b-ion intensities + product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1 + product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1 + + # Calculate factorial using custom function + factorial_y = factorial(n_y) + factorial_b = factorial(n_b) + + # Compute hyperscore + hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b)) + return hyperscore + +def infer_fragment_identity(frag, allow_ion_types=['b', 'y']): + ion = frag.ion + + is_allowed = False + for allowed_ion_type in allow_ion_types: + if allowed_ion_type in ion: + is_allowed=True + break + + if not is_allowed: + return False + # Radicals + if "·" in ion: + return False + if frag.neutral_loss is not None: + return False + if frag.charge > 2: + return False + + return ion[0] + +def get_by_fragments(annotated_spectrum): + b_intensities = [] + y_intensities = [] + for peak in annotated_spectrum: + + for fragment in peak.annotation: + + ion_type = infer_fragment_identity(fragment) + + if ion_type == 'b': + b_intensities.append(peak.intensity) + if ion_type == 'y': + y_intensities.append(peak.intensity) + return b_intensities, y_intensities \ No newline at end of file From 986c5f612bbd46d4f5f8f25e5d95425cde470e31 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 22 Nov 2024 16:00:42 +0100 Subject: [PATCH 23/54] fix spectrum_id accession --- ms2rescore/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 33b1f0a1..7897b924 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -103,7 +103,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList: """ keep = [None] * len(psm_list) original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) - spectrum_indices = np.array([psm.spectrum for psm in psm_list]) + spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) runs = np.array([psm.run for psm in psm_list]) if "matched_ions_pct" in psm_list[0].rescoring_features: matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] @@ -116,7 +116,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList: keep[i] = True else: original_matched_ions_pct = np.logical_and.reduce( - [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run] + [original_hit, spectrum_indices == psm.spectrum_id, runs == 
psm.run] ) if original_matched_ions_pct > matched_ions[i]: keep[i] = False From 5333e4601160a9d0ad77694671fe2c623fad3773 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 12:56:25 +0100 Subject: [PATCH 24/54] remove unused imports --- ms2rescore/feature_generators/ms2.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index df12ca95..65bb42a9 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -10,10 +10,7 @@ from typing import List, Optional, Union import numpy as np -import pyopenms as oms -from psm_utils import PSM, Peptidoform, PSMList -from psm_utils.peptidoform import PeptidoformException -from pyteomics.proforma import ProFormaError +from psm_utils import PSMList from pyteomics import mass, mgf, mzml from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum from pathlib import Path @@ -316,4 +313,4 @@ def get_by_fragments(annotated_spectrum): b_intensities.append(peak.intensity) if ion_type == 'y': y_intensities.append(peak.intensity) - return b_intensities, y_intensities \ No newline at end of file + return b_intensities, y_intensities From dd2259f90b14fe2de0bdaae249f38481089c5bc5 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 12:57:48 +0100 Subject: [PATCH 25/54] remove unused import in deeplc feature generator --- ms2rescore/feature_generators/deeplc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 287833f8..2ee6b063 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -25,7 +25,6 @@ import numpy as np from psm_utils import PSMList -from psm_utils.io import read_file from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType From d24ef309c68a166ac64f02bf8eb3291e40f8740a Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 13:06:42 +0100 Subject: [PATCH 26/54] add rustyms dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c1a1cbc8..595c9c37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", + "rustyms>=0.9.0a3", ] [project.optional-dependencies] From 21cafc7d2de05b3806ffa190ec09f9f39032a4ea Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 13:34:33 +0100 Subject: [PATCH 27/54] drop rustyms requirement to 0.8.3 rustyms 0.9.0a3 requires python 3.11, while we support 3.9. 
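If a newer rustyms is wanted again later, one possible approach (illustrative sketch only, not part of this change) is a conditional pin reusing the PEP 508 marker syntax already used for the tomli entry, e.g.:

    "rustyms>=0.8.3; python_version < '3.11'",
    "rustyms>=0.9.0a3; python_version >= '3.11'",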
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 595c9c37..d600a6cc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
     "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
-    "rustyms>=0.9.0a3",
+    "rustyms>=0.8.3",
 ]
 
 [project.optional-dependencies]

From ca9da7d9ffd5f147213d81ba48627c32dc4495fb Mon Sep 17 00:00:00 2001
From: ArthurDeclercq
Date: Fri, 17 Jan 2025 14:14:02 +0100
Subject: [PATCH 28/54] mumble related changes

---
 ms2rescore/core.py | 10 ++--
 ms2rescore/feature_generators/ms2.py | 76 ++++++++++++----------------
 ms2rescore/parse_psms.py | 13 +++++
 ms2rescore/utils.py | 67 ++++++++++++++++--------
 4 files changed, 97 insertions(+), 69 deletions(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index a75bf753..25eaf62b 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -111,8 +111,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         psm_list = psm_list[psms_with_features]
 
     if "mumble" in config["psm_generator"]:
-        # Remove PSMS that have a less matched ions than the original hit
-        psm_list = filter_mumble_psms(psm_list)
+        # Remove PSMs where matched_ions_pct drops 25% below the original hit
+        psm_list = filter_mumble_psms(psm_list, threshold=0.75)
+        # Currently replace the score with the hyperscore for Mumble
+        # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]]  # TODO: This is a temporary fix
 
     # Write feature names to file
     _write_feature_names(feature_names, output_file_root)
@@ -288,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList:
     )
 
     # Recalculate confidence
-    new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
+    new_confidence = lin_psm_data.assign_confidence(
+        scores=list(psm_list["score"])
+    )  # explicitly make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot
 
     # Add new confidence estimations to PSMList
     add_psm_confidence(psm_list, new_confidence)
diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index 65bb42a9..0b226a5d 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -7,13 +7,12 @@
 import re
 from collections import defaultdict
 from itertools import chain
-from typing import List, Optional, Union
+from typing import List, Optional
 
 import numpy as np
 from psm_utils import PSMList
-from pyteomics import mass, mgf, mzml
 from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum
-from pathlib import Path
+from ms2rescore_rs import get_ms2_spectra
 
 from ms2rescore.exceptions import ParseSpectrumError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
@@ -68,7 +67,6 @@ def __init__(
         self.processes = processes
         self.calculate_hyperscore = calculate_hyperscore
 
-
     @property
     def feature_names(self) -> List[str]:
         return [
@@ -109,26 +107,22 @@ def add_features(self, psm_list: PSMList) -> None:
 
     def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
         """Retrieve annotated spectra for all psms."""
-        spectrum_reader = self._read_spectrum_file(spectrum_file)
-
+        spectra = get_ms2_spectra(str(spectrum_file))
         spectrum_id_pattern = re.compile(
             self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
         )
-        try:
-            mapper = {
-                spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
-                for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys()
+
spectra_dict = { + spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum + for spectrum in spectra } except AttributeError: raise ParseSpectrumError( "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." ) - annotated_spectra = [ - self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id])) - for psm in psm_list - ] - for psm, annotated_spectrum in zip(psm_list, annotated_spectra): + + for psm in psm_list: + annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id]) psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) @@ -137,20 +131,9 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) b, y = get_by_fragments(annotated_spectrum) hs = calculate_hyperscore( - n_y=len(y), - n_b=len(b), - y_ion_intensities=y, - b_ion_intensities=b + n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b ) - psm.rescoring_features.update({'hyperscore': hs}) - - @staticmethod - def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: - - if spectrum_filepath.suffix.lower() == ".mzml": - return mzml.PreIndexedMzML(str(spectrum_filepath)) - elif spectrum_filepath.suffix.lower() == ".mgf": - return mgf.IndexedMGF(str(spectrum_filepath)) + psm.rescoring_features.update({"hyperscore": hs}) @staticmethod def _longest_ion_sequence(lst): @@ -219,16 +202,16 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): / (len(b_ions_matched) + len(y_ions_matched)), } - def _annotate_spectrum(self, psm, pyteomics_spectrum): + def _annotate_spectrum(self, psm, spectrum): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=mass.calculate_mass(psm.peptidoform.composition), - mz_array=pyteomics_spectrum["m/z array"], - intensity_array=pyteomics_spectrum["intensity array"], + precursor_mass=spectrum.precursor.mz, + mz_array=spectrum.mz, + intensity_array=spectrum.intensity, ) try: linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) @@ -242,7 +225,7 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - + def factorial(n): """ Compute factorial of n using a loop. @@ -255,7 +238,8 @@ def factorial(n): for i in range(1, n + 1): result *= i return result - + + def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): """ Calculate the hyperscore for a peptide-spectrum match. 
@@ -270,24 +254,25 @@ def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
     # Calculate the product of y-ion and b-ion intensities
     product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1
     product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1
-
+
     # Calculate factorial using custom function
     factorial_y = factorial(n_y)
     factorial_b = factorial(n_b)
-
+
     # Compute hyperscore
-    hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b))
+    hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b))
     return hyperscore
 
-def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):
+
+def infer_fragment_identity(frag, allow_ion_types=["b", "y"]):
     ion = frag.ion
 
     is_allowed = False
     for allowed_ion_type in allow_ion_types:
         if allowed_ion_type in ion:
-            is_allowed=True
+            is_allowed = True
             break
-
+
     if not is_allowed:
         return False
     # Radicals
@@ -297,20 +282,21 @@ def infer_fragment_identity(frag, allow_ion_types=["b", "y"]):
         return False
     if frag.charge > 2:
         return False
-
+
     return ion[0]
 
+
 def get_by_fragments(annotated_spectrum):
     b_intensities = []
     y_intensities = []
     for peak in annotated_spectrum:
-
+
         for fragment in peak.annotation:
 
             ion_type = infer_fragment_identity(fragment)
-
-            if ion_type == 'b':
+
+            if ion_type == "b":
                 b_intensities.append(peak.intensity)
-            if ion_type == 'y':
+            if ion_type == "y":
                 y_intensities.append(peak.intensity)
     return b_intensities, y_intensities
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 2f0ec3b0..cd69c6bb 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -94,6 +94,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     # Addition of Modifications for mass shifts in the PSMs with Mumble
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
+
+        # set include original psm to True and include decoy psm to true
+        if (
+            "include_original_psm" not in config["psm_generator"]["mumble"]
+            or not config["psm_generator"]["mumble"]
+        ):
+            config["psm_generator"]["mumble"]["include_original_psm"] = True
+        if (
+            "include_decoy_psm" not in config["psm_generator"]["mumble"]
+            or not config["psm_generator"]["mumble"]
+        ):
+            config["psm_generator"]["mumble"]["include_decoy_psm"] = True
+
         mumble_config = config["psm_generator"]["mumble"]
 
         # Check if psm_list[0].rescoring_features is empty or not
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index ca06db77..a64d6eca 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -95,30 +95,55 @@ def _is_minitdf(spectrum_file: str) -> bool:
     return len(files) >= 2
 
 
-def filter_mumble_psms(psm_list: PSMList) -> PSMList:
+def filter_mumble_psms(psm_list: PSMList, threshold=1) -> PSMList:
     """
-    Filter out PSMs with mumble in the peptide sequence.
+    Filter out mumble PSMs with `matched_ions_pct` lower than the original hit.
+ + Parameters + ---------- + psm_list : PSMList + List of PSMs to filter + threshold : float, optional + Threshold to lower the maximum matched_ions_pct of the original hit """ - keep = [None] * len(psm_list) - original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) + # Extract relevant fields from the PSM list + original_hit = np.array([metadata.get("original_psm") for metadata in psm_list["metadata"]]) spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) runs = np.array([psm.run for psm in psm_list]) - if "matched_ions_pct" in psm_list[0].rescoring_features: - matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] - else: + + # Check if matched_ions_pct exists + if "matched_ions_pct" not in psm_list[0].rescoring_features: return psm_list - for i, psm in enumerate(psm_list): - if isinstance(keep[i], bool): - continue - elif original_hit[i]: - keep[i] = True - else: - original_matched_ions_pct = np.logical_and.reduce( - [original_hit, spectrum_indices == psm.spectrum_id, runs == psm.run] - ) - if original_matched_ions_pct > matched_ions[i]: - keep[i] = False - else: - keep[i] = True - logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.") + + matched_ions = np.array([psm.rescoring_features["matched_ions_pct"] for psm in psm_list]) + + # Create unique keys for each (run, spectrum_id) + unique_keys = np.core.defchararray.add(runs.astype(str), spectrum_indices.astype(str)) + unique_keys_indices, inverse_indices = np.unique(unique_keys, return_inverse=True) + + # Initialize an array to store the `matched_ions_pct` of original hits per group + original_matched_ions_pct = np.full( + len(unique_keys_indices), -np.inf + ) # Default to -inf for groups without original hits + + # Assign the `matched_ions_pct` of original hits to their groups + np.maximum.at( + original_matched_ions_pct, inverse_indices[original_hit], matched_ions[original_hit] + ) + + # lower the maximum with the threshold + original_matched_ions_pct = original_matched_ions_pct * threshold + + # Broadcast the original `matched_ions_pct` back to all PSMs in each group + original_matched_ions_for_all = original_matched_ions_pct[inverse_indices] + + # Determine the filtering condition + keep = np.logical_or( + original_hit, # Always keep original hits + matched_ions + >= original_matched_ions_for_all, # Keep hits with `matched_ions_pct` >= the original + ) + + # Filter PSMs + logger.debug(f"Filtered out {len(psm_list) - np.sum(keep)} mumble PSMs.") return psm_list[keep] From c5b6eb02ca8345d4266bfedc1a889a4d19bf9ac8 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 13:50:21 +0100 Subject: [PATCH 29/54] add mumble As mumble hasn't been published on pypi yet, use a git dependency for now. 
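For local installs before a PyPI release, the same pin can also be installed directly with pip as a PEP 508 direct reference (sketch; assumes a pip build with git support):

    pip install "mumble-mod @ git+https://github.com/compomics/mumble.git@f435e91"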
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index d600a6cc..f80647c8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
     "rustyms>=0.8.3",
+    "mumble-mod @ git+https://github.com/compomics/mumble.git@f435e91",
 ]
 
 [project.optional-dependencies]

From aee8ec76f983385b8da54ed196daf4201101e19a Mon Sep 17 00:00:00 2001
From: Kevin Velghe
Date: Tue, 21 Jan 2025 16:11:15 +0100
Subject: [PATCH 30/54] update mumble to use user cache dir

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f80647c8..7e2f587e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ dependencies = [
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
     "rustyms>=0.8.3",
-    "mumble-mod @ git+https://github.com/compomics/mumble.git@f435e91",
+    "mumble-mod @ git+https://github.com/compomics/mumble.git@114ad7d",
 ]
 
 [project.optional-dependencies]

From 7ce56c215bd530e61213a4d193955a96c008f61c Mon Sep 17 00:00:00 2001
From: Kevin Velghe
Date: Fri, 24 Jan 2025 14:34:08 +0100
Subject: [PATCH 31/54] bump im2deep dependency

im2deep.utils was introduced in 0.3.0.
https://github.com/compomics/IM2Deep/commit/0a4bc9de616ebefdf5d124e7625a5c1b86f588e5
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7e2f587e..96d0cec6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "customtkinter>=5,<6",
     "deeplc>=3.0,<3.1",
     "deeplcretrainer",
-    "im2deep>=0.1.3",
+    "im2deep>=0.3.0",
     "jinja2>=3",
     "lxml>=4.5",
     "mokapot>=0.10",

From 106ad8fd54cbed73d01a4fbfca437031fe17cda0 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq
Date: Fri, 14 Feb 2025 13:34:15 +0100
Subject: [PATCH 32/54] make mumble and rustyms optional dependency

---
 pyproject.toml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 96d0cec6..dc475819 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,13 +50,11 @@ dependencies = [
     "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
-    "rustyms>=0.8.3",
-    "mumble-mod @ git+https://github.com/compomics/mumble.git@114ad7d",
 ]
 
 [project.optional-dependencies]
 ionmob = ["ionmob>=0.2", "tensorflow"]
-ms2 = ["rustyms"]
+mumble = ["rustyms>=0.8.3", "mumble>=0.2.0"]
 dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"]
 docs = [
     "sphinx",

From 29aac8a224460fab9d0b16b5bfca1d1f6026da32 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq
Date: Tue, 23 Sep 2025 14:45:03 +0200
Subject: [PATCH 33/54] set defaults in mumble config

---
 ms2rescore/parse_psms.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index cd69c6bb..3c865074 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -94,19 +94,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     # Addition of Modifications for mass shifts in the PSMs with Mumble
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
-
-        # set include original psm to True and include decoy psm to true
-        if (
-            "include_original_psm" not in config["psm_generator"]["mumble"]
-            or not config["psm_generator"]["mumble"]
-        ):
-            config["psm_generator"]["mumble"]["include_original_psm"] = True
-        if (
-            "include_decoy_psm" not in config["psm_generator"]["mumble"]
-            or
not config["psm_generator"]["mumble"] - ): - config["psm_generator"]["mumble"]["include_decoy_psm"] = True - + config["psm_generator"]["mumble"]["include_original_psm"] = True + config["psm_generator"]["mumble"]["include_decoy_psm"] = True mumble_config = config["psm_generator"]["mumble"] # Check if psm_list[0].rescoring_features is empty or not From 487f66112d7f5e79152af4545c52578e2a1bb47e Mon Sep 17 00:00:00 2001 From: SamvPy Date: Wed, 3 Dec 2025 14:47:10 +0100 Subject: [PATCH 34/54] fix rustyms 0.8.0 -> 0.10.0 --- ms2rescore/feature_generators/ms2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 0b226a5d..121a6bcb 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -11,7 +11,7 @@ import numpy as np from psm_utils import PSMList -from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from rustyms import FragmentationModel, CompoundPeptidoformIon, MassMode, RawSpectrum from ms2rescore_rs import get_ms2_spectra from ms2rescore.exceptions import ParseSpectrumError @@ -214,7 +214,7 @@ def _annotate_spectrum(self, psm, spectrum): intensity_array=spectrum.intensity, ) try: - linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) + linear_peptide = CompoundPeptidoformIon(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( peptide=linear_peptide, model=self.fragmentation_model, From 05078cdf94648fcafc3eaffd856270a5689d7b5b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 22 Dec 2025 14:22:18 +0100 Subject: [PATCH 35/54] moved maxquant features to ms2 --- ms2rescore/feature_generators/maxquant.py | 208 ---------------------- ms2rescore/feature_generators/ms2.py | 52 ++++++ 2 files changed, 52 insertions(+), 208 deletions(-) delete mode 100644 ms2rescore/feature_generators/maxquant.py diff --git a/ms2rescore/feature_generators/maxquant.py b/ms2rescore/feature_generators/maxquant.py deleted file mode 100644 index 45205c2a..00000000 --- a/ms2rescore/feature_generators/maxquant.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Feature generator for PSMs from the MaxQuant search engine. - -MaxQuant msms.txt files contain various metrics from peptide-spectrum matching that can be used -to generate rescoring features. These include features related to the mass errors of the seven -fragment ions with the highest intensities, and features related to the ion current of the -identified fragment ions. - -""" - -import logging -from typing import List, Tuple - -import numpy as np -from psm_utils import PSMList - -from ms2rescore.exceptions import MS2RescoreError -from ms2rescore.feature_generators.base import FeatureGeneratorBase - -logger = logging.getLogger(__name__) - - -class MaxQuantFeatureGenerator(FeatureGeneratorBase): - """Generate MaxQuant-derived features.""" - - available_features = [ - "mean_error_top7", - "sq_mean_error_top7", - "stdev_error_top7", - "ln_explained_ion_current", - "ln_nterm_ion_current_ratio", - "ln_cterm_ion_current_ratio", - "ln_ms2_ion_current", - ] - - def __init__(self, *args, **kwargs) -> None: - """ - Generate MaxQuant-derived features. - - Attributes - ---------- - feature_names: list[str] - Names of the features that will be added to the PSMs. - - Raises - ------ - MissingMetadataError - If the required metadata entries are not present in the PSMs. 
- - """ - super().__init__(*args, **kwargs) - self._feature_names = self.available_features.copy() - - @property - def feature_names(self) -> List[str]: - return self._feature_names - - def add_features(self, psm_list: PSMList): - """ - Add MaxQuant-derived features to PSMs. - - Parameters - ---------- - psm_list - PSMs to add features to. - - """ - # Check if all PSMs are from MaxQuant - if not self._all_psms_from_maxquant(psm_list): - self._feature_names = [] # Set feature names to empty list to indicate none added - logger.warning("Not all PSMs are from MaxQuant. Skipping MaxQuant feature generation.") - return - else: - self._feature_names = self.available_features # Reset feature names - logger.info("Adding MaxQuant-derived features to PSMs.") - - # Infer mass deviations column name - for column_name in [ - "Mass deviations [Da]", - "Mass Deviations [Da]", - "Mass deviations [ppm]", - "Mass Deviations [ppm]", - ]: - if column_name in psm_list[0]["metadata"].keys(): - self._mass_deviations_key = column_name - break - else: - raise MissingMetadataError( - "No mass deviations entry in PSM metadata. Cannot compute MaxQuant features." - ) - - # Check other columns - for column_name in ["Intensities", "Matches", "Intensity coverage"]: - if column_name not in psm_list[0]["metadata"].keys(): - raise MissingMetadataError( - f"Missing {column_name} entry in PSM metadata. Cannot compute MaxQuant features." - ) - - # Add features to PSMs - for psm in psm_list: - psm["rescoring_features"].update(self._compute_features(psm["metadata"])) - - @staticmethod - def _all_psms_from_maxquant(psm_list): - """Check if the PSMs are from MaxQuant.""" - return (psm_list["source"] == "msms").all() - - def _compute_features(self, psm_metadata): - """Compute features from derived from intensities and mass errors.""" - features = {} - if all(k in psm_metadata.keys() for k in ["Intensities", self._mass_deviations_key]): - ( - features["mean_error_top7"], - features["sq_mean_error_top7"], - features["stdev_error_top7"], - ) = self._calculate_top7_peak_features( - psm_metadata["Intensities"], psm_metadata[self._mass_deviations_key] - ) - - if all(k in psm_metadata.keys() for k in ["Intensities", "Matches", "Intensity coverage"]): - ( - features["ln_explained_ion_current"], - features["ln_nterm_ion_current_ratio"], - features["ln_cterm_ion_current_ratio"], - features["ln_ms2_ion_current"], - ) = self._calculate_ion_current_features( - psm_metadata["Matches"], - psm_metadata["Intensities"], - psm_metadata["Intensity coverage"], - ) - - return features - - @staticmethod - def _calculate_top7_peak_features(intensities: str, mass_errors: str) -> Tuple[np.ndarray]: - """ - Calculate "top 7 peak"-related search engine features. 
- The following features are calculated: - - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - highest intensities - - sq_mean_error_top7: Squared MeanErrorTop7 - - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - peaks with the highest intensities - """ - try: - intensities = [float(i) for i in intensities.split(";")] - mass_errors = [float(i) for i in mass_errors.split(";")] - except ValueError: - return 0.0, 0.0, 0.0 - - indices_most_intens = np.array(intensities).argsort()[-1:-8:-1] - mass_errors_top7 = [(mass_errors[i]) for i in indices_most_intens] - mean_error_top7 = np.mean(mass_errors_top7) - sq_mean_error_top7 = mean_error_top7**2 - stdev_error_top7 = np.std(mass_errors_top7) - - return mean_error_top7, sq_mean_error_top7, stdev_error_top7 - - @staticmethod - def _calculate_ion_current_features( - matches: str, intensities: str, intensity_coverage: str - ) -> Tuple[np.ndarray]: - """ - Calculate ion current related search engine features. - The following features are calculated: - - ln_explained_ion_current: Summed intensity of identified fragment ions, - divided by that of all fragment ions, logged - - ln_nterm_ion_current_ratio: Summed intensity of identified N-terminal - fragments, divided by that of all identified fragments, logged - - ln_cterm_ion_current_ratio: Summed intensity of identified N-terminal - fragments, divided by that of all identified fragments, logged - - ln_ms2_ion_current: Summed intensity of all observed fragment ions, logged - """ - pseudo_count = 0.00001 - try: - ln_explained_ion_current = float(intensity_coverage) + pseudo_count - summed_intensities = sum([float(i) for i in intensities.split(";")]) - except ValueError: - return 0.0, 0.0, 0.0, 0.0 - - # Calculate ratio between matched b- and y-ion intensities - y_ion_int = sum( - [ - float(intensities.split(";")[i]) - for i, m in enumerate(matches.split(";")) - if m.startswith("y") - ] - ) - y_int_ratio = y_ion_int / summed_intensities - - ln_nterm_ion_current_ratio = (y_int_ratio + pseudo_count) * ln_explained_ion_current - ln_cterm_ion_current_ratio = (1 - y_int_ratio + pseudo_count) * ln_explained_ion_current - ln_ms2_ion_current = summed_intensities / ln_explained_ion_current - - out = [ - ln_explained_ion_current, - ln_nterm_ion_current_ratio, - ln_cterm_ion_current_ratio, - ln_ms2_ion_current, - ] - - return tuple([np.log(x) for x in out]) - - -class MissingMetadataError(MS2RescoreError): - """Exception raised when a required metadata entry is missing.""" - - pass diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 121a6bcb..c5ae3ba9 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -83,6 +83,9 @@ def feature_names(self) -> List[str]: "matched_y_ions_pct", "matched_ions_pct", "hyperscore", + "mean_error_top7", + "sq_mean_error_top7", + "stdev_error_top7", ] def add_features(self, psm_list: PSMList) -> None: @@ -146,6 +149,47 @@ def _longest_ion_sequence(lst): return max_sequence + @staticmethod + def _calculate_top7_peak_features(annotated_spectrum): + """ + Calculate "top 7 peak"-related features using mass errors. 
+ The following features are calculated: + - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the + highest intensities + - sq_mean_error_top7: Squared MeanErrorTop7 + - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion + peaks with the highest intensities + """ + if not annotated_spectrum: + return 0.0, 0.0, 0.0 + + # Collect peaks with annotations (matched peaks) and their mass errors + peak_data = [] + for peak in annotated_spectrum: + if peak.annotation: + for matched_ion in peak.annotation: + # Calculate mass error (ppm) between observed and theoretical m/z + theoretical_mz = matched_ion.mz + observed_mz = peak.mz + mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 + peak_data.append((peak.intensity, mass_error)) + + if len(peak_data) == 0: + return 0.0, 0.0, 0.0 + + # Sort by intensity and get top 7 + peak_data.sort(key=lambda x: x[0], reverse=True) + top7_errors = [error for _, error in peak_data[:7]] + + if len(top7_errors) == 0: + return 0.0, 0.0, 0.0 + + mean_error_top7 = np.mean(top7_errors) + sq_mean_error_top7 = mean_error_top7**2 + stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 + + return mean_error_top7, sq_mean_error_top7, stdev_error_top7 + def _calculate_spectrum_features(self, psm, annotated_spectrum): if not annotated_spectrum: @@ -180,6 +224,11 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): b_ion_matched_sum = np.sum(features["b_ion_matched"]) y_ion_matched_sum = np.sum(features["y_ion_matched"]) + # Calculate top 7 peak features (MaxQuant-derived) + mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( + annotated_spectrum + ) + return { "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), @@ -200,6 +249,9 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) / (len(b_ions_matched) + len(y_ions_matched)), + "mean_error_top7": mean_error_top7, + "sq_mean_error_top7": sq_mean_error_top7, + "stdev_error_top7": stdev_error_top7, } def _annotate_spectrum(self, psm, spectrum): From 2011241bcfbedb6a200815c44a65f020ca4f2f31 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:45:13 +0100 Subject: [PATCH 36/54] im2deep refactoring --- ms2rescore/feature_generators/im2deep.py | 147 ++++++++++++----------- 1 file changed, 79 insertions(+), 68 deletions(-) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index b48bbd96..b42fabc9 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -8,23 +8,22 @@ """ -import contextlib import logging import os from inspect import getfullargspec -from itertools import chain from typing import List import numpy as np import pandas as pd from im2deep.utils import im2ccs -from im2deep.im2deep import predict_ccs -from psm_utils import PSMList +from im2deep.im2deep import predict_ccs, REFERENCE_DATASET_PATH +from im2deep.calibrate import calculate_ccs_shift +from psm_utils import PSMList, Peptidoform from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" logger = logging.getLogger(__name__) @@ -75,69 +74,77 @@ def 
add_features(self, psm_list: PSMList) -> None: """Add IM2Deep-derived features to PSMs""" logger.info("Adding IM2Deep-derived features to PSMs") - # Get easy-access nested version of PSMlist - psm_dict = psm_list.get_psm_dict() - - # Run IM2Deep for each spectrum file - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - # Reset IM2Deep predictor for each collection of runs - for run, psms in runs.items(): - logger.info( - f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..." - ) - - # Disable wild logging to stdout by TensorFlow, unless in debug mode - with ( - contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) - if not self._verbose - else contextlib.nullcontext() - ): - # Make new PSM list for this run (chain PSMs per spectrum to flat list) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - logger.debug("Calibrating IM2Deep...") - - # Convert ion mobility to CCS and calibrate CCS values - psm_list_run_df = psm_list_run.to_dataframe() - psm_list_run_df["charge"] = [ - pep.precursor_charge for pep in psm_list_run_df["peptidoform"] - ] - psm_list_run_df["ccs_observed"] = im2ccs( - psm_list_run_df["ion_mobility"], - psm_list_run_df["precursor_mz"], - psm_list_run_df["charge"], - ) - - # Create dataframe with high confidence hits for calibration - cal_psm_df = self.make_calibration_df(psm_list_run_df) - - # Make predictions with IM2Deep - logger.debug("Predicting CCS values...") - predictions = predict_ccs( - psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs - ) - - # Add features to PSMs - logger.debug("Adding features to PSMs...") - observations = psm_list_run_df["ccs_observed"] - ccs_diffs_run = np.abs(predictions - observations) - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "ccs_observed_im2deep": observations[i], - "ccs_predicted_im2deep": predictions[i], - "ccs_error_im2deep": ccs_diffs_run[i], - "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]), - "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i]) - / observations[i] - * 100, - } - ) - - current_run += 1 + # Convert ion mobility to CCS + psm_list_df = psm_list.to_dataframe() + # Remove unnecessary columns to save memory #TODO: optimize further? 
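A minimal pandas sketch of the calibration pattern used in the rest of this hunk: one CCS shift is derived per run and charge state from high-confidence target PSMs against a reference table, merged back onto all PSMs, and missing combinations fall back to a shift of 0 before the shift is added to the predictions. This is only an illustration of the pattern; im2deep's calculate_ccs_shift (called with per_charge=True further down) is the actual implementation, and the reference-table column layout and the use of a median offset here are assumptions of the sketch.

import pandas as pd

def per_run_charge_shift(psms: pd.DataFrame, reference: pd.DataFrame) -> pd.DataFrame:
    # psms: assumed columns [run, charge, sequence, qvalue, is_decoy, ccs_observed, ccs_predicted]
    # reference: assumed columns [sequence, charge, ccs_ref]
    # Keep confident target hits only (mirrors the q-value filtering in make_calibration_df).
    cal = psms[(~psms["is_decoy"]) & (psms["qvalue"] < 0.01)]
    # Compare observed CCS to the reference CCS for peptides present in both tables.
    merged = cal.merge(reference, on=["sequence", "charge"])
    # One shift per (run, charge): median offset between observed and reference CCS.
    shifts = (
        merged.assign(ccs_shift=merged["ccs_observed"] - merged["ccs_ref"])
        .groupby(["run", "charge"], as_index=False)["ccs_shift"]
        .median()
    )
    # Apply to all PSMs; runs/charges without calibration data get a shift of 0.
    out = psms.merge(shifts, on=["run", "charge"], how="left")
    out["ccs_shift"] = out["ccs_shift"].fillna(0.0)
    out["ccs_calibrated"] = out["ccs_predicted"] + out["ccs_shift"]
    return out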
+ psm_list_df = psm_list_df[ + [ + "peptidoform", + "ion_mobility", + "precursor_mz", + "run", + "qvalue", + "is_decoy", + "metadata", + ] + ] + + psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply( + lambda x: x.proforma.split("/")[0] + ) + psm_list_df["ccs_observed"] = im2ccs( + psm_list_df["ion_mobility"], + psm_list_df["precursor_mz"], + psm_list_df["charge"], + ) + + # Make predictions with IM2Deep + logger.debug("Predicting CCS values...") + predictions = predict_ccs(psm_list, write_output=False, **self.im2deep_kwargs) + psm_list_df["ccs_predicted"] = predictions + + # Create dataframe with high confidence hits for calibration + logger.debug("Calibrating IM2Deep...") + reference_dataset = pd.read_csv(REFERENCE_DATASET_PATH) + reference_dataset["charge"] = reference_dataset["peptidoform"].apply( + lambda x: int(x.split("/")[1]) if isinstance(x, str) else x.precursor_charge + ) + logger.debug(f"Loaded reference dataset with {len(reference_dataset)} entries") + + run_shift_dict = {} + for run in psm_list_df["run"].unique(): + cal_run_psm_df = self.make_calibration_df(psm_list_df[psm_list_df["run"] == run]) + # Extract sequence from peptidoform for calculate_ccs_shift + shift = calculate_ccs_shift( + cal_df=cal_run_psm_df, reference_dataset=reference_dataset, per_charge=True + ) + run_shift_dict[run] = shift + shift_df = pd.DataFrame.from_dict(run_shift_dict, orient="index").stack().reset_index() + shift_df.columns = ["run", "charge", "ccs_shift"] + + # Apply calibration shifts + psm_list_df = psm_list_df.merge(shift_df, on=["run", "charge"], how="left") + psm_list_df["ccs_shift"] = psm_list_df["ccs_shift"].fillna( + 0 + ) # Fill missing shifts with 0 (no calibration for that run/charge) #TODO check with ROBBE + psm_list_df["ccs_predicted_im2deep"] = ( + psm_list_df["ccs_predicted"] + psm_list_df["ccs_shift"] + ) + psm_list_df.rename(columns={"ccs_predicted": "ccs_predicted_im2deep"}, inplace=True) + psm_list_df["ccs_error_im2deep"] = ( + psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed"] + ) + psm_list_df["abs_ccs_error_im2deep"] = np.abs(psm_list_df["ccs_error_im2deep"]) + psm_list_df["perc_ccs_error_im2deep"] = ( + np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed"] * 100 + ) + + psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") + # Add features to PSMs + logger.debug("Adding features to PSMs...") + for psm, features in zip(psm_list, psm_list_feature_dicts): + psm.rescoring_features.update(features) @staticmethod def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame: @@ -167,6 +174,10 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p calibration_psms = identified_psms[ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) ] + if isinstance(calibration_psms["peptidoform"].iloc[0], Peptidoform): + calibration_psms["peptidoform"] = calibration_psms["peptidoform"].apply( + lambda x: x.proforma + ) logger.debug( f"Number of high confidence hits for calculating shift: {len(calibration_psms)}" ) From 36750b444d2a48a67b9335a39dc3f581bf1b0bcd Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:45:25 +0100 Subject: [PATCH 37/54] ms2pip refactoring --- ms2rescore/feature_generators/ms2pip.py | 50 ++++++------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git 
a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index c830a79e..67ea48d0 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -25,22 +25,20 @@ import logging import multiprocessing -import os import warnings -from itertools import chain from typing import List, Optional, Union import numpy as np import pandas as pd -from ms2pip import correlate +from ms2pip import process_MS2_spectra from ms2pip.exceptions import NoMatchingSpectraFound from ms2pip.result import ProcessingResult + from psm_utils import PSMList from rich.progress import track -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -182,40 +180,14 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding MS²PIP-derived features to PSMs.") - psm_dict = psm_list.get_psm_dict() - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - for run, psms in runs.items(): - logger.info( - f"Running MS²PIP for PSMs from run ({current_run}/{total_runs}) `{run}`..." - ) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") - try: - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - ms2pip_results = correlate( - psms=psm_list_run, - spectrum_file=str(spectrum_filename), - spectrum_id_pattern=self.spectrum_id_pattern, - model=self.model, - ms2_tolerance=self.ms2_tolerance, - compute_correlations=False, - model_dir=self.model_dir, - processes=self.processes, - ) - except NoMatchingSpectraFound as e: - raise FeatureGeneratorException( - f"Could not find any matching spectra for PSMs from run `{run}`. " - "Please check that the `spectrum_id_pattern` and `psm_id_pattern` " - "options are configured correctly. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#mapping-psms-to-spectra" - " for more information." 
- ) from e - self._calculate_features(psm_list_run, ms2pip_results) - current_run += 1 + # temporarily remove spectrum from psm_utils before multiprocessing + ms2pip_results = process_MS2_spectra( + psms=psm_list, + model=self.model, + model_dir=self.model_dir, + processes=self.processes, + ) + self._calculate_features(psm_list, ms2pip_results) def _calculate_features( self, psm_list: PSMList, ms2pip_results: List[ProcessingResult] From 3091b0fad54f907651a354659aa1d46835362735 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:45:43 +0100 Subject: [PATCH 38/54] parsing spectra once and storing spectra objects --- ms2rescore/core.py | 2 - ms2rescore/parse_spectra.py | 164 ++++++++++++++++++------------------ 2 files changed, 80 insertions(+), 86 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 2c701ca4..4384c6f7 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -121,8 +121,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: if "mumble" in config["psm_generator"]: # Remove PSMs where matched_ions_pct drops 25% below the original hit psm_list = filter_mumble_psms(psm_list, threshold=0.75) - # Currently replace the score with the hyperscore for Mumble - # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix # Write feature names to file _write_feature_names(feature_names, output_file_root) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 103619c6..acff93e0 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -7,7 +7,8 @@ from typing import Optional, Set, Tuple import numpy as np -from ms2rescore_rs import Precursor, get_precursor_info +from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum +from ms2pip.spectrum import ObservedSpectrum from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError @@ -24,7 +25,6 @@ class MSDataType(str, Enum): precursor_mz = "precursor m/z" ms2_spectra = "MS2 spectra" - # Mimic behavior of StrEnum (Python >=3.11) def __str__(self): return self.value @@ -69,7 +69,7 @@ def add_precursor_values( # Check which data types are missing # Missing if: all values are 0, OR any values are None/NaN missing_data_types = set() - if spectrum_path is None: + if MSDataType.ms2_spectra in required_data_types: missing_data_types.add(MSDataType.ms2_spectra) rt_values = np.asarray(psm_list["retention_time"]) @@ -84,6 +84,8 @@ def add_precursor_values( if np.any(np.isnan(mz_values)) or np.all(mz_values == 0): missing_data_types.add(MSDataType.precursor_mz) + LOGGER.debug("Missing data types: %s", missing_data_types) + # Find data types that are both missing and required data_types_to_parse = missing_data_types & required_data_types @@ -104,36 +106,59 @@ def add_precursor_values( # Get precursor values from spectrum files LOGGER.info("Parsing precursor info from spectrum files...") - mz, rt, im = _get_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + _add_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + + # Extract precursor values from MS2 spectrum objects in a single pass + precursor_data = [ + (ms2.precursor.rt, ms2.precursor.im, ms2.precursor.mz) for ms2 in psm_list["spectrum"] + ] + rts, ims, mzs = map(np.array, zip(*precursor_data)) # Determine which data types were successfully found in spectrum files - # ms2rescore_rs always returns 0.0 for missing values - found_data_types = 
{MSDataType.ms2_spectra} # MS2 spectra available when processing files - if np.all(rt != 0.0): + found_data_types = {MSDataType.ms2_spectra} + + # Add found data types: if missing and all zeros, raise error + if not np.all(rts == 0.0): found_data_types.add(MSDataType.retention_time) - if np.all(im != 0.0): + if MSDataType.retention_time in data_types_to_parse: + LOGGER.debug( + "Missing retention time values in PSM list. Updating from spectrum files." + ) + psm_list["retention_time"] = rts + elif MSDataType.retention_time in data_types_to_parse: + raise SpectrumParsingError( + "Retention time values are required but not available in spectrum files " + "(all values are zero)." + ) + + if not np.all(ims == 0.0): found_data_types.add(MSDataType.ion_mobility) - if np.all(mz != 0.0): + if MSDataType.ion_mobility in data_types_to_parse: + LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") + psm_list["ion_mobility"] = ims + elif MSDataType.ion_mobility in data_types_to_parse: + raise SpectrumParsingError( + "Ion mobility values are required but not available in spectrum files " + "(all values are zero)." + ) + + if np.all(mzs == 0.0): found_data_types.add(MSDataType.precursor_mz) + if MSDataType.precursor_mz in data_types_to_parse: + LOGGER.debug("Missing precursor m/z values in PSM list. Updating from spectrum files.") + psm_list["precursor_mz"] = mzs + elif MSDataType.precursor_mz in data_types_to_parse: + raise SpectrumParsingError( + "Precursor m/z values are required but not available in spectrum files " + "(all values are zero)." + ) - # Update PSM list with missing precursor values that were found - update_types = data_types_to_parse & found_data_types - - if MSDataType.retention_time in update_types: - LOGGER.debug("Missing retention time values in PSM list. Updating from spectrum files.") - psm_list["retention_time"] = rt - if MSDataType.ion_mobility in update_types: - LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") - psm_list["ion_mobility"] = im - if MSDataType.precursor_mz in update_types: - LOGGER.debug("Missing precursor m/z values in PSM list. 
Updating from spectrum files.") - psm_list["precursor_mz"] = mz - elif ( + # Check if precursor m/z values are consistent between PSMs and spectrum files + if ( MSDataType.precursor_mz not in missing_data_types and MSDataType.precursor_mz in found_data_types ): - # Check if precursor m/z values are consistent between PSMs and spectrum files - mz_diff = np.abs(psm_list["precursor_mz"] - mz) + mz_diff = np.abs(psm_list["precursor_mz"] - mzs) if np.mean(mz_diff) > 1e-2: LOGGER.warning( "Mismatch between precursor m/z values in PSM list and spectrum files (mean " @@ -149,20 +174,22 @@ def add_precursor_values( return available_data_types -def _apply_spectrum_id_pattern( - precursors: dict[str, Precursor], pattern: str +def _acquire_observed_spectra_dict( + ms2: list[MS2Spectrum], pattern: str, spectrum_ids: list[str] ) -> dict[str, Precursor]: """Apply spectrum ID pattern to precursor IDs.""" # Map precursor IDs using regex pattern compiled_pattern = re.compile(pattern) - id_mapping = { - match.group(1): spectrum_id - for spectrum_id in precursors.keys() - if (match := compiled_pattern.search(spectrum_id)) is not None + + ms2_observed_spectra_mapping = { + match.group(1): ms2_spectrum + for ms2_spectrum in ms2 + if (match := compiled_pattern.search(str(ms2_spectrum.identifier))) is not None + and match.group(1) in spectrum_ids } # Validate that any IDs were matched - if not id_mapping: + if not ms2_observed_spectra_mapping: raise MS2RescoreConfigurationError( "'spectrum_id_pattern' did not match any spectrum-file IDs. Please check and try " "again. See " @@ -170,64 +197,33 @@ def _apply_spectrum_id_pattern( "for more information." ) - # Validate that the same number of unique IDs were matched - elif len(id_mapping) != len(precursors): - new_id, old_id = next(iter(id_mapping.items())) - raise MS2RescoreConfigurationError( - "'spectrum_id_pattern' resulted in a different number of unique spectrum IDs. This " - "indicates issues with the regex pattern. Please check and try again. " - f"Example old ID: '{old_id}' -> new ID: '{new_id}'. " - "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " - "for more information." - ) - - precursors = {new_id: precursors[orig_id] for new_id, orig_id in id_mapping.items()} - - return precursors + return ms2_observed_spectra_mapping -def _get_precursor_values( +def _add_precursor_values( psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: Optional[str] = None -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: +) -> None: """Get precursor m/z, RT, and IM from spectrum files.""" # Iterate over different runs in PSM list - precursor_dict = dict() - psm_dict = psm_list.get_psm_dict() - for runs in psm_dict.values(): - for run_name, psms in runs.items(): - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_file = infer_spectrum_path(spectrum_path, run_name) - - LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) - precursors: dict[str, Precursor] = get_precursor_info(str(spectrum_file)) - - # Parse spectrum IDs with regex pattern if provided - if spectrum_id_pattern: - precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern) - - # Ensure all PSMs have precursor values - for psm in psm_list_run: - if psm.spectrum_id not in precursors: - raise MS2RescoreConfigurationError( - "Mismatch between PSM and spectrum file IDs. 
Could not find precursor " - f"values for PSM with ID {psm.spectrum_id} in run {run_name}.\n" - "Please check that the `spectrum_id_pattern` and `psm_id_pattern` options " - "are configured correctly. See " - "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra" - " for more information.\n" - f"Example ID from PSM file: {psm.spectrum_id}\n" - f"Example ID from spectrum file: {list(precursors.keys())[0]}" - ) - - # Store precursor values in dictionary - precursor_dict[run_name] = precursors - - # Reshape precursor values into arrays matching PSM list - mzs = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list), float) - rts = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list), float) - ims = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list), float) - - return mzs, rts, ims + if spectrum_id_pattern is None: + spectrum_id_pattern = r"^(.*)$" # Match entire identifier if no pattern provided + + for run_name in set(psm_list["run"]): + run_mask = psm_list["run"] == run_name + psm_list_run = psm_list[run_mask] + spectrum_file = infer_spectrum_path(spectrum_path, run_name) + + LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) + ms2_spectra: list[MS2Spectrum] = get_ms2_spectra(str(spectrum_file)) + + # Parse spectrum IDs with regex pattern if provided + ms2_spectra_dict = _acquire_observed_spectra_dict( + ms2_spectra, spectrum_id_pattern, psm_list_run["spectrum_id"] + ) + + psm_list_run["spectrum"] = [ + ms2_spectra_dict[spec_id] for spec_id in psm_list_run["spectrum_id"] + ] class SpectrumParsingError(MS2RescoreError): From 5b3d4c466b5814a044f79b866696a74af53ec627 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:46:06 +0100 Subject: [PATCH 39/54] directly operate on spectra objects instead of reacquiring them --- ms2rescore/feature_generators/ms2.py | 148 ++++++++++----------------- 1 file changed, 56 insertions(+), 92 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index c5ae3ba9..ebeff27b 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -6,17 +6,13 @@ import logging import re from collections import defaultdict -from itertools import chain from typing import List, Optional import numpy as np from psm_utils import PSMList -from rustyms import FragmentationModel, CompoundPeptidoformIon, MassMode, RawSpectrum -from ms2rescore_rs import get_ms2_spectra +from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum -from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase -from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -83,49 +79,17 @@ def feature_names(self) -> List[str]: "matched_y_ions_pct", "matched_ions_pct", "hyperscore", - "mean_error_top7", - "sq_mean_error_top7", - "stdev_error_top7", + # "mean_error_top7", + # "sq_mean_error_top7", + # "stdev_error_top7", ] def add_features(self, psm_list: PSMList) -> None: """Add MS2-derived features to PSMs.""" logger.info("Adding MS2-derived features to PSMs.") - psm_dict = psm_list.get_psm_dict() - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - for run, psms in runs.items(): - logger.info( - f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..." 
- ) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - - self._calculate_features(psm_list_run, spectrum_filename) - current_run += 1 - - def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: - """Retrieve annotated spectra for all psms.""" - - spectra = get_ms2_spectra(str(spectrum_file)) - spectrum_id_pattern = re.compile( - self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" - ) - try: - spectra_dict = { - spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum - for spectrum in spectra - } - except AttributeError: - raise ParseSpectrumError( - "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." - ) - for psm in psm_list: - annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id]) + annotated_spectrum = self._annotate_spectrum(psm) psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) @@ -149,46 +113,46 @@ def _longest_ion_sequence(lst): return max_sequence - @staticmethod - def _calculate_top7_peak_features(annotated_spectrum): - """ - Calculate "top 7 peak"-related features using mass errors. - The following features are calculated: - - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - highest intensities - - sq_mean_error_top7: Squared MeanErrorTop7 - - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - peaks with the highest intensities - """ - if not annotated_spectrum: - return 0.0, 0.0, 0.0 - - # Collect peaks with annotations (matched peaks) and their mass errors - peak_data = [] - for peak in annotated_spectrum: - if peak.annotation: - for matched_ion in peak.annotation: - # Calculate mass error (ppm) between observed and theoretical m/z - theoretical_mz = matched_ion.mz - observed_mz = peak.mz - mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 - peak_data.append((peak.intensity, mass_error)) - - if len(peak_data) == 0: - return 0.0, 0.0, 0.0 - - # Sort by intensity and get top 7 - peak_data.sort(key=lambda x: x[0], reverse=True) - top7_errors = [error for _, error in peak_data[:7]] - - if len(top7_errors) == 0: - return 0.0, 0.0, 0.0 - - mean_error_top7 = np.mean(top7_errors) - sq_mean_error_top7 = mean_error_top7**2 - stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 - - return mean_error_top7, sq_mean_error_top7, stdev_error_top7 + # @staticmethod + # def _calculate_top7_peak_features(annotated_spectrum): + # """ + # Calculate "top 7 peak"-related features using mass errors. 
+ # The following features are calculated: + # - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the + # highest intensities + # - sq_mean_error_top7: Squared MeanErrorTop7 + # - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion + # peaks with the highest intensities + # """ + # if not annotated_spectrum: + # return 0.0, 0.0, 0.0 + + # # Collect peaks with annotations (matched peaks) and their mass errors + # peak_data = [] + # for peak in annotated_spectrum: + # if peak.annotation: + # for matched_ion in peak.annotation: + # # Calculate mass error (ppm) between observed and theoretical m/z + # theoretical_mz = matched_ion.mz + # observed_mz = peak.mz + # mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 + # peak_data.append((peak.intensity, mass_error)) + + # if len(peak_data) == 0: + # return 0.0, 0.0, 0.0 + + # # Sort by intensity and get top 7 + # peak_data.sort(key=lambda x: x[0], reverse=True) + # top7_errors = [error for _, error in peak_data[:7]] + + # if len(top7_errors) == 0: + # return 0.0, 0.0, 0.0 + + # mean_error_top7 = np.mean(top7_errors) + # sq_mean_error_top7 = mean_error_top7**2 + # stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 + + # return mean_error_top7, sq_mean_error_top7, stdev_error_top7 def _calculate_spectrum_features(self, psm, annotated_spectrum): @@ -225,9 +189,9 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): y_ion_matched_sum = np.sum(features["y_ion_matched"]) # Calculate top 7 peak features (MaxQuant-derived) - mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( - annotated_spectrum - ) + # mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( + # annotated_spectrum + # ) return { "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), @@ -249,24 +213,24 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) / (len(b_ions_matched) + len(y_ions_matched)), - "mean_error_top7": mean_error_top7, - "sq_mean_error_top7": sq_mean_error_top7, - "stdev_error_top7": stdev_error_top7, + # "mean_error_top7": mean_error_top7, + # "sq_mean_error_top7": sq_mean_error_top7, + # "stdev_error_top7": stdev_error_top7, } - def _annotate_spectrum(self, psm, spectrum): + def _annotate_spectrum(self, psm): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=spectrum.precursor.mz, - mz_array=spectrum.mz, - intensity_array=spectrum.intensity, + precursor_mass=psm.spectrum.precursor.mz, + mz_array=psm.spectrum.mz, + intensity_array=psm.spectrum.intensity, ) try: - linear_peptide = CompoundPeptidoformIon(psm.peptidoform.proforma.split("/")[0]) + linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( peptide=linear_peptide, model=self.fragmentation_model, From a3cbb1b1e45fb8888a633f4e30f5f2b53b5f15f2 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:46:21 +0100 Subject: [PATCH 40/54] updated profiling --- ms2rescore/__main__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py index 85309ffc..52e377f3 100644 --- a/ms2rescore/__main__.py +++ b/ms2rescore/__main__.py @@ -6,6 +6,7 @@ import json 
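The profiling tweak in this hunk (body just below) appends a timestamp to the cProfile dump so repeated runs no longer overwrite each other. As a small usage sketch, such a dump can be inspected afterwards with the standard-library pstats module; the file name used here is a made-up example of the timestamped pattern.

import pstats

# Load a dump produced by profiler.dump_stats(...) and list the slowest calls.
stats = pstats.Stats("rescore_output.profile_20260112_104500.prof")
stats.sort_stats("cumulative").print_stats(20)  # top 20 entries by cumulative time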
import logging import sys +from datetime import datetime from pathlib import Path from typing import Union @@ -196,7 +197,13 @@ def profile(fnc, filepath): def inner(*args, **kwargs): with cProfile.Profile() as profiler: return_value = fnc(*args, **kwargs) - profiler.dump_stats(filepath + ".profile.prof") + + # Add timestamp to profiler output filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + profile_filename = f"{filepath}.profile_{timestamp}.prof" + profiler.dump_stats(profile_filename) + LOGGER.info(f"Profile data written to: {profile_filename}") + return return_value return inner @@ -248,6 +255,7 @@ def main(tims=False): # Run MS²Rescore try: if config["ms2rescore"]["profile"]: + LOGGER.info("Profiling enabled") profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"]) profiled_rescore(configuration=config) else: From a1df72d346b181b3ef1d1478cd919ea5f3cd787c Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:46:37 +0100 Subject: [PATCH 41/54] removed maxquant generator from fg --- ms2rescore/feature_generators/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index ac09923e..4956868c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -7,7 +7,6 @@ from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator -from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator @@ -15,7 +14,6 @@ "basic": BasicFeatureGenerator, "ms2pip": MS2PIPFeatureGenerator, "deeplc": DeepLCFeatureGenerator, - "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, "ms2": MS2FeatureGenerator, From 2c7a09b7392895ab85eb29390aad5b456f1ee36e Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 5 Jan 2026 13:09:04 +0100 Subject: [PATCH 42/54] changes to column names --- ms2rescore/feature_generators/im2deep.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index b42fabc9..12094306 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -90,10 +90,8 @@ def add_features(self, psm_list: PSMList) -> None: ] psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] - psm_list_df["sequence"] = psm_list_df["peptidoform"].apply( - lambda x: x.proforma.split("/")[0] - ) - psm_list_df["ccs_observed"] = im2ccs( + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.proforma) + psm_list_df["ccs_observed_im2deep"] = im2ccs( psm_list_df["ion_mobility"], psm_list_df["precursor_mz"], psm_list_df["charge"], @@ -115,7 +113,10 @@ def add_features(self, psm_list: PSMList) -> None: run_shift_dict = {} for run in psm_list_df["run"].unique(): cal_run_psm_df = self.make_calibration_df(psm_list_df[psm_list_df["run"] == run]) - # Extract sequence from peptidoform for calculate_ccs_shift + # Rename for calculate_ccs_shift compatibility + cal_run_psm_df = cal_run_psm_df.rename( + columns={"ccs_observed_im2deep": "ccs_observed"} + ) shift = calculate_ccs_shift( 
cal_df=cal_run_psm_df, reference_dataset=reference_dataset, per_charge=True ) @@ -131,13 +132,12 @@ def add_features(self, psm_list: PSMList) -> None: psm_list_df["ccs_predicted_im2deep"] = ( psm_list_df["ccs_predicted"] + psm_list_df["ccs_shift"] ) - psm_list_df.rename(columns={"ccs_predicted": "ccs_predicted_im2deep"}, inplace=True) psm_list_df["ccs_error_im2deep"] = ( - psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed"] + psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed_im2deep"] ) psm_list_df["abs_ccs_error_im2deep"] = np.abs(psm_list_df["ccs_error_im2deep"]) psm_list_df["perc_ccs_error_im2deep"] = ( - np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed"] * 100 + np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed_im2deep"] * 100 ) psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") From e855abf42fa3c8662820f90571aeff0c94eeec54 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 5 Jan 2026 14:39:48 +0100 Subject: [PATCH 43/54] changes to avoid out of memory error due to multiprocessing --- ms2rescore/feature_generators/ms2pip.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 67ea48d0..3fdb62ad 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -194,13 +194,16 @@ def _calculate_features( ) -> None: """Calculate features from all MS²PIP results and add to PSMs.""" logger.debug("Calculating features from predicted spectra") - with multiprocessing.Pool(int(self.processes)) as pool: + # Use spawn context to avoid fork memory duplication issues + # maxtasksperchild recycles workers to free accumulated memory + ctx = multiprocessing.get_context("spawn") + with ctx.Pool(int(self.processes), maxtasksperchild=500) as pool: # Use imap, so we can use a progress bar counts_failed = 0 for result, features in zip( ms2pip_results, track( - pool.imap(self._calculate_features_single, ms2pip_results, chunksize=1000), + pool.imap(self._calculate_features_single, ms2pip_results, chunksize=250), total=len(ms2pip_results), description="Calculating features...", transient=True, From 577df191ddef858e33d4bef3bd6849b6fd4c8509 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 5 Jan 2026 14:40:14 +0100 Subject: [PATCH 44/54] replace list with set to reduce lookup time to O(1) --- ms2rescore/parse_spectra.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index acff93e0..9b58c1da 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -180,12 +180,13 @@ def _acquire_observed_spectra_dict( """Apply spectrum ID pattern to precursor IDs.""" # Map precursor IDs using regex pattern compiled_pattern = re.compile(pattern) + spectrum_ids_set = set(spectrum_ids) # For faster lookup ms2_observed_spectra_mapping = { match.group(1): ms2_spectrum for ms2_spectrum in ms2 if (match := compiled_pattern.search(str(ms2_spectrum.identifier))) is not None - and match.group(1) in spectrum_ids + and match.group(1) in spectrum_ids_set } # Validate that any IDs were matched From a80238b1dc105eda164df8ca5762304a78b5c7a0 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 10:50:04 +0100 Subject: [PATCH 45/54] remove unused imports --- ms2rescore/parse_spectra.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 9b58c1da..d897c40c 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -3,12 +3,11 @@ import logging import re from enum import Enum -from itertools import chain -from typing import Optional, Set, Tuple +from typing import Optional, Set import numpy as np from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum -from ms2pip.spectrum import ObservedSpectrum + from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError From abf66b4396a10ef4b150ab2031789ba99ffe73d7 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 10:50:23 +0100 Subject: [PATCH 46/54] migrate ms2 and ms2pip features to ms2rescore-rs --- ms2rescore/feature_generators/ms2.py | 271 ++---------------------- ms2rescore/feature_generators/ms2pip.py | 210 +++--------------- 2 files changed, 47 insertions(+), 434 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index ebeff27b..6ef573a2 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,29 +4,16 @@ """ import logging -import re -from collections import defaultdict + from typing import List, Optional import numpy as np from psm_utils import PSMList -from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum - +from ms2rescore_rs import batch_ms2_features_from_spectra from ms2rescore.feature_generators.base import FeatureGeneratorBase logger = logging.getLogger(__name__) -FRAGMENTATION_MODELS = { - "cidhcd": FragmentationModel.CidHcd, - "etd": FragmentationModel.Etd, - "ethcd": FragmentationModel.Ethcd, - "all": FragmentationModel.All, -} -MASS_MODES = { - "average": MassMode.Average, - "monoisotopic": MassMode.Monoisotopic, -} - class MS2FeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -36,10 +23,10 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", - fragmentation_model: str = "All", - mass_mode: str = "Monoisotopic", + fragmentation_model: str = "cidhcd", + mass_mode: str = "monoisotopic", processes: int = 1, - calculate_hyperscore: bool = True, # Allow optional ? + calculate_hyperscore: bool = True, **kwargs, ) -> None: """ @@ -58,8 +45,8 @@ def __init__( self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern - self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] - self.mass_mode = MASS_MODES[mass_mode.lower()] + self.fragmentation_model = fragmentation_model.lower() + self.mass_mode = mass_mode.lower() self.processes = processes self.calculate_hyperscore = calculate_hyperscore @@ -79,240 +66,24 @@ def feature_names(self) -> List[str]: "matched_y_ions_pct", "matched_ions_pct", "hyperscore", - # "mean_error_top7", - # "sq_mean_error_top7", - # "stdev_error_top7", ] def add_features(self, psm_list: PSMList) -> None: - """Add MS2-derived features to PSMs.""" - logger.info("Adding MS2-derived features to PSMs.") - for psm in psm_list: - annotated_spectrum = self._annotate_spectrum(psm) - psm.rescoring_features.update( - self._calculate_spectrum_features(psm, annotated_spectrum) - ) - - if self.calculate_hyperscore: - # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) 
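The hyperscore that the removed Python path below computed, and that the Rust-backed batch_ms2_features_from_spectra call is expected to keep providing, is log(n_b! * n_y! * (sum of matched b-ion intensities + sum of matched y-ion intensities)), where an empty ion series contributes 1 to the intensity term and only b/y ions up to charge 2 without neutral losses are counted. A minimal stand-alone sketch of that formula; expressing the factorials through math.lgamma is a choice made for numerical stability in this sketch, not something the original code did.

import math
from typing import Sequence

def hyperscore(b_intensities: Sequence[float], y_intensities: Sequence[float]) -> float:
    """log(n_b! * n_y! * (sum_b + sum_y)), written in log space to avoid huge intermediates."""
    n_b, n_y = len(b_intensities), len(y_intensities)
    # Empty ion series contribute 1 to the intensity term, as in the removed implementation.
    sum_b = sum(b_intensities) if b_intensities else 1.0
    sum_y = sum(y_intensities) if y_intensities else 1.0
    # lgamma(n + 1) == log(n!), so the product of factorials becomes a sum of log terms.
    return math.lgamma(n_b + 1) + math.lgamma(n_y + 1) + math.log(sum_b + sum_y)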
- b, y = get_by_fragments(annotated_spectrum) - hs = calculate_hyperscore( - n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b - ) - psm.rescoring_features.update({"hyperscore": hs}) - - @staticmethod - def _longest_ion_sequence(lst): - max_sequence = 0 - current_sequence = 0 - - for value in lst: - current_sequence = current_sequence + 1 if value else 0 - max_sequence = max(max_sequence, current_sequence) - - return max_sequence - - # @staticmethod - # def _calculate_top7_peak_features(annotated_spectrum): - # """ - # Calculate "top 7 peak"-related features using mass errors. - # The following features are calculated: - # - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - # highest intensities - # - sq_mean_error_top7: Squared MeanErrorTop7 - # - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - # peaks with the highest intensities - # """ - # if not annotated_spectrum: - # return 0.0, 0.0, 0.0 - - # # Collect peaks with annotations (matched peaks) and their mass errors - # peak_data = [] - # for peak in annotated_spectrum: - # if peak.annotation: - # for matched_ion in peak.annotation: - # # Calculate mass error (ppm) between observed and theoretical m/z - # theoretical_mz = matched_ion.mz - # observed_mz = peak.mz - # mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 - # peak_data.append((peak.intensity, mass_error)) - - # if len(peak_data) == 0: - # return 0.0, 0.0, 0.0 - - # # Sort by intensity and get top 7 - # peak_data.sort(key=lambda x: x[0], reverse=True) - # top7_errors = [error for _, error in peak_data[:7]] - - # if len(top7_errors) == 0: - # return 0.0, 0.0, 0.0 - - # mean_error_top7 = np.mean(top7_errors) - # sq_mean_error_top7 = mean_error_top7**2 - # stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 - - # return mean_error_top7, sq_mean_error_top7, stdev_error_top7 - - def _calculate_spectrum_features(self, psm, annotated_spectrum): - - if not annotated_spectrum: - return {} - - features = defaultdict(list) - b_ions_matched = [False] * (len(psm.peptidoform.sequence)) - y_ions_matched = [False] * (len(psm.peptidoform.sequence)) - - pseudo_count = 0.00001 - ion_fragment_regex = re.compile(r"\d+") - - for peak in annotated_spectrum: - features["total_intensity"].append(peak.intensity) - - if peak.annotation: - features["matched_intensity"].append(peak.intensity) - for matched_ion in peak.annotation: - if "y" in matched_ion.ion: - features["y_ion_matched"].append(peak.intensity) - y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( - True - ) - elif "b" in matched_ion.ion: - features["b_ion_matched"].append(peak.intensity) - b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( - True - ) - total_intensity_sum = np.sum(features["total_intensity"]) - matched_intensity_sum = np.sum(features["matched_intensity"]) - b_ion_matched_sum = np.sum(features["b_ion_matched"]) - y_ion_matched_sum = np.sum(features["y_ion_matched"]) - - # Calculate top 7 peak features (MaxQuant-derived) - # mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( - # annotated_spectrum - # ) - - return { - "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), - "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), - "ln_explained_intensity_ratio": np.log( - matched_intensity_sum / total_intensity_sum + pseudo_count - ), - "ln_explained_b_ion_ratio": np.log( - b_ion_matched_sum / 
matched_intensity_sum + pseudo_count - ), - "ln_explained_y_ion_ratio": np.log( - y_ion_matched_sum / matched_intensity_sum + pseudo_count - ), - "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), - "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), - "matched_b_ions": sum(b_ions_matched), - "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), - "matched_y_ions": sum(y_ions_matched), - "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), - "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) - / (len(b_ions_matched) + len(y_ions_matched)), - # "mean_error_top7": mean_error_top7, - # "sq_mean_error_top7": sq_mean_error_top7, - # "stdev_error_top7": stdev_error_top7, - } - - def _annotate_spectrum(self, psm): - - spectrum = RawSpectrum( - title=psm.spectrum_id, - num_scans=1, - rt=psm.retention_time, - precursor_charge=psm.get_precursor_charge(), - precursor_mass=psm.spectrum.precursor.mz, - mz_array=psm.spectrum.mz, - intensity_array=psm.spectrum.intensity, + spectra = psm_list["spectrum"] + # Keep parity with your current behavior: + proformas = [psm.peptidoform.proforma.split("/")[0] for psm in psm_list] + seq_lens = [len(psm.peptidoform.sequence) for psm in psm_list] + + feature_dicts = batch_ms2_features_from_spectra( + spectra=spectra, + proformas=proformas, + seq_lens=seq_lens, + fragmentation_model=self.fragmentation_model, + mass_mode=self.mass_mode, + calculate_hyperscore=self.calculate_hyperscore, ) - try: - linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) - annotated_spectrum = spectrum.annotate( - peptide=linear_peptide, - model=self.fragmentation_model, - mode=self.mass_mode, - ) - except: # noqa E722 - return [] - - return annotated_spectrum.spectrum - - -def factorial(n): - """ - Compute factorial of n using a loop. - Parameters: - n (int): Non-negative integer. - Returns: - int: Factorial of n. - """ - result = 1 - for i in range(1, n + 1): - result *= i - return result - - -def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): - """ - Calculate the hyperscore for a peptide-spectrum match. - Parameters: - n_y (int): Number of matched y-ions. - n_b (int): Number of matched b-ions. - y_ion_intensities (list): Intensities of matched y-ions. - b_ion_intensities (list): Intensities of matched b-ions. - Returns: - float: Calculated hyperscore. 
- """ - # Calculate the product of y-ion and b-ion intensities - product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1 - product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1 - - # Calculate factorial using custom function - factorial_y = factorial(n_y) - factorial_b = factorial(n_b) - - # Compute hyperscore - hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b)) - return hyperscore - - -def infer_fragment_identity(frag, allow_ion_types=["b", "y"]): - ion = frag.ion - - is_allowed = False - for allowed_ion_type in allow_ion_types: - if allowed_ion_type in ion: - is_allowed = True - break - - if not is_allowed: - return False - # Radicals - if "·" in ion: - return False - if frag.neutral_loss is not None: - return False - if frag.charge > 2: - return False - - return ion[0] - - -def get_by_fragments(annotated_spectrum): - b_intensities = [] - y_intensities = [] - for peak in annotated_spectrum: - - for fragment in peak.annotation: - - ion_type = infer_fragment_identity(fragment) - if ion_type == "b": - b_intensities.append(peak.intensity) - if ion_type == "y": - y_intensities.append(peak.intensity) - return b_intensities, y_intensities + for psm, feats in zip(psm_list, feature_dicts): + psm.rescoring_features.update(feats) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 3fdb62ad..dcb0100e 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -24,18 +24,13 @@ """ import logging -import multiprocessing -import warnings from typing import List, Optional, Union - import numpy as np -import pandas as pd + from ms2pip import process_MS2_spectra -from ms2pip.exceptions import NoMatchingSpectraFound -from ms2pip.result import ProcessingResult +from ms2rescore_rs import batch_ms2pip_features_numpy from psm_utils import PSMList -from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType @@ -180,7 +175,6 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding MS²PIP-derived features to PSMs.") - # temporarily remove spectrum from psm_utils before multiprocessing ms2pip_results = process_MS2_spectra( psms=psm_list, model=self.model, @@ -189,181 +183,29 @@ def add_features(self, psm_list: PSMList) -> None: ) self._calculate_features(psm_list, ms2pip_results) - def _calculate_features( - self, psm_list: PSMList, ms2pip_results: List[ProcessingResult] - ) -> None: - """Calculate features from all MS²PIP results and add to PSMs.""" - logger.debug("Calculating features from predicted spectra") - # Use spawn context to avoid fork memory duplication issues - # maxtasksperchild recycles workers to free accumulated memory - ctx = multiprocessing.get_context("spawn") - with ctx.Pool(int(self.processes), maxtasksperchild=500) as pool: - # Use imap, so we can use a progress bar - counts_failed = 0 - for result, features in zip( - ms2pip_results, - track( - pool.imap(self._calculate_features_single, ms2pip_results, chunksize=250), - total=len(ms2pip_results), - description="Calculating features...", - transient=True, - ), - ): - if features: - # Cannot use result.psm directly, as it is a copy from MS²PIP multiprocessing - try: - psm_list[result.psm_index]["rescoring_features"].update(features) - except (AttributeError, TypeError): - psm_list[result.psm_index]["rescoring_features"] = features - else: - counts_failed += 1 - - if counts_failed > 0: - 
logger.warning(f"Failed to calculate features for {counts_failed} PSMs") - - def _calculate_features_single(self, processing_result: ProcessingResult) -> Union[dict, None]: - """Calculate MS²PIP-based features for single PSM.""" - if ( - processing_result.observed_intensity is None - or processing_result.predicted_intensity is None - ): - return None - - # Suppress RuntimeWarnings about invalid values - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - - # Convert intensities to arrays - target_b = processing_result.predicted_intensity["b"].clip(np.log2(0.001)) - target_y = processing_result.predicted_intensity["y"].clip(np.log2(0.001)) - target_all = np.concatenate([target_b, target_y]) - prediction_b = processing_result.observed_intensity["b"].clip(np.log2(0.001)) - prediction_y = processing_result.observed_intensity["y"].clip(np.log2(0.001)) - prediction_all = np.concatenate([prediction_b, prediction_y]) - - # Prepare 'unlogged' intensity arrays - target_b_unlog = 2**target_b - 0.001 - target_y_unlog = 2**target_y - 0.001 - target_all_unlog = 2**target_all - 0.001 - prediction_b_unlog = 2**prediction_b - 0.001 - prediction_y_unlog = 2**prediction_y - 0.001 - prediction_all_unlog = 2**prediction_all - 0.001 - - # Calculate absolute differences - abs_diff_b = np.abs(target_b - prediction_b) - abs_diff_y = np.abs(target_y - prediction_y) - abs_diff_all = np.abs(target_all - prediction_all) - abs_diff_b_unlog = np.abs(target_b_unlog - prediction_b_unlog) - abs_diff_y_unlog = np.abs(target_y_unlog - prediction_y_unlog) - abs_diff_all_unlog = np.abs(target_all_unlog - prediction_all_unlog) - - # Compute features - feature_values = [ - # Features between spectra in log space - np.corrcoef(target_all, prediction_all)[0][1], # Pearson all ions - np.corrcoef(target_b, prediction_b)[0][1], # Pearson b ions - np.corrcoef(target_y, prediction_y)[0][1], # Pearson y ions - _mse(target_all, prediction_all), # MSE all ions - _mse(target_b, prediction_b), # MSE b ions - _mse(target_y, prediction_y), # MSE y ions - np.min(abs_diff_all), # min_abs_diff_norm - np.max(abs_diff_all), # max_abs_diff_norm - np.quantile(abs_diff_all, 0.25), # abs_diff_Q1_norm - np.quantile(abs_diff_all, 0.5), # abs_diff_Q2_norm - np.quantile(abs_diff_all, 0.75), # abs_diff_Q3_norm - np.mean(abs_diff_all), # mean_abs_diff_norm - np.std(abs_diff_all), # std_abs_diff_norm - np.min(abs_diff_b), # ionb_min_abs_diff_norm - np.max(abs_diff_b), # ionb_max_abs_diff_norm - np.quantile(abs_diff_b, 0.25), # ionb_abs_diff_Q1_norm - np.quantile(abs_diff_b, 0.5), # ionb_abs_diff_Q2_norm - np.quantile(abs_diff_b, 0.75), # ionb_abs_diff_Q3_norm - np.mean(abs_diff_b), # ionb_mean_abs_diff_norm - np.std(abs_diff_b), # ionb_std_abs_diff_norm - np.min(abs_diff_y), # iony_min_abs_diff_norm - np.max(abs_diff_y), # iony_max_abs_diff_norm - np.quantile(abs_diff_y, 0.25), # iony_abs_diff_Q1_norm - np.quantile(abs_diff_y, 0.5), # iony_abs_diff_Q2_norm - np.quantile(abs_diff_y, 0.75), # iony_abs_diff_Q3_norm - np.mean(abs_diff_y), # iony_mean_abs_diff_norm - np.std(abs_diff_y), # iony_std_abs_diff_norm - np.dot(target_all, prediction_all), # Dot product all ions - np.dot(target_b, prediction_b), # Dot product b ions - np.dot(target_y, prediction_y), # Dot product y ions - _cosine_similarity(target_all, prediction_all), # Cos similarity all ions - _cosine_similarity(target_b, prediction_b), # Cos similarity b ions - _cosine_similarity(target_y, prediction_y), # Cos similarity y ions - # Same features in normal space - 
np.corrcoef(target_all_unlog, prediction_all_unlog)[0][1], # Pearson all - np.corrcoef(target_b_unlog, prediction_b_unlog)[0][1], # Pearson b - np.corrcoef(target_y_unlog, prediction_y_unlog)[0][1], # Pearson y - _spearman(target_all_unlog, prediction_all_unlog), # Spearman all ions - _spearman(target_b_unlog, prediction_b_unlog), # Spearman b ions - _spearman(target_y_unlog, prediction_y_unlog), # Spearman y ions - _mse(target_all_unlog, prediction_all_unlog), # MSE all ions - _mse(target_b_unlog, prediction_b_unlog), # MSE b ions - _mse(target_y_unlog, prediction_y_unlog), # MSE y ions, - # Ion type with min absolute difference - 0 if np.min(abs_diff_b_unlog) <= np.min(abs_diff_y_unlog) else 1, - # Ion type with max absolute difference - 0 if np.max(abs_diff_b_unlog) >= np.max(abs_diff_y_unlog) else 1, - np.min(abs_diff_all_unlog), # min_abs_diff - np.max(abs_diff_all_unlog), # max_abs_diff - np.quantile(abs_diff_all_unlog, 0.25), # abs_diff_Q1 - np.quantile(abs_diff_all_unlog, 0.5), # abs_diff_Q2 - np.quantile(abs_diff_all_unlog, 0.75), # abs_diff_Q3 - np.mean(abs_diff_all_unlog), # mean_abs_diff - np.std(abs_diff_all_unlog), # std_abs_diff - np.min(abs_diff_b_unlog), # ionb_min_abs_diff - np.max(abs_diff_b_unlog), # ionb_max_abs_diff_norm - np.quantile(abs_diff_b_unlog, 0.25), # ionb_abs_diff_Q1 - np.quantile(abs_diff_b_unlog, 0.5), # ionb_abs_diff_Q2 - np.quantile(abs_diff_b_unlog, 0.75), # ionb_abs_diff_Q3 - np.mean(abs_diff_b_unlog), # ionb_mean_abs_diff - np.std(abs_diff_b_unlog), # ionb_std_abs_diff - np.min(abs_diff_y_unlog), # iony_min_abs_diff - np.max(abs_diff_y_unlog), # iony_max_abs_diff - np.quantile(abs_diff_y_unlog, 0.25), # iony_abs_diff_Q1 - np.quantile(abs_diff_y_unlog, 0.5), # iony_abs_diff_Q2 - np.quantile(abs_diff_y_unlog, 0.75), # iony_abs_diff_Q3 - np.mean(abs_diff_y_unlog), # iony_mean_abs_diff - np.std(abs_diff_y_unlog), # iony_std_abs_diff - np.dot(target_all_unlog, prediction_all_unlog), # Dot product all ions - np.dot(target_b_unlog, prediction_b_unlog), # Dot product b ions - np.dot(target_y_unlog, prediction_y_unlog), # Dot product y ions - _cosine_similarity(target_all_unlog, prediction_all_unlog), # Cos similarity all - _cosine_similarity(target_b_unlog, prediction_b_unlog), # Cos similarity b ions - _cosine_similarity(target_y_unlog, prediction_y_unlog), # Cos similarity y ions - ] - - features = dict( - zip( - self.feature_names, - [0.0 if np.isnan(ft) else ft for ft in feature_values], - ) + def _calculate_features(self, psm_list, ms2pip_results): + idx = [] + pred_b = [] + pred_y = [] + obs_b = [] + obs_y = [] + + for r in ms2pip_results: + if r.observed_intensity is None or r.predicted_intensity is None: + continue + idx.append(r.psm_index) + pred_b.append(r.predicted_intensity["b"]) + pred_y.append(r.predicted_intensity["y"]) + obs_b.append(r.observed_intensity["b"]) + obs_y.append(r.observed_intensity["y"]) + + results = batch_ms2pip_features_numpy( + idx, pred_b, pred_y, obs_b, obs_y ) - return features - - -def _spearman(x: np.ndarray, y: np.ndarray) -> float: - """Spearman rank correlation.""" - x = np.array(x) - y = np.array(y) - x_rank = pd.Series(x).rank() - y_rank = pd.Series(y).rank() - return np.corrcoef(x_rank, y_rank)[0][1] - - -def _mse(x: np.ndarray, y: np.ndarray) -> float: - """Mean squared error""" - x = np.array(x) - y = np.array(y) - return np.mean((x - y) ** 2) - - -def _cosine_similarity(x: np.ndarray, y: np.ndarray) -> float: - """Cosine similarity""" - x = np.array(x) - y = np.array(y) - return np.dot(x, y) / 
(np.linalg.norm(x, 2) * np.linalg.norm(y, 2)) + for psm_index, feats in results: + if feats: + try: + psm_list[psm_index]["rescoring_features"].update(feats) + except (AttributeError, TypeError): + psm_list[psm_index]["rescoring_features"] = feats \ No newline at end of file From a9108b9633af7f4f8ac0580692e71052933b65d3 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 10:50:45 +0100 Subject: [PATCH 47/54] reimplement deeplc feature calculation --- ms2rescore/feature_generators/deeplc.py | 381 +++++++++++++++--------- 1 file changed, 241 insertions(+), 140 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 3506d3ea..2e1db888 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -15,23 +15,23 @@ """ -import contextlib import logging -import os -from collections import defaultdict -from inspect import getfullargspec -from itertools import chain from typing import List, Union import numpy as np from psm_utils import PSMList +from deeplc.core import predict, finetune +from deeplc.calibration import SplineTransformerCalibration + from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) +# Suppress verbose onnx2torch logging +logging.getLogger("onnx2torch").setLevel(logging.WARNING) + class DeepLCFeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -41,9 +41,7 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase): def __init__( self, *args, - lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, - calibration_set: Union[str, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -55,8 +53,7 @@ def __init__( Parameters ---------- - lower_score_is_better - Whether a lower PSM score denotes a better matching PSM. Default: False + calibration_set_size: int or float Amount of best PSMs to use for DeepLC calibration. If this value is lower than the number of available PSMs, all PSMs will be used. 
(default: 0.15) @@ -73,35 +70,42 @@ def __init__( """ super().__init__(*args, **kwargs) - self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size - self.calibration_set = calibration_set - self.processes = processes self.deeplc_kwargs = kwargs or {} self._verbose = logger.getEffectiveLevel() <= logging.DEBUG - # Lazy-load DeepLC - from deeplc import DeepLC + self.model = self.deeplc_kwargs.get("model", None) - self.DeepLC = DeepLC + self.calibration = None - # Remove any kwargs that are not DeepLC arguments - self.deeplc_kwargs = { - k: v for k, v in self.deeplc_kwargs.items() if k in getfullargspec(DeepLC).args - } - self.deeplc_kwargs.update({"config_file": None}) + # Prepare DeepLC predict kwargs + self.predict_kwargs = { + k: v for k, v in self.deeplc_kwargs.items() if k in ["device", "batch_size"] + } # getfullargspec(predict).args does not work on this outer predict function + self.predict_kwargs["num_workers"] = processes - # Set default DeepLC arguments + # Prepare DeepLC finetune kwargs if "deeplc_retrain" not in self.deeplc_kwargs: self.deeplc_kwargs["deeplc_retrain"] = False - - self.deeplc_predictor = None - if "path_model" in self.deeplc_kwargs: - self.user_model = self.deeplc_kwargs.pop("path_model") - logging.debug(f"Using user-provided DeepLC model {self.user_model}.") - else: - self.user_model = None + return # skip the rest of the init if no retraining + + if self.deeplc_kwargs["deeplc_retrain"]: + self.finetune_kwargs = { + k: v + for k, v in self.deeplc_kwargs.items() + if k + in [ + "epochs", + "device", + "batch_size", + "learning_rate", + "patience", + "trainable_layers", + "validation_split", + ] + } + self.finetune_kwargs["num_workers"] = processes @property def feature_names(self) -> List[str]: @@ -118,131 +122,228 @@ def add_features(self, psm_list: PSMList) -> None: """Add DeepLC-derived features to PSMs.""" logger.info("Adding DeepLC-derived features to PSMs.") + psm_list_df = psm_list.to_dataframe() + psm_list_df = psm_list_df[ + [ + "peptidoform", + "retention_time", + "run", + "qvalue", + "is_decoy", + ] + ] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.modified_sequence) - # Get easy-access nested version of PSMList - psm_dict = psm_list.get_psm_dict() - - # Run DeepLC for each spectrum file - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - for runs in psm_dict.values(): - # Reset DeepLC predictor for each collection of runs - self.deeplc_predictor = None - self.selected_model = None - for run, psms in runs.items(): - peptide_rt_diff_dict = defaultdict( - lambda: { - "observed_retention_time_best": np.inf, - "predicted_retention_time_best": np.inf, - "rt_diff_best": np.inf, - } - ) - logger.info( - f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." - ) + if self.deeplc_kwargs["deeplc_retrain"]: + # Filter high-confidence target PSMs once for transfer learning + target_mask = (psm_list["qvalue"] <= 0.01) & (~psm_list["is_decoy"]) + target_psms = psm_list[target_mask] + + # Determine best run for transfer learning + best_run = self._best_run_by_shared_proteoforms( + target_psms["run"], + target_psms["peptidoform"], + ) + + # Fine-tune on best run + best_run_psms = target_psms[target_psms["run"] == best_run] + logger.debug( + f"Fine-tuning DeepLC on run '{best_run}'... 
with {len(best_run_psms)} PSMs" + ) + self.model = finetune( + best_run_psms, + model=self.model, + train_kwargs=self.finetune_kwargs, + ) + + # Predict retention times for all PSMs + logger.debug("Predicting retention times with DeepLC...") + psm_list_df["predicted_retention_time"] = predict( + psm_list, model=self.model, predict_kwargs=self.predict_kwargs + ) + + # Calibrate predictions per run + for run in psm_list_df["run"].unique(): + run_df = psm_list_df[psm_list_df["run"] == run].copy() + + # Get calibration data (target PSMs only) + observed_rt_calibration, predicted_rt_calibration = self._get_calibration_data(run_df) + + # Fit calibration and transform all predictions for this run + calibration = SplineTransformerCalibration() + calibration.fit( + target=observed_rt_calibration, + source=predicted_rt_calibration, + ) - # Disable wild logging to stdout by Tensorflow, unless in debug mode - with ( - contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) - if not self._verbose - else contextlib.nullcontext() - ): - # Make new PSM list for this run (chain PSMs per spectrum to flat list) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - psm_list_calibration = self._get_calibration_psms(psm_list_run) - logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") - self.deeplc_predictor = self.DeepLC( - n_jobs=self.processes, - verbose=self._verbose, - path_model=self.selected_model or self.user_model, - **self.deeplc_kwargs, - ) - self.deeplc_predictor.calibrate_preds(psm_list_calibration) - # Still calibrate for each run, but do not try out all model options. - # Just use model that was selected based on first run - if not self.selected_model: - self.selected_model = list(self.deeplc_predictor.model.keys()) - self.deeplc_kwargs["deeplc_retrain"] = False - logger.debug( - f"Selected DeepLC model {self.selected_model} based on " - "calibration of first run. Using this model (after new " - "calibrations) for the remaining runs." 
- ) - - logger.debug("Predicting retention times...") - predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run)) - observations = psm_list_run["retention_time"] - rt_diffs_run = np.abs(predictions - observations) - - logger.debug("Adding features to PSMs...") - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "observed_retention_time": observations[i], - "predicted_retention_time": predictions[i], - "rt_diff": rt_diffs_run[i], - } - ) - peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge - if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]: - peptide_rt_diff_dict[peptide] = { - "observed_retention_time_best": observations[i], - "predicted_retention_time_best": predictions[i], - "rt_diff_best": rt_diffs_run[i], - } - for psm in psm_list_run: - psm["rescoring_features"].update( - peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] - ) - current_run += 1 - - def _get_calibration_psms(self, psm_list: PSMList): - """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[ - ~psm_list["is_decoy"] - & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]] + calibrated_rt = calibration.transform(run_df["predicted_retention_time"].values) + + # Update predictions with calibrated values + psm_list_df.loc[psm_list_df["run"] == run, "predicted_retention_time"] = calibrated_rt + + psm_list_df["observed_retention_time"] = psm_list_df["retention_time"] + psm_list_df["rt_diff"] = ( + psm_list_df["observed_retention_time"] - psm_list_df["predicted_retention_time"] + ).abs() + psm_list_df_best = psm_list_df.loc[ + psm_list_df.groupby(["run", "sequence"])["rt_diff"].idxmin() ] - if self.calibration_set_size: - n_psms = self._get_number_of_calibration_psms(psm_list_targets) - indices = np.argsort(psm_list_targets["score"]) - indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:] - return psm_list_targets[indices] - else: - identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01] - if len(identified_psms) == 0: - raise ValueError( - "No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc." - ) - elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]): - logger.warning( - " Less than 500 target PSMs with q-value <= 0.01 found for retraining. Consider turning of deeplc_retrain, as this is likely not enough data for retraining." - ) - return identified_psms - def _get_number_of_calibration_psms(self, psm_list): - """Get number of calibration PSMs given `calibration_set_size` and total number of PSMs.""" + psm_list_df = psm_list_df.merge( + psm_list_df_best[ + [ + "run", + "sequence", + "observed_retention_time", + "predicted_retention_time", + "rt_diff", + ] + ], + on=["run", "sequence"], + how="left", + suffixes=("", "_best"), + ) + + psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") + # Add features to PSMs + logger.debug("Adding features to PSMs...") + for psm, features in zip(psm_list, psm_list_feature_dicts): + psm.rescoring_features.update(features) + + def _get_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: + """Get calibration data (observed and predicted RTs) from run dataframe. + + Only target (non-decoy) PSMs are used for calibration. 
+ + Parameters + ---------- + run_df : pd.DataFrame + Dataframe containing PSMs for a single run, with columns: + 'retention_time', 'predicted_retention_time', 'qvalue', 'is_decoy' + + Returns + ------- + tuple[np.ndarray, np.ndarray] + Observed and predicted retention times for calibration + """ + # Filter to target PSMs only + target_df = run_df[~run_df["is_decoy"]].copy() + + # Determine number of calibration PSMs if isinstance(self.calibration_set_size, float): if not 0 < self.calibration_set_size <= 1: raise ValueError( "If `calibration_set_size` is a float, it cannot be smaller than " "or equal to 0 or larger than 1." ) - else: - num_calibration_psms = round(len(psm_list) * self.calibration_set_size) + num_calibration_psms = round(len(target_df) * self.calibration_set_size) elif isinstance(self.calibration_set_size, int): - if self.calibration_set_size > len(psm_list): + if self.calibration_set_size > len(target_df): logger.warning( - f"Requested number of calibration PSMs ({self.calibration_set_size}" - f") is larger than total number of PSMs ({len(psm_list)}). Using " - "all PSMs for calibration." + f"Requested number of calibration PSMs ({self.calibration_set_size}) " + f"is larger than total number of target PSMs ({len(target_df)}). Using " + "all target PSMs for calibration." ) - num_calibration_psms = len(psm_list) + num_calibration_psms = len(target_df) else: num_calibration_psms = self.calibration_set_size else: - raise TypeError( - "Expected float or int for `calibration_set_size`. Got " - f"{type(self.calibration_set_size)} instead. " + # Use PSMs with q-value <= 0.01 + num_calibration_psms = (target_df["qvalue"] <= 0.01).sum() + + logger.debug(f"Using {num_calibration_psms} target PSMs for calibration") + + # Select calibration PSMs (assuming they are sorted by q-value) + calibration_df = target_df.head(num_calibration_psms) + + return ( + calibration_df["retention_time"].values, + calibration_df["predicted_retention_time"].values, + ) + + @staticmethod + def _best_run_by_shared_proteoforms(runs, proteoforms): + """ + Return the run whose proteoform set has the largest total overlap with all other runs: + score(run_i) = sum_{j != i} |P_i ∩ P_j| + + Tie / degenerate handling (per request): + - If no run shares anything (all scores == 0): warn and return the first run (by first appearance). + - If multiple runs are tied for best score: return the first among them (by first appearance). 
+ """ + runs = np.asarray(runs) + proteoforms = np.asarray(proteoforms) + if runs.shape[0] != proteoforms.shape[0]: + raise ValueError("runs and proteoforms must have the same length.") + if runs.size == 0: + raise ValueError("Empty input: runs/proteoforms must contain at least one entry.") + + # Preserve run order by first appearance + run_to_idx = {} + run_order = [] + run_idx = np.empty(runs.shape[0], dtype=np.int64) + for i, r in enumerate(runs): + if r not in run_to_idx: + run_to_idx[r] = len(run_order) + run_order.append(r) + run_idx[i] = run_to_idx[r] + + # Fast path: sparse incidence matrix + try: + from scipy.sparse import coo_matrix + except ImportError: + # Fallback (slower): set-based + run_sets = {} + for r, p in zip(runs, proteoforms): + run_sets.setdefault(r, set()).add(p) + + scores = [] + for r in run_order: + Pi = run_sets[r] + s = 0 + for r2 in run_order: + if r2 is r: + continue + s += len(Pi & run_sets[r2]) + scores.append(s) + + scores = np.asarray(scores, dtype=np.int64) + max_score = scores.max() + best_candidates = np.flatnonzero(scores == max_score) + + if max_score == 0: + logger.warning( + "No runs share any identified proteoforms with other runs; transfer learning might not be as effective." + ) + return run_order[0] + + return run_order[int(best_candidates[0])] + + # Encode proteoforms (order does not matter for correctness) + _, prot_idx = np.unique(proteoforms, return_inverse=True) + + # De-duplicate (run, proteoform) pairs + pairs = np.unique(np.stack([run_idx, prot_idx], axis=1), axis=0) + r = pairs[:, 0] + p = pairs[:, 1] + + M = coo_matrix( + (np.ones(len(r), dtype=np.int8), (r, p)), + shape=(len(run_order), int(prot_idx.max()) + 1), + ).tocsr() + + # Overlap matrix O[i,j] = |P_i ∩ P_j| + overlap = (M @ M.T).toarray() + np.fill_diagonal(overlap, 0) + + scores = overlap.sum(axis=1).astype(np.int64) + max_score = int(scores.max()) + best_candidates = np.flatnonzero(scores == max_score) + + if max_score == 0: + logger.warning( + "No runs share any identified proteoforms with other runs; transfer learning might not be as effective." 
) - logger.debug(f"Using {num_calibration_psms} PSMs for calibration") - return num_calibration_psms + return run_order[0] + + return run_order[int(best_candidates[0])] From 698dd5e2a1f19551356ab18ca789e18a9325163a Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 20:17:59 +0100 Subject: [PATCH 48/54] change logging --- ms2rescore/feature_generators/deeplc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 2e1db888..b14094ef 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -23,7 +23,6 @@ from deeplc.core import predict, finetune from deeplc.calibration import SplineTransformerCalibration - from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType @@ -157,12 +156,13 @@ def add_features(self, psm_list: PSMList) -> None: ) # Predict retention times for all PSMs - logger.debug("Predicting retention times with DeepLC...") + logger.info("Predicting retention times with DeepLC...") psm_list_df["predicted_retention_time"] = predict( psm_list, model=self.model, predict_kwargs=self.predict_kwargs ) # Calibrate predictions per run + logger.info("Calibrating predicted retention times per run...") for run in psm_list_df["run"].unique(): run_df = psm_list_df[psm_list_df["run"] == run].copy() @@ -228,6 +228,7 @@ def _get_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: """ # Filter to target PSMs only target_df = run_df[~run_df["is_decoy"]].copy() + target_df = target_df.sort_values("qvalue", ascending=True) # Determine number of calibration PSMs if isinstance(self.calibration_set_size, float): From 862f9be27bcbc13d65737de9ef1f16ae6674f208 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:41:43 +0100 Subject: [PATCH 49/54] fix im2deep fg --- ms2rescore/feature_generators/im2deep.py | 162 +++++++++++++---------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 12094306..23800602 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -9,21 +9,17 @@ """ import logging -import os -from inspect import getfullargspec -from typing import List +from typing import List, Union import numpy as np -import pandas as pd +from psm_utils import PSMList +from im2deep.core import predict +from im2deep.calibration import LinearCCSCalibration, get_default_reference from im2deep.utils import im2ccs -from im2deep.im2deep import predict_ccs, REFERENCE_DATASET_PATH -from im2deep.calibrate import calculate_ccs_shift -from psm_utils import PSMList, Peptidoform from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" logger = logging.getLogger(__name__) @@ -34,6 +30,8 @@ class IM2DeepFeatureGenerator(FeatureGeneratorBase): def __init__( self, + multi: bool = False, + calibration_set_size: Union[int, float] = None, *args, processes: int = 1, **kwargs, @@ -51,14 +49,24 @@ def __init__( """ super().__init__(*args, **kwargs) - self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + self.multi = multi + if self.multi: + raise NotImplementedError( + "Multi-IM mode is not yet implemented for IM2DeepFeatureGenerator." 
+ ) + + self.calibration_set_size = calibration_set_size - # Remove any kwargs that are not IM2Deep arguments self.im2deep_kwargs = kwargs or {} - self.im2deep_kwargs = { - k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args + self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + + self.model = self.im2deep_kwargs.get("model", None) + + # Prepare IM2Deep predict kwargs + self.predict_kwargs = { + k: v for k, v in self.im2deep_kwargs.items() if k in ["device", "batch_size"] } - self.im2deep_kwargs["n_jobs"] = processes + self.predict_kwargs["num_workers"] = processes @property def feature_names(self) -> List[str]: @@ -72,11 +80,10 @@ def feature_names(self) -> List[str]: def add_features(self, psm_list: PSMList) -> None: """Add IM2Deep-derived features to PSMs""" + logger.info("Adding IM2Deep-derived features to PSMs") - # Convert ion mobility to CCS psm_list_df = psm_list.to_dataframe() - # Remove unnecessary columns to save memory #TODO: optimize further? psm_list_df = psm_list_df[ [ "peptidoform", @@ -89,8 +96,8 @@ def add_features(self, psm_list: PSMList) -> None: ] ] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.modified_sequence) psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] - psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.proforma) psm_list_df["ccs_observed_im2deep"] = im2ccs( psm_list_df["ion_mobility"], psm_list_df["precursor_mz"], @@ -98,39 +105,39 @@ def add_features(self, psm_list: PSMList) -> None: ) # Make predictions with IM2Deep - logger.debug("Predicting CCS values...") - predictions = predict_ccs(psm_list, write_output=False, **self.im2deep_kwargs) - psm_list_df["ccs_predicted"] = predictions - - # Create dataframe with high confidence hits for calibration - logger.debug("Calibrating IM2Deep...") - reference_dataset = pd.read_csv(REFERENCE_DATASET_PATH) - reference_dataset["charge"] = reference_dataset["peptidoform"].apply( - lambda x: int(x.split("/")[1]) if isinstance(x, str) else x.precursor_charge + logger.info("Predicting CCS values with IM2Deep...") + psm_list_df["predicted_CCS_uncalibrated"] = predict( + psm_list, model=self.model, predict_kwargs=self.predict_kwargs ) - logger.debug(f"Loaded reference dataset with {len(reference_dataset)} entries") - run_shift_dict = {} + # getting reference CCS values for calibration + source_dataframe = get_default_reference(multi=self.multi) + + # Create dataframe with high confidence hits for calibration + logger.info("Calibrating predicted CCS values per run...") for run in psm_list_df["run"].unique(): - cal_run_psm_df = self.make_calibration_df(psm_list_df[psm_list_df["run"] == run]) - # Rename for calculate_ccs_shift compatibility - cal_run_psm_df = cal_run_psm_df.rename( - columns={"ccs_observed_im2deep": "ccs_observed"} + run_df = psm_list_df[psm_list_df["run"] == run].copy() + + calibration_df = self._get_im_calibration_data(run_df) + + calibration = LinearCCSCalibration() + calibration.fit( + psm_df_target=calibration_df, + psm_df_source=source_dataframe, + ) + + calibrated_im = calibration.transform( + run_df[["peptidoform", "predicted_CCS_uncalibrated"]] ) - shift = calculate_ccs_shift( - cal_df=cal_run_psm_df, reference_dataset=reference_dataset, per_charge=True + + # Update predictions with calibrated values + psm_list_df.loc[psm_list_df["run"] == run, "predicted_CCS_uncalibrated"] = ( + calibrated_im ) - run_shift_dict[run] = shift - shift_df = pd.DataFrame.from_dict(run_shift_dict, 
orient="index").stack().reset_index() - shift_df.columns = ["run", "charge", "ccs_shift"] # Apply calibration shifts - psm_list_df = psm_list_df.merge(shift_df, on=["run", "charge"], how="left") - psm_list_df["ccs_shift"] = psm_list_df["ccs_shift"].fillna( - 0 - ) # Fill missing shifts with 0 (no calibration for that run/charge) #TODO check with ROBBE - psm_list_df["ccs_predicted_im2deep"] = ( - psm_list_df["ccs_predicted"] + psm_list_df["ccs_shift"] + psm_list_df.rename( + columns={"predicted_CCS_uncalibrated": "ccs_predicted_im2deep"}, inplace=True ) psm_list_df["ccs_error_im2deep"] = ( psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed_im2deep"] @@ -146,39 +153,52 @@ def add_features(self, psm_list: PSMList) -> None: for psm, features in zip(psm_list, psm_list_feature_dicts): psm.rescoring_features.update(features) - @staticmethod - def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame: - """ - Make dataframe for calibration of IM2Deep predictions. + def _get_im_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: + """Get calibration data (observed and predicted CCS values) from run dataframe. + + Only target (non-decoy) PSMs are used for calibration. Parameters ---------- - psm_list_df - DataFrame with PSMs. - threshold - Percentage of highest scoring identified target PSMs to use for calibration, - default 0.25. - + run_df : pd.DataFrame + Dataframe containing PSMs for a single run, with columns: + 'ccs_observed_im2deep', 'ccs_predicted_im2deep', 'qvalue', 'is_decoy' Returns ------- - pd.DataFrame - DataFrame with high confidence hits for calibration. - + tuple[np.ndarray, np.ndarray] + Observed and predicted CCS values for calibration """ - identified_psms = psm_list_df[ - (psm_list_df["qvalue"] < 0.01) - & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep - & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]]) - ] - calibration_psms = identified_psms[ - identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) - ] - if isinstance(calibration_psms["peptidoform"].iloc[0], Peptidoform): - calibration_psms["peptidoform"] = calibration_psms["peptidoform"].apply( - lambda x: x.proforma - ) - logger.debug( - f"Number of high confidence hits for calculating shift: {len(calibration_psms)}" + # Filter to target PSMs only + target_df = run_df[~run_df["is_decoy"]].copy() + target_df = target_df.sort_values("qvalue", ascending=True) + + # Determine number of calibration PSMs + if isinstance(self.calibration_set_size, float): + if not 0 < self.calibration_set_size <= 1: + raise ValueError( + "If `calibration_set_size` is a float, it cannot be smaller than " + "or equal to 0 or larger than 1." + ) + num_calibration_psms = round(len(target_df) * self.calibration_set_size) + elif isinstance(self.calibration_set_size, int): + if self.calibration_set_size > len(target_df): + logger.warning( + f"Requested number of calibration PSMs ({self.calibration_set_size}) " + f"is larger than total number of target PSMs ({len(target_df)}). Using " + "all target PSMs for calibration." 
+ ) + num_calibration_psms = len(target_df) + else: + num_calibration_psms = self.calibration_set_size + else: + # Use PSMs with q-value <= 0.01 + num_calibration_psms = (target_df["qvalue"] <= 0.01).sum() + + logger.debug(f"Using {num_calibration_psms} target PSMs for calibration") + + # Select calibration PSMs (assuming they are sorted by q-value) + calibration_df = target_df.head(num_calibration_psms) + + return calibration_df[["peptidoform", "ccs_observed_im2deep"]].rename( + columns={"ccs_observed_im2deep": "CCS"} ) - return calibration_psms From 889b42d93e260765f912b054f83411b2d45cacae Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:42:41 +0100 Subject: [PATCH 50/54] add support for continue runs and writing intermediate file on error to continue from --- ms2rescore/core.py | 35 +++++++++++++++++++++++++++++------ ms2rescore/parse_psms.py | 2 +- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 4384c6f7..8b3c64ef 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -63,6 +63,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.debug( f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) + # ckeck if all features are already present + for fgen_name, fgen_config in list(config["feature_generators"].items()): + # conf = config.copy() + # conf.update(fgen_config) + fgen_features = FEATURE_GENERATORS[fgen_name]().feature_names + if set(fgen_features).issubset(psm_list_feature_names): + logger.debug( + f"Skipping feature generator {fgen_name} because all features are already " + "present in the PSM file." + ) + feature_names[fgen_name] = set(fgen_features) + feature_names["psm_file"] = psm_list_feature_names - set(fgen_features) + del config["feature_generators"][fgen_name] # Add missing precursor info from spectrum file if needed required_ms_data = { @@ -93,9 +106,17 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: "files or disable the feature generator." ) continue - - # Add features - fgen.add_features(psm_list) + try: + fgen.add_features(psm_list) + except (Exception, KeyboardInterrupt) as e: + logger.error( + f"Error while adding features from {fgen_name}: {e}, writing intermediary output..." 
+ ) + # Write intermediate TSV + psm_utils.io.write_file( + psm_list, output_file_root + ".intermediate.psms.tsv", filetype="tsv" + ) + raise e logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) @@ -168,10 +189,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: protein_kwargs=protein_kwargs, **config["rescoring_engine"]["mokapot"], ) - except exceptions.RescoringError as e: + except (Exception, KeyboardInterrupt) as e: # Write output - logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...") - psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") + logger.info(f"Writing intermediary output to {output_file_root}.intermediate.psms.tsv...") + psm_utils.io.write_file( + psm_list, output_file_root + ".intermediate.psms.tsv", filetype="tsv" + ) # Reraise exception raise e diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 8a61539c..c12f2f07 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -85,7 +85,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: { "before_rescoring_score": psm.score, "before_rescoring_qvalue": psm.qvalue, - "before_rescoring_pep": psm.pep, + "before_rescoring_pep": psm.pep if psm.pep is not None else float("nan"), "before_rescoring_rank": psm.rank, } ) From ba930b86881044a393637b6c149dc6c503b501ee Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:42:55 +0100 Subject: [PATCH 51/54] changes to default features sets instead of based on charge --- ms2rescore/feature_generators/basic.py | 48 ++++++++++++++------------ 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index df005167..3e9aa374 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -31,13 +31,24 @@ def __init__(self, *args, **kwargs) -> None: """ super().__init__(*args, **kwargs) - self._feature_names = None @property def feature_names(self) -> List[str]: - if self._feature_names is None: - raise ValueError("Feature names have not been set yet. 
First run `add_features`.") - return self._feature_names + return [ + "charge_n", + "charge_1", + "charge_2", + "charge_3", + "charge_4", + "charge_5", + "charge_6", + "abs_ms1_error_ppm", + "search_engine_score", + "theoretical_mass", + "experimental_mass", + "mass_error", + "pep_len", + ] def add_features(self, psm_list: PSMList) -> None: """ @@ -51,8 +62,6 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding basic features to PSMs.") - self._feature_names = [] # Reset feature names - charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] @@ -64,24 +73,16 @@ def add_features(self, psm_list: PSMList) -> None: if has_charge: charge_n = charge_states - charge_one_hot, one_hot_names = _one_hot_encode_charge(charge_states) - self._feature_names.extend(["charge_n"] + one_hot_names) + charge_one_hot, _ = _one_hot_encode_charge(charge_states) if has_mz: # Charge also required for theoretical m/z theo_mz = np.array([psm.peptidoform.theoretical_mz for psm in psm_list]) abs_ms1_error_ppm = np.abs((precursor_mzs - theo_mz) / theo_mz * 10**6) - self._feature_names.append("abs_ms1_error_ppm") - - if has_score: - self._feature_names.append("search_engine_score") if has_mz and has_charge: experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) mass_error = experimental_mass - theoretical_mass - self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"]) - - self._feature_names.append("pep_len") for i, psm in enumerate(psm_list): psm.rescoring_features.update( @@ -101,15 +102,18 @@ def add_features(self, psm_list: PSMList) -> None: def _one_hot_encode_charge( charge_states: np.ndarray, ) -> Tuple[Iterable[Dict[str, int]], List[str]]: - """One-hot encode charge states.""" + """One-hot encode charge states with fixed range 1-6.""" n_entries = len(charge_states) - min_charge = np.min(charge_states) - max_charge = np.max(charge_states) + heading = [f"charge_{i}" for i in range(1, 7)] - mask = np.zeros((n_entries, max_charge - min_charge + 1), dtype=bool) - mask[np.arange(n_entries), charge_states - min_charge] = 1 - one_hot = mask.view("i1") + # Create mask for charges 1-6 + mask = np.zeros((n_entries, 6), dtype=bool) + + # Set the appropriate charge position to 1 for each entry + for i, charge in enumerate(charge_states): + if charge is not None and 1 <= charge <= 6: + mask[i, int(charge) - 1] = 1 - heading = [f"charge_{i}" for i in range(min_charge, max_charge + 1)] + one_hot = mask.view("i1") return [dict(zip(heading, row)) for row in one_hot], heading From a3875da596c570c0b122b92d00cd503784b4e92b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:43:00 +0100 Subject: [PATCH 52/54] minor changes --- ms2rescore/feature_generators/ms2pip.py | 17 +++++++---------- pyproject.toml | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index dcb0100e..0cc6fd52 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -24,8 +24,7 @@ """ import logging -from typing import List, Optional, Union -import numpy as np +from typing import Optional from ms2pip import process_MS2_spectra from ms2rescore_rs import batch_ms2pip_features_numpy @@ -51,7 +50,7 @@ def __init__( spectrum_path: Optional[str] = None, spectrum_id_pattern: str 
= "(.*)", model_dir: Optional[str] = None, - processes: 1, + processes: int = 1, **kwargs, ) -> None: """ @@ -187,8 +186,8 @@ def _calculate_features(self, psm_list, ms2pip_results): idx = [] pred_b = [] pred_y = [] - obs_b = [] - obs_y = [] + obs_b = [] + obs_y = [] for r in ms2pip_results: if r.observed_intensity is None or r.predicted_intensity is None: @@ -198,14 +197,12 @@ def _calculate_features(self, psm_list, ms2pip_results): pred_y.append(r.predicted_intensity["y"]) obs_b.append(r.observed_intensity["b"]) obs_y.append(r.observed_intensity["y"]) - - results = batch_ms2pip_features_numpy( - idx, pred_b, pred_y, obs_b, obs_y - ) + + results = batch_ms2pip_features_numpy(idx, pred_b, pred_y, obs_b, obs_y) for psm_index, feats in results: if feats: try: psm_list[psm_index]["rescoring_features"].update(feats) except (AttributeError, TypeError): - psm_list[psm_index]["rescoring_features"] = feats \ No newline at end of file + psm_list[psm_index]["rescoring_features"] = feats diff --git a/pyproject.toml b/pyproject.toml index f36cd56b..059fb300 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,6 @@ dependencies = [ "click>=7", "customtkinter>=5,<6", "deeplc>=3.1", - "deeplcretrainer", "im2deep>=1.1", "jinja2>=3", "lxml>=4.5", From 8e793a00e62277050ef707e64634a4659af9428e Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:53:59 +0100 Subject: [PATCH 53/54] conditional import of mumble --- ms2rescore/core.py | 3 ++- ms2rescore/feature_generators/deeplc.py | 1 + ms2rescore/parse_psms.py | 11 ++++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 8b3c64ef..e72d1b05 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -17,7 +17,6 @@ add_peptide_confidence, add_psm_confidence, ) -from ms2rescore.utils import filter_mumble_psms logger = logging.getLogger(__name__) @@ -140,6 +139,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: psm_list = psm_list[psms_with_features] if "mumble" in config["psm_generator"]: + from ms2rescore.utils import filter_mumble_psms + # Remove PSMs where matched_ions_pct drops 25% below the original hit psm_list = filter_mumble_psms(psm_list, threshold=0.75) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index b14094ef..4c2f0b49 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -272,6 +272,7 @@ def _best_run_by_shared_proteoforms(runs, proteoforms): - If no run shares anything (all scores == 0): warn and return the first run (by first appearance). - If multiple runs are tied for best score: return the first among them (by first appearance). 
""" + logger.debug("Determining best run for transfer learning based on shared proteoforms.") runs = np.asarray(runs) proteoforms = np.asarray(proteoforms) if runs.shape[0] != proteoforms.shape[0]: diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index c12f2f07..0366d6b8 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -4,7 +4,6 @@ import numpy as np import psm_utils.io -from mumble import PSMHandler from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -121,6 +120,12 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: # Addition of Modifications for mass shifts in the PSMs with Mumble if "mumble" in config["psm_generator"]: + try: + from mumble import PSMHandler + except ImportError: + raise MS2RescoreConfigurationError( + "mumble is not installed. Please install it with: pip install ms2rescore[mumble]" + ) logger.debug("Applying modifications for mass shifts using Mumble...") # set inlcude original psm to True and include decoy psm to true config["psm_generator"]["mumble"]["include_original_psm"] = True @@ -150,7 +155,7 @@ def _read_psms(config, psm_list): psm_list = [] for current_file, psm_file in enumerate(config["psm_file"]): logger.info( - f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..." + f"Reading PSMs from PSM file ({current_file + 1}/{total_files}): '{psm_file}'..." ) psm_list.extend( psm_utils.io.read_file( @@ -216,7 +221,7 @@ def _parse_values_from_spectrum_id( ["retention_time", "ion_mobility"], ): if pattern: - logger.debug(f"Parsing {label} from spectrum_id with regex pattern " f"{pattern}") + logger.debug(f"Parsing {label} from spectrum_id with regex pattern {pattern}") try: pattern = re.compile(pattern) psm_list[key] = [ From 07b96e73e288ed7cead7b18a63af141e66b24fa2 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 18:59:35 +0100 Subject: [PATCH 54/54] add tracking to spectrum file reading --- ms2rescore/parse_spectra.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index d897c40c..3b8df3e8 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -7,6 +7,7 @@ import numpy as np from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum +from rich.progress import track from psm_utils import PSMList @@ -208,7 +209,7 @@ def _add_precursor_values( if spectrum_id_pattern is None: spectrum_id_pattern = r"^(.*)$" # Match entire identifier if no pattern provided - for run_name in set(psm_list["run"]): + for run_name in track(set(psm_list["run"])): run_mask = psm_list["run"] == run_name psm_list_run = psm_list[run_mask] spectrum_file = infer_spectrum_path(spectrum_path, run_name)