From fdceebac3d306a0630ba992343b3fb2c9f9a0286 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sat, 24 Feb 2024 15:48:00 +0100 Subject: [PATCH 01/54] initial commit --- ms2rescore/feature_generators/ms2.py | 161 +++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 ms2rescore/feature_generators/ms2.py diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py new file mode 100644 index 00000000..8e144574 --- /dev/null +++ b/ms2rescore/feature_generators/ms2.py @@ -0,0 +1,161 @@ +""" +MS2-based feature generator. + +""" + +import logging +import re +from typing import List, Optional, Union +from itertools import chain +from collections import defaultdict + + +import numpy as np +from psm_utils import PSMList +from pyteomics.mass import calculate_mass + +from ms2rescore.feature_generators.base import FeatureGeneratorBase +from ms2rescore.utils import infer_spectrum_path + +logger = logging.getLogger(__name__) + + +class MS2FeatureGenerator(FeatureGeneratorBase): + """DeepLC retention time-based feature generator.""" + + def __init__( + self, + *args, + spectrum_path: Optional[str] = None, + spectrum_id_pattern: str = "(.*)", + **kwargs, + ) -> None: + """ + Generate MS2-based features for rescoring. + + DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to + disable retraining. + + Parameters + ---------- + + Attributes + ---------- + feature_names: list[str] + Names of the features that will be added to the PSMs. + + """ + super().__init__(*args, **kwargs) + self.spectrum_path = spectrum_path + self.spectrum_id_pattern = spectrum_id_pattern + + @property + def feature_names(self) -> List[str]: + return [ + "ln_explained_intensity", + "ln_total_intensity", + "ln_explained_intensity_ratio", + "ln_explained_b_ion_ratio", + "ln_explained_y_ion_ratio", + "longest_b_ion_sequence", + "longest_y_ion_sequence", + "matched_b_ions", + "matched_b_ions_pct", + "matched_y_ions", + "matched_y_ions_pct", + "matched_ions_pct", + ] + + def add_features(self, psm_list: PSMList) -> None: + """Add DeepLC-derived features to PSMs.""" + + logger.info("Adding MS2-derived features to PSMs.") + psm_dict = psm_list.get_psm_dict() + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + + for runs in psm_dict.values(): + for run, psms in runs.items(): + logger.info( + f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..." 
+ ) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + spectrum_filename = infer_spectrum_path(self.spectrum_path, run) + logger.debug(f"Using spectrum file `{spectrum_filename}`") + self._add_features_run(psm_list_run, spectrum_filename) + current_run += 1 + + def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None: + pass + + +def longest_sequence_of_trues(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + +def extract_spectrum_features(annotated_spectrum, peptidoform): + features = defaultdict(list) + b_ions_matched = [False] * (len(peptidoform.sequence)) + y_ions_matched = [False] * (len(peptidoform.sequence)) + + pseudo_count = 0.00001 + + for annotated_peak in annotated_spectrum.spectrum: + features["total_intensity"].append(annotated_peak.intensity) + + if annotated_peak.annotation: + features["matched_intensity"].append(annotated_peak.intensity) + for matched_ion in annotated_peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(annotated_peak.intensity) + y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(annotated_peak.intensity) + b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + + return { + "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), + "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity_ratio": np.log( + np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) + + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched), + "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + +# TODO: keep this here? 
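# Illustrative sketch (hypothetical values): how the longest-ion-series helper and the
# pseudo-count log-ratio features above behave on a toy annotation result.
import numpy as np

def longest_sequence_of_trues(lst):
    max_sequence = current_sequence = 0
    for value in lst:
        current_sequence = current_sequence + 1 if value else 0
        max_sequence = max(max_sequence, current_sequence)
    return max_sequence

b_ions_matched = [True, True, False, True]    # hypothetical b-ion coverage of a 4-residue peptide
matched_intensity = [120.0, 80.0, 45.0]       # intensities of annotated peaks
total_intensity = [120.0, 80.0, 45.0, 300.0]  # intensities of all peaks in the spectrum
pseudo_count = 0.00001

print(longest_sequence_of_trues(b_ions_matched))                              # 2
print(np.log(sum(matched_intensity) / sum(total_intensity) + pseudo_count))   # ln explained-intensity ratio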
+def modification_evidence(): + return + + +# TODO: move to basic feature generator +def extract_psm_features(psm): + return { + "peptide_len": len(psm.peptidoform.sequence), + "expmass": (psm.precursor_mz * psm.get_precursor_charge()) + - (psm.get_precursor_charge() * 1.007276), # TODO: replace by constant + "calcmass": calculate_mass(psm.peptidoform.composition), + } From 5374ed83aa56c075bcd2cf2da1004e049d13ee9d Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sun, 25 Feb 2024 16:44:54 +0100 Subject: [PATCH 02/54] finalize ms2 feature generation --- ms2rescore/core.py | 25 +-- ms2rescore/exceptions.py | 6 + ms2rescore/feature_generators/__init__.py | 2 + ms2rescore/feature_generators/ms2.py | 218 +++++++++++++++------- 4 files changed, 171 insertions(+), 80 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index e5ae26f1..f8ff38d7 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -54,18 +54,18 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # TODO: avoid hard coding feature generators in some way - rt_required = ("deeplc" in config["feature_generators"]) and ( - None in psm_list["retention_time"] - ) - im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( - None in psm_list["ion_mobility"] - ) - logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - if rt_required or im_required: - logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # # TODO: avoid hard coding feature generators in some way + # rt_required = ("deeplc" in config["feature_generators"]) and ( + # None in psm_list["retention_time"] + # ) + # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( + # None in psm_list["ion_mobility"] + # ) + # logger.debug(f"RT required: {rt_required}, IM required: {im_required}") + + # if rt_required or im_required: + # logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + # fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -82,6 +82,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) + exit() # Filter out psms that do not have all added features all_feature_names = {f for fgen in feature_names.values() for f in fgen} diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index f8d1fa01..9b429a30 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -23,3 +23,9 @@ class ModificationParsingError(IDFileParsingError): """Identification file parsing error.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error parsing spectrum file.""" + + pass diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 52440a5f..7a10fd4c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -8,6 +8,7 @@ from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator from ms2rescore.feature_generators.im2deep 
import IM2DeepFeatureGenerator +from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -16,4 +17,5 @@ "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, + "ms2": MS2FeatureGenerator, } diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 8e144574..013bcb44 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -12,13 +12,27 @@ import numpy as np from psm_utils import PSMList -from pyteomics.mass import calculate_mass +from pyteomics import mass, mzml, mgf +from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path +from ms2rescore.exceptions import ParseSpectrumError logger = logging.getLogger(__name__) +FRAGMENTATION_MODELS = { + "cidhcd": FragmentationModel.CidHcd, + "etcid": FragmentationModel.Etcid, + "etd": FragmentationModel.Etd, + "ethcd": FragmentationModel.Ethcd, + "all": FragmentationModel.All, +} +MASS_MODES = { + "average": MassMode.Average, + "monoisotopic": MassMode.Monoisotopic, +} + class MS2FeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -28,14 +42,13 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", + fragmentationmodel: str = "All", + massmode: str = "Monoisotopic", **kwargs, ) -> None: """ Generate MS2-based features for rescoring. - DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to - disable retraining. - Parameters ---------- @@ -48,6 +61,8 @@ def __init__( super().__init__(*args, **kwargs) self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern + self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()] + self.mass_mode = MASS_MODES[massmode.lower()] @property def feature_names(self) -> List[str]: @@ -67,7 +82,7 @@ def feature_names(self) -> List[str]: ] def add_features(self, psm_list: PSMList) -> None: - """Add DeepLC-derived features to PSMs.""" + """Add MS2-derived features to PSMs.""" logger.info("Adding MS2-derived features to PSMs.") psm_dict = psm_list.get_psm_dict() @@ -82,68 +97,128 @@ def add_features(self, psm_list: PSMList) -> None: psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_filename = infer_spectrum_path(self.spectrum_path, run) logger.debug(f"Using spectrum file `{spectrum_filename}`") - self._add_features_run(psm_list_run, spectrum_filename) - current_run += 1 - - def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None: - pass - - -def longest_sequence_of_trues(lst): - max_sequence = 0 - current_sequence = 0 - - for value in lst: - current_sequence = current_sequence + 1 if value else 0 - max_sequence = max(max_sequence, current_sequence) - - return max_sequence - -def extract_spectrum_features(annotated_spectrum, peptidoform): - features = defaultdict(list) - b_ions_matched = [False] * (len(peptidoform.sequence)) - y_ions_matched = [False] * (len(peptidoform.sequence)) - - pseudo_count = 0.00001 - - for annotated_peak in annotated_spectrum.spectrum: - features["total_intensity"].append(annotated_peak.intensity) - - if annotated_peak.annotation: - features["matched_intensity"].append(annotated_peak.intensity) - for matched_ion in annotated_peak.annotation: - if "y" in 
matched_ion.ion: - features["y_ion_matched"].append(annotated_peak.intensity) - y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True - elif "b" in matched_ion.ion: - features["b_ion_matched"].append(annotated_peak.intensity) - b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename) + self._calculate_features(psm_list_run, annotated_spectra) + current_run += 1 - return { - "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), - "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), - "ln_explained_intensity_ratio": np.log( - np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) - + pseudo_count - ), - "ln_explained_b_ion_ratio": np.log( - np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count - ), - "ln_explained_y_ion_ratio": np.log( - np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count - ), - "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched), - "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched), - "matched_b_ions": sum(b_ions_matched), - "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), - "matched_y_ions": sum(y_ions_matched), - "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), - "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) - / (len(b_ions_matched) + len(y_ions_matched)), - } + def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None: + """Calculate features from annotated spectra and add them to the PSMs.""" + for psm, spectrum in zip(psm_list, annotated_spectra): + psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform)) + + def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: + """Retrieve annotated spectra for all psms.""" + + annotated_spectra = [] + spectrum_reader = self._read_spectrum_file(spectrum_file) + + spectrum_id_pattern = re.compile( + self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" + ) + logger.info(spectrum_id_pattern) + logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys()) + + try: + mapper = { + spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id + for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys() + } + except AttributeError: + raise ParseSpectrumError( + "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." 
+ ) + + for psm in psm_list: + pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id]) + annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum) + annotated_spectra.append(annotated_spectrum) + + return annotated_spectra + + @staticmethod + def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: + + if spectrum_filepath.suffix == ".mzML": + return mzml.PreIndexedMzML(str(spectrum_filepath)) + elif spectrum_filepath.suffix == ".mgf": + return mgf.IndexedMGF(str(spectrum_filepath)) + + @staticmethod + def _longest_ion_sequence(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + def extract_spectrum_features(self, annotated_spectrum, peptidoform): + features = defaultdict(list) + b_ions_matched = [False] * (len(peptidoform.sequence)) + y_ions_matched = [False] * (len(peptidoform.sequence)) + + pseudo_count = 0.00001 + + for annotated_peak in annotated_spectrum.spectrum: + features["total_intensity"].append(annotated_peak.intensity) + + if annotated_peak.annotation: + features["matched_intensity"].append(annotated_peak.intensity) + for matched_ion in annotated_peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(annotated_peak.intensity) + y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(annotated_peak.intensity) + b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + + return { + "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), + "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity_ratio": np.log( + np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) + + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), + "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + def _annotate_spectrum(self, psm, spectrum): + + spectrum = RawSpectrum( + title=psm.spectrum_id, + num_scans=1, + rt=psm.retention_time, + precursor_charge=psm.get_precursor_charge(), + precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition), + mz_array=spectrum["m/z array"], + intensity_array=spectrum["intensity array"], + ) + + annotated_spectrum = spectrum.annotate( + peptide=LinearPeptide(psm.peptidoform.proforma), + model=self.fragmentationmodel, + mode=self.massmode, + ) + + return annotated_spectrum # TODO: keep this here? 
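# Illustrative sketch of how `spectrum_id_pattern` is meant to be used (pattern and IDs
# below are hypothetical): the pattern must contain a capturing group whose match equals
# the PSM's spectrum_id, so PSM IDs can be mapped back to native spectrum IDs for lookup.
import re

spectrum_id_pattern = re.compile(r".*scan=(\d+)")  # hypothetical pattern
native_ids = ["controllerType=0 controllerNumber=1 scan=1001"]

mapper = {spectrum_id_pattern.search(native_id).group(1): native_id for native_id in native_ids}
print(mapper["1001"])  # full native ID, ready to pass to spectrum_reader.get_by_id()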
@@ -153,9 +228,16 @@ def modification_evidence(): # TODO: move to basic feature generator def extract_psm_features(psm): + + expmass = (psm.precursor_mz * psm.get_precursor_charge()) - ( + psm.get_precursor_charge() * 1.007276 + ) # TODO: replace by constant + calcmass = mass.calculate_mass(psm.peptidoform.composition) + delta_mass = expmass - calcmass + return { "peptide_len": len(psm.peptidoform.sequence), - "expmass": (psm.precursor_mz * psm.get_precursor_charge()) - - (psm.get_precursor_charge() * 1.007276), # TODO: replace by constant - "calcmass": calculate_mass(psm.peptidoform.composition), + "expmass": expmass, + "calcmass": calcmass, + "delta_mass": delta_mass, } From 60207a390f08cba34730889543a356fd1cb9b06b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sun, 25 Feb 2024 16:45:09 +0100 Subject: [PATCH 03/54] add rustyms --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index be5295ba..beb6b440 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dependencies = [ [project.optional-dependencies] ionmob = ["ionmob>=0.2", "tensorflow"] +ms2 = ["rustyms"] dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"] docs = [ "sphinx", From ae39844a7b46b9e4622c5f66458aae5146e3776b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 11:04:37 +0100 Subject: [PATCH 04/54] remove exit statement fixed IM required value --- ms2rescore/core.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index f8ff38d7..6e0e547c 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -54,18 +54,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # # TODO: avoid hard coding feature generators in some way - # rt_required = ("deeplc" in config["feature_generators"]) and ( - # None in psm_list["retention_time"] - # ) - # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( - # None in psm_list["ion_mobility"] - # ) - # logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - # if rt_required or im_required: - # logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - # fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # TODO: avoid hard coding feature generators in some way + rt_required = ("deeplc" in config["feature_generators"]) and ( + None in psm_list["retention_time"] + ) + im_required = ( + "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] + ) and (None in psm_list["ion_mobility"]) + + logger.info(f"RT required: {rt_required}, IM required: {im_required}") + + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -82,7 +83,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) - exit() # Filter out psms that do not have all added features all_feature_names = {f for fgen in feature_names.values() for f in fgen} From 
9b98c4df3b13d0ab935172a5cfeca8bdbb6159f4 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:05:23 +0100 Subject: [PATCH 05/54] change logger.info to debug --- ms2rescore/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 6e0e547c..032b1463 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -11,10 +11,12 @@ from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator +from ms2rescore.utils import profile logger = logging.getLogger(__name__) +@profile def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MS²Rescore workflow with passed configuration. @@ -62,7 +64,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] ) and (None in psm_list["ion_mobility"]) - logger.info(f"RT required: {rt_required}, IM required: {im_required}") + logger.debug(f"RT required: {rt_required}, IM required: {im_required}") if rt_required or im_required: logger.info("Parsing missing retention time and/or ion mobility values from spectra...") From 5e45756726582fa54e63f2d572c400332bcaa683 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:06:34 +0100 Subject: [PATCH 06/54] added profile decorator to get timings for functions --- ms2rescore/utils.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 70417b56..982664b5 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,6 +4,9 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import cProfile +import pstats +import io from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -76,3 +79,22 @@ def infer_spectrum_path( ) return Path(resolved_path) + + +def profile(fnc): + """A decorator that uses cProfile to profile a function""" + + def inner(*args, **kwargs): + + pr = cProfile.Profile() + pr.enable() + retval = fnc(*args, **kwargs) + pr.disable() + s = io.StringIO() + sortby = "time" + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + logger.info(s.getvalue()) + return retval + + return inner From 304777cfa171cf5b7abf5cd14640e971f6510167 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:13 +0100 Subject: [PATCH 07/54] removed profile as standard rescore debug statement --- ms2rescore/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 032b1463..d0a52a4e 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -11,12 +11,10 @@ from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.utils import profile logger = logging.getLogger(__name__) -@profile def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MS²Rescore workflow with passed configuration. 
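# Minimal usage sketch for the cProfile-based `profile` decorator added to
# ms2rescore/utils.py in the patch above; the decorated function and its workload are
# hypothetical. Each call logs a pstats report sorted by "time", provided logging is
# configured at INFO level.
from ms2rescore.utils import profile

@profile
def slow_step(n: int) -> int:
    return sum(i * i for i in range(n))

slow_step(1_000_000)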
From 95ee4755325cd85990473cf15313c92e8c2f567a Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:21 +0100 Subject: [PATCH 08/54] added new basic features --- ms2rescore/feature_generators/basic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index bc2222ee..df005167 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None: charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] + peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list]) has_charge = None not in charge_states has_mz = None not in precursor_mzs and has_charge @@ -74,6 +75,14 @@ def add_features(self, psm_list: PSMList) -> None: if has_score: self._feature_names.append("search_engine_score") + if has_mz and has_charge: + experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) + theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) + mass_error = experimental_mass - theoretical_mass + self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"]) + + self._feature_names.append("pep_len") + for i, psm in enumerate(psm_list): psm.rescoring_features.update( dict( @@ -81,6 +90,10 @@ def add_features(self, psm_list: PSMList) -> None: **charge_one_hot[i] if has_charge else {}, **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {}, **{"search_engine_score": scores[i]} if has_score else {}, + **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {}, + **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {}, + **{"mass_error": mass_error[i]} if has_mz and has_charge else {}, + **{"pep_len": peptide_lengths[i]}, ) ) From 73f4573331a82ea3984457333d3c701736f4d86d Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:47 +0100 Subject: [PATCH 09/54] fixes for ms2 feature generator, removed multiprocessing --- ms2rescore/feature_generators/ms2.py | 149 +++++++++++++++------------ 1 file changed, 81 insertions(+), 68 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 013bcb44..997accb2 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,6 +4,7 @@ """ import logging +import multiprocessing import re from typing import List, Optional, Union from itertools import chain @@ -14,6 +15,7 @@ from psm_utils import PSMList from pyteomics import mass, mzml, mgf from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode +from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path @@ -42,8 +44,9 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", - fragmentationmodel: str = "All", - massmode: str = "Monoisotopic", + fragmentation_model: str = "All", + mass_mode: str = "Monoisotopic", + processes: int = 1, **kwargs, ) -> None: """ @@ -61,8 +64,9 @@ def __init__( super().__init__(*args, **kwargs) self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern - self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()] - self.mass_mode = MASS_MODES[massmode.lower()] + self.fragmentation_model = 
FRAGMENTATION_MODELS[fragmentation_model.lower()] + self.mass_mode = MASS_MODES[mass_mode.lower()] + self.processes = processes @property def feature_names(self) -> List[str]: @@ -98,26 +102,17 @@ def add_features(self, psm_list: PSMList) -> None: spectrum_filename = infer_spectrum_path(self.spectrum_path, run) logger.debug(f"Using spectrum file `{spectrum_filename}`") - annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename) - self._calculate_features(psm_list_run, annotated_spectra) + self._calculate_features(psm_list_run, spectrum_filename) current_run += 1 - def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None: - """Calculate features from annotated spectra and add them to the PSMs.""" - for psm, spectrum in zip(psm_list, annotated_spectra): - psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform)) - - def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: + def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: """Retrieve annotated spectra for all psms.""" - annotated_spectra = [] spectrum_reader = self._read_spectrum_file(spectrum_file) spectrum_id_pattern = re.compile( self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" ) - logger.info(spectrum_id_pattern) - logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys()) try: mapper = { @@ -128,20 +123,43 @@ def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: raise ParseSpectrumError( "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." ) - - for psm in psm_list: - pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id]) - annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum) - annotated_spectra.append(annotated_spectrum) - - return annotated_spectra + annotated_spectra = [ + self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id])) + for psm in psm_list + ] + for psm, annotated_spectrum in zip(psm_list, annotated_spectra): + psm.rescoring_features.update( + self._calculate_spectrum_features(psm, annotated_spectrum) + ) + # with multiprocessing.Pool(self.processes) as pool: + # counts_failed = 0 + # for psm, features in zip( + # psm_list, + # track( + # pool.imap( + # self._calculate_spectrum_features_wrapper, + # zip(psm_list, annotated_spectra), + # chunksize=1000, + # ), + # total=len(psm_list), + # description="Calculating MS2 features...", + # transient=True, + # ), + # ): + # if features: + # psm.rescoring_features.update(features) + + # else: + # counts_failed += 1 + # if counts_failed > 0: + # logger.warning(f"Failed to calculate features for {counts_failed} PSMs") @staticmethod def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: - if spectrum_filepath.suffix == ".mzML": + if spectrum_filepath.suffix.lower() == ".mzml": return mzml.PreIndexedMzML(str(spectrum_filepath)) - elif spectrum_filepath.suffix == ".mgf": + elif spectrum_filepath.suffix.lower() == ".mgf": return mgf.IndexedMGF(str(spectrum_filepath)) @staticmethod @@ -155,40 +173,48 @@ def _longest_ion_sequence(lst): return max_sequence - def extract_spectrum_features(self, annotated_spectrum, peptidoform): + def _calculate_spectrum_features(self, psm, annotated_spectrum): + features = defaultdict(list) - b_ions_matched = [False] * (len(peptidoform.sequence)) - y_ions_matched = [False] * (len(peptidoform.sequence)) + b_ions_matched = [False] * 
(len(psm.peptidoform.sequence)) + y_ions_matched = [False] * (len(psm.peptidoform.sequence)) pseudo_count = 0.00001 + ion_fragment_regex = re.compile(r"\d+") - for annotated_peak in annotated_spectrum.spectrum: - features["total_intensity"].append(annotated_peak.intensity) + for peak in annotated_spectrum: + features["total_intensity"].append(peak.intensity) - if annotated_peak.annotation: - features["matched_intensity"].append(annotated_peak.intensity) - for matched_ion in annotated_peak.annotation: + if peak.annotation: + features["matched_intensity"].append(peak.intensity) + for matched_ion in peak.annotation: if "y" in matched_ion.ion: - features["y_ion_matched"].append(annotated_peak.intensity) - y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + features["y_ion_matched"].append(peak.intensity) + y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) elif "b" in matched_ion.ion: - features["b_ion_matched"].append(annotated_peak.intensity) - b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + features["b_ion_matched"].append(peak.intensity) + b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) + + total_intensity_sum = np.sum(features["total_intensity"]) + matched_intensity_sum = np.sum(features["matched_intensity"]) + b_ion_matched_sum = np.sum(features["b_ion_matched"]) + y_ion_matched_sum = np.sum(features["y_ion_matched"]) return { - "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), - "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), + "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), "ln_explained_intensity_ratio": np.log( - np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) - + pseudo_count + matched_intensity_sum / total_intensity_sum + pseudo_count ), "ln_explained_b_ion_ratio": np.log( - np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count + b_ion_matched_sum / matched_intensity_sum + pseudo_count ), "ln_explained_y_ion_ratio": np.log( - np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count + y_ion_matched_sum / matched_intensity_sum + pseudo_count ), "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), @@ -200,44 +226,31 @@ def extract_spectrum_features(self, annotated_spectrum, peptidoform): / (len(b_ions_matched) + len(y_ions_matched)), } - def _annotate_spectrum(self, psm, spectrum): + def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple): + psm, spectrum = psm_spectrum_tuple + return self._calculate_spectrum_features(psm, spectrum) + + def _annotate_spectrum(self, psm, pyteomics_spectrum): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition), - mz_array=spectrum["m/z array"], - intensity_array=spectrum["intensity array"], + precursor_mass=mass.calculate_mass(psm.peptidoform.composition), + mz_array=pyteomics_spectrum["m/z array"], + intensity_array=pyteomics_spectrum["intensity array"], ) annotated_spectrum = spectrum.annotate( peptide=LinearPeptide(psm.peptidoform.proforma), - model=self.fragmentationmodel, - mode=self.massmode, + model=self.fragmentation_model, + 
mode=self.mass_mode, ) - return annotated_spectrum + return annotated_spectrum.spectrum # TODO: keep this here? def modification_evidence(): return - - -# TODO: move to basic feature generator -def extract_psm_features(psm): - - expmass = (psm.precursor_mz * psm.get_precursor_charge()) - ( - psm.get_precursor_charge() * 1.007276 - ) # TODO: replace by constant - calcmass = mass.calculate_mass(psm.peptidoform.composition) - delta_mass = expmass - calcmass - - return { - "peptide_len": len(psm.peptidoform.sequence), - "expmass": expmass, - "calcmass": calcmass, - "delta_mass": delta_mass, - } From 947233ee0039757676ea5b62daaacf2e1059089f Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 28 Feb 2024 11:12:32 +0100 Subject: [PATCH 10/54] return empty list on parsing error with rustyms, removed multiprocessing --- ms2rescore/feature_generators/ms2.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 997accb2..b292d51a 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,7 +4,6 @@ """ import logging -import multiprocessing import re from typing import List, Optional, Union from itertools import chain @@ -15,7 +14,6 @@ from psm_utils import PSMList from pyteomics import mass, mzml, mgf from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode -from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path @@ -62,6 +60,7 @@ def __init__( """ super().__init__(*args, **kwargs) + self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] @@ -100,7 +99,6 @@ def add_features(self, psm_list: PSMList) -> None: ) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") self._calculate_features(psm_list_run, spectrum_filename) current_run += 1 @@ -175,6 +173,9 @@ def _longest_ion_sequence(lst): def _calculate_spectrum_features(self, psm, annotated_spectrum): + if not annotated_spectrum: + return {} + features = defaultdict(list) b_ions_matched = [False] * (len(psm.peptidoform.sequence)) y_ions_matched = [False] * (len(psm.peptidoform.sequence)) @@ -241,12 +242,14 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): mz_array=pyteomics_spectrum["m/z array"], intensity_array=pyteomics_spectrum["intensity array"], ) - - annotated_spectrum = spectrum.annotate( - peptide=LinearPeptide(psm.peptidoform.proforma), - model=self.fragmentation_model, - mode=self.mass_mode, - ) + try: + annotated_spectrum = spectrum.annotate( + peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]), + model=self.fragmentation_model, + mode=self.mass_mode, + ) + except: # noqa E722 + return [] return annotated_spectrum.spectrum From 24ce565542fb42cea534b81b53093435dba2a36c Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 15 Mar 2024 09:47:02 +0100 Subject: [PATCH 11/54] add deeplc_calibration psm set --- ms2rescore/feature_generators/deeplc.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 30bea716..207f8ba1 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ 
b/ms2rescore/feature_generators/deeplc.py @@ -25,6 +25,7 @@ import numpy as np from psm_utils import PSMList +from psm_utils.io import read_file from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -40,6 +41,7 @@ def __init__( *args, lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, + calibration_set: Union[str, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -71,6 +73,7 @@ def __init__( self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size + self.calibration_set = calibration_set self.processes = processes self.deeplc_kwargs = kwargs or {} @@ -120,7 +123,9 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - + if self.calibration_set: + logger.info("Loading calibration set...") + self.calibration_set = read_file(self.calibration_set, filetype="tsv") for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -138,13 +143,19 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - psm_list_calibration = self._get_calibration_psms(psm_list_run) + if self.calibration_set: + psm_list_calibration = self.calibration_set[ + self.calibration_set["run"] == run + ] + else: + psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, From 33c38b04e304e6b865714adb26b813c647911628 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 17 Apr 2024 17:10:38 +0200 Subject: [PATCH 12/54] remove unused import --- ms2rescore/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 28249c51..3515893a 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,9 +4,6 @@ from glob import glob from pathlib import Path from typing import Optional, Union -import cProfile -import pstats -import io from ms2rescore.exceptions import MS2RescoreConfigurationError From 11fdc51ad70c225393553f5964f7485a60b7ae19 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 21 Aug 2024 13:25:10 +0200 Subject: [PATCH 13/54] integrate mumble into ms2branch --- ms2rescore/exceptions.py | 6 +++ ms2rescore/package_data/config_default.json | 1 + ms2rescore/package_data/config_schema.json | 42 +++++++++++++++++++++ ms2rescore/parse_psms.py | 14 +++++++ pyproject.toml | 1 + 5 files changed, 64 insertions(+) diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index ba5492a2..68b1b43d 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError): """Error while rescoring PSMs.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 733ea707..126fde8b 100644 --- 
a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -12,6 +12,7 @@ }, "maxquant": {} }, + "psm_generator": {}, "rescoring_engine": { "mokapot": { "train_fdr": 0.01, diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index ab77f3b7..40471714 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -62,6 +62,18 @@ "mokapot": {} } }, + "psm_generator": { + "description": "PSM generator and their configuration.", + "type": "object", + "minProperties": 0, + "maxProperties": 1, + "patternProperties": { + ".*": { "$ref": "#/definitions/psm_generator" }, + "mumble": { + "$ref": "#/definitions/mumble" + } + } + }, "config_file": { "description": "Path to configuration file", "oneOf": [{ "type": "string" }, { "type": "null" }] @@ -179,6 +191,7 @@ "type": "boolean", "default": false } + } } }, @@ -193,6 +206,11 @@ "type": "object", "additionalProperties": true }, + "psm_generator": { + "description": "Additional PSM feature generator configuration", + "type": "object", + "additionalProperties": true + }, "basic": { "$ref": "#/definitions/feature_generator", "description": "Basic feature generator configuration", @@ -273,6 +291,30 @@ } } }, + "mumble": { + "$ref": "#/definitions/psm_generator", + "description": "Mumble PSM generator configuration using Mubmle", + "type": "object", + "additionalProperties": true, + "properties": { + "aa_combinations": { + "description": "Additional amino acid combinations to consider as mass shift", + "type": "integer", + "default": 0 + }, + "fasta_file": { + "description": "Maximum number of modifications per peptide", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": false + }, + "mass_error": { + "description": "mass error in Da", + "type": "number", + "default": 0.2 + } + } + }, + "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 6a3c6065..ae28b519 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -5,6 +5,7 @@ import numpy as np import psm_utils.io from psm_utils import PSMList +from mumble import PSMHandler from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -90,6 +91,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids + # Addition of Modifications for mass shifts in the PSMs with Mumble + if "mumble" in config["psm_generator"]: + logger.debug("Applying modifications for mass shifts using Mumble...") + mumble_config = config["psm_generator"]["mumble"] + psm_handler = PSMHandler( + **mumble_config, # TODO how do we store config for mumble? 
+ ) + psm_list = psm_handler.add_modified_psms( + psm_list, generate_modified_decoys=True, keep_original=True + ) + if mumble_config["output_file"]: + psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"]) + return psm_list diff --git a/pyproject.toml b/pyproject.toml index f57ad800..7c1b0a9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", + "mumble" ] [project.optional-dependencies] From 883169a90cf7c57970dac1085ba54defbf1ffdb9 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 27 Sep 2024 09:58:27 +0200 Subject: [PATCH 14/54] temp removal of sage features before rescoring --- ms2rescore/parse_psms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index ae28b519..91c94168 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -95,6 +95,15 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if "mumble" in config["psm_generator"]: logger.debug("Applying modifications for mass shifts using Mumble...") mumble_config = config["psm_generator"]["mumble"] + for psm in psm_list: + psm["rescoring_features"].pop("expmass", None) + psm["rescoring_features"].pop("calcmass", None) + psm["rescoring_features"].pop("peptide_len", None) + psm["rescoring_features"].pop("matched_peaks", None) + psm["rescoring_features"].pop("longest_b", None) + psm["rescoring_features"].pop("longest_y", None) + psm["rescoring_features"].pop("longest_y_pct", None) + psm["rescoring_features"].pop("matched_intensity_pct", None) psm_handler = PSMHandler( **mumble_config, # TODO how do we store config for mumble? ) From da39ae81d052b4391ada17fd24bb5a856078e8a8 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 8 Nov 2024 15:41:26 +0100 Subject: [PATCH 15/54] remove psm_file features when rescoring with mumble --- ms2rescore/parse_psms.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 91c94168..309c1c96 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -95,15 +95,14 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if "mumble" in config["psm_generator"]: logger.debug("Applying modifications for mass shifts using Mumble...") mumble_config = config["psm_generator"]["mumble"] - for psm in psm_list: - psm["rescoring_features"].pop("expmass", None) - psm["rescoring_features"].pop("calcmass", None) - psm["rescoring_features"].pop("peptide_len", None) - psm["rescoring_features"].pop("matched_peaks", None) - psm["rescoring_features"].pop("longest_b", None) - psm["rescoring_features"].pop("longest_y", None) - psm["rescoring_features"].pop("longest_y_pct", None) - psm["rescoring_features"].pop("matched_intensity_pct", None) + + # Check if psm_list[0].rescoring_features is empty or not + if psm_list[0].rescoring_features: + logger.debug("Removing psm_file rescoring features from PSMs...") + # psm_list.remove_rescoring_features() # TODO add this to psm_utils + for psm in psm_list: + psm.rescoring_features = {} + psm_handler = PSMHandler( **mumble_config, # TODO how do we store config for mumble? 
) From 37fff28ebc03fcf66f98be5a5c51ca7e3b68e97b Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 19 Nov 2024 16:10:43 +0100 Subject: [PATCH 16/54] linting --- ms2rescore/core.py | 5 ++++- ms2rescore/feature_generators/__init__.py | 4 ++-- ms2rescore/feature_generators/deeplc.py | 8 +++++--- ms2rescore/feature_generators/ionmob.py | 11 +++++++++-- ms2rescore/feature_generators/ms2pip.py | 5 ++++- ms2rescore/gui/__main__.py | 2 +- ms2rescore/parse_psms.py | 2 +- ms2rescore/rescoring_engines/percolator.py | 2 +- 8 files changed, 27 insertions(+), 12 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index a02ab4f6..48e125bc 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -14,7 +14,10 @@ from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence +from ms2rescore.rescoring_engines.mokapot import ( + add_peptide_confidence, + add_psm_confidence, +) logger = logging.getLogger(__name__) diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 7a10fd4c..9eff1b5c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -4,11 +4,11 @@ from ms2rescore.feature_generators.basic import BasicFeatureGenerator from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator +from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator -from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator -from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator +from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 206d6a21..e3113544 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -143,9 +143,11 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w", encoding="utf-8") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) if self.calibration_set: diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 7fa0c0a1..e9b51026 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -23,12 +23,19 @@ import tensorflow as tf from psm_utils import Peptidoform, PSMList -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import ( + FeatureGeneratorBase, + FeatureGeneratorException, +) try: from ionmob import __file__ as ionmob_file from ionmob.preprocess.data import to_tf_dataset_inference - from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, 
reduced_mobility_to_ccs + from ionmob.utilities.chemistry import ( + VARIANT_DICT, + calculate_mz, + reduced_mobility_to_ccs, + ) from ionmob.utilities.tokenization import tokenizer_from_json from ionmob.utilities.utility import get_ccs_shift except ImportError: diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 3882b3fc..34d427e2 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -37,7 +37,10 @@ from psm_utils import PSMList from rich.progress import track -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import ( + FeatureGeneratorBase, + FeatureGeneratorException, +) from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index 429e117f..57cee3c6 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,8 +1,8 @@ """Entrypoint for MS²Rescore GUI.""" +import contextlib import multiprocessing import os -import contextlib from ms2rescore.gui.app import app diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 309c1c96..cc61926f 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -4,8 +4,8 @@ import numpy as np import psm_utils.io -from psm_utils import PSMList from mumble import PSMHandler +from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index 192950bd..32d856c4 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -19,8 +19,8 @@ import logging import subprocess -from typing import Any, Dict, Optional from copy import deepcopy +from typing import Any, Dict, Optional import psm_utils From e8b59f3ca2c6f1fbaae4ec5cc6b531f25284fad7 Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 19 Nov 2024 16:10:56 +0100 Subject: [PATCH 17/54] add hyperscore calculation --- ms2rescore/feature_generators/ms2.py | 183 ++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 7 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index b292d51a..490f4f47 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -5,19 +5,19 @@ import logging import re -from typing import List, Optional, Union -from itertools import chain from collections import defaultdict - +from itertools import chain +from typing import List, Optional, Union import numpy as np -from psm_utils import PSMList -from pyteomics import mass, mzml, mgf -from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode +import pyopenms as oms +from psm_utils import PSM, Peptidoform, PSMList +from pyteomics import mass, mgf, mzml +from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path -from ms2rescore.exceptions import ParseSpectrumError logger = logging.getLogger(__name__) @@ -45,6 +45,7 @@ def __init__( fragmentation_model: str = "All", mass_mode: str = "Monoisotopic", processes: int = 1, + calculate_hyperscore: bool = True, # Allow optional ? 
**kwargs, ) -> None: """ @@ -66,6 +67,7 @@ def __init__( self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] self.mass_mode = MASS_MODES[mass_mode.lower()] self.processes = processes + self.calculate_hyperscore = calculate_hyperscore @property def feature_names(self) -> List[str]: @@ -82,6 +84,7 @@ def feature_names(self) -> List[str]: "matched_y_ions", "matched_y_ions_pct", "matched_ions_pct", + "hyperscore", ] def add_features(self, psm_list: PSMList) -> None: @@ -129,6 +132,22 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) + + if self.calculate_hyperscore: + # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) + mz_list, intensity_list = _annotated_spectrum_to_mzint( + annotated_spectrum=annotated_spectrum + ) + psm.rescoring_features.update( + { + "hyperscore": calculate_hyperscore( + psm=psm, observed_mz=mz_list, observed_intensity=intensity_list + ) + } + ) + + else: + psm.rescoring_features.update({"hyperscore": np.nan}) # with multiprocessing.Pool(self.processes) as pool: # counts_failed = 0 # for psm, features in zip( @@ -253,7 +272,157 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum + def _calculate_hyperscore(self, psm, spectrum): + pass + + def _calculate_fragmentation_features(self, psm, annotated_spectrum): + pass + # TODO: keep this here? def modification_evidence(): return + + +def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): + + mz_list = [] + intensity_list = [] + + for peak in annotated_spectrum: + + annotations = peak.annotation + for fragment in annotations: + ion = fragment.ion + if ion[0] not in ion_types: + continue + + mz_list.append(peak.experimental_mz) + intensity_list.append(peak.intensity) + break + + return mz_list, intensity_list + + +def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: + """ + Parse a peptidoform object to pyOpenMS compatible format. + + Parameter + --------- + Peptidoform: Peptidoform + Peptide string in Peptidoform format + + Returns + ------- + AASequence (pyOpenMS): + A peptide sequence in pyOpenMS format + int: + charge of the peptide + """ + + n_term = peptidoform.properties["n_term"] + peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + + for aa, mods in peptidoform.parsed_sequence: + peptide_oms_str += aa + if isinstance(mods, list): + peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + + c_term = peptidoform.properties["c_term"] + peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + + peptide_oms = oms.AASequence.fromString(peptide_oms_str) + + return peptide_oms + + +def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: + """ + Create a theoretical spectrum from a peptide sequence. + + Parameter + --------- + peptide: str + Peptide sequence in proforma format + engine: str + The engine to use to create theoretical spectrum. 
+ Can only be 'pyopenms' or 'spectrum-utils' (default) + + Return + ------ + MSSpectrum + Spectrum object in pyOpenMS format + """ + # Reformat peptide sequence in pyOpenMS format + peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) + + # Initialize the required objects to create the spectrum + spectrum = oms.MSSpectrum() + tsg = oms.TheoreticalSpectrumGenerator() + p = oms.Param() + + p.setValue("add_b_ions", "true") + p.setValue("add_metainfo", "true") + tsg.setParameters(param=p) + + # Create the theoretical spectrum + tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2) + return spectrum + + +def calculate_hyperscore( + psm: PSM, + observed_mz: List[float], + observed_intensity: List[float], + fragment_tol_mass=20, + fragment_tol_mode="ppm", +): + """ + Calculate the hyperscore as defined in the X!Tandem search engine. + + It is a metric of how good two spectra match with each other (matching peaks). + + Parameters + ---------- + psm: psm_utils.PSM + The PSM used to extract 'spectrum_id' (for MGF spectrum extraction) + and 'Peptidoform' (the peptide sequence) + observed_mz: List[float] + List of observed mz values with matching order as observed intensity + observed_intensity: List[float] + List of observed intensity values + fragment_tol_mass: int + The allowed tolerance to match peaks + fragment_tol_mode: str + 'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton. + Return + ------ + int + The hyperscore + """ + if fragment_tol_mode == "ppm": + fragment_mass_tolerance_unit_ppm = True + elif fragment_tol_mode == "Da": + fragment_mass_tolerance_unit_ppm = False + else: + raise Exception( + "fragment_tol_mode can only take 'Da' or 'ppm'. {} was provided.".format( + fragment_tol_mode + ) + ) + if len(observed_intensity) == 0: + logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") + return 0.0 + + theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) + observed_spectrum_oms = oms.MSSpectrum() + observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) + hyperscore = oms.HyperScore() + result = hyperscore.compute( + fragment_mass_tolerance=fragment_tol_mass, + fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm, + exp_spectrum=observed_spectrum_oms, + theo_spectrum=theoretical_spectrum, + ) + return result From c51cd346f438b2afc97c2b171ea73bcdc60a64eb Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Thu, 21 Nov 2024 17:21:04 +0100 Subject: [PATCH 18/54] calibration fixes --- ms2rescore/feature_generators/deeplc.py | 15 +++++---------- ms2rescore/feature_generators/im2deep.py | 3 ++- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index e3113544..29a249eb 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -123,9 +123,6 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - if self.calibration_set: - logger.info("Loading calibration set...") - self.calibration_set = read_file(self.calibration_set, filetype="tsv") for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -150,12 +147,7 @@ def add_features(self, psm_list: PSMList) -> None: ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = 
PSMList(psm_list=list(chain.from_iterable(psms.values()))) - if self.calibration_set: - psm_list_calibration = self.calibration_set[ - self.calibration_set["run"] == run - ] - else: - psm_list_calibration = self._get_calibration_psms(psm_list_run) + psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, @@ -204,7 +196,10 @@ def add_features(self, psm_list: PSMList) -> None: def _get_calibration_psms(self, psm_list: PSMList): """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[~psm_list["is_decoy"]] + psm_list_targets = psm_list[ + ~psm_list["is_decoy"] + & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]] + ] if self.calibration_set_size: n_psms = self._get_number_of_calibration_psms(psm_list_targets) indices = np.argsort(psm_list_targets["score"]) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 772f6855..2afa4fcd 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -158,7 +158,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p identified_psms = psm_list_df[ (psm_list_df["qvalue"] < 0.01) & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep + & (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep + & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]]) ] calibration_psms = identified_psms[ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) From 295e37f20d2e344f46bf5fc9d911f6955210f184 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Thu, 21 Nov 2024 17:21:16 +0100 Subject: [PATCH 19/54] changes for mumble implementation --- ms2rescore/feature_generators/ms2.py | 38 ---------------------------- ms2rescore/parse_psms.py | 8 ++---- pyproject.toml | 1 - 3 files changed, 2 insertions(+), 45 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 490f4f47..a3bd6d77 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -23,7 +23,6 @@ FRAGMENTATION_MODELS = { "cidhcd": FragmentationModel.CidHcd, - "etcid": FragmentationModel.Etcid, "etd": FragmentationModel.Etd, "ethcd": FragmentationModel.Ethcd, "all": FragmentationModel.All, @@ -148,28 +147,6 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: else: psm.rescoring_features.update({"hyperscore": np.nan}) - # with multiprocessing.Pool(self.processes) as pool: - # counts_failed = 0 - # for psm, features in zip( - # psm_list, - # track( - # pool.imap( - # self._calculate_spectrum_features_wrapper, - # zip(psm_list, annotated_spectra), - # chunksize=1000, - # ), - # total=len(psm_list), - # description="Calculating MS2 features...", - # transient=True, - # ), - # ): - # if features: - # psm.rescoring_features.update(features) - - # else: - # counts_failed += 1 - # if counts_failed > 0: - # logger.warning(f"Failed to calculate features for {counts_failed} PSMs") @staticmethod def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: @@ -246,10 +223,6 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): / (len(b_ions_matched) + len(y_ions_matched)), } - def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple): - 
psm, spectrum = psm_spectrum_tuple - return self._calculate_spectrum_features(psm, spectrum) - def _annotate_spectrum(self, psm, pyteomics_spectrum): spectrum = RawSpectrum( @@ -272,17 +245,6 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - def _calculate_hyperscore(self, psm, spectrum): - pass - - def _calculate_fragmentation_features(self, psm, annotated_spectrum): - pass - - -# TODO: keep this here? -def modification_evidence(): - return - def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index cc61926f..2f0ec3b0 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -104,13 +104,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: psm.rescoring_features = {} psm_handler = PSMHandler( - **mumble_config, # TODO how do we store config for mumble? + **mumble_config, ) - psm_list = psm_handler.add_modified_psms( - psm_list, generate_modified_decoys=True, keep_original=True - ) - if mumble_config["output_file"]: - psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"]) + psm_list = psm_handler.add_modified_psms(psm_list) return psm_list diff --git a/pyproject.toml b/pyproject.toml index 4285f1fa..985cc27e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", - "mumble" ] [project.optional-dependencies] From 909860d9c6f81490f9af3f6de364b606ddd7fe9c Mon Sep 17 00:00:00 2001 From: SamvPy Date: Fri, 22 Nov 2024 11:51:23 +0100 Subject: [PATCH 20/54] change openms peptide formatting --- ms2rescore/feature_generators/ms2.py | 121 ++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 23 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index a3bd6d77..eb049ed8 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -12,8 +12,11 @@ import numpy as np import pyopenms as oms from psm_utils import PSM, Peptidoform, PSMList +from psm_utils.peptidoform import PeptidoformException +from pyteomics.proforma import ProFormaError from pyteomics import mass, mgf, mzml from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from pathlib import Path from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -68,6 +71,7 @@ def __init__( self.processes = processes self.calculate_hyperscore = calculate_hyperscore + @property def feature_names(self) -> List[str]: return [ @@ -137,19 +141,20 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: mz_list, intensity_list = _annotated_spectrum_to_mzint( annotated_spectrum=annotated_spectrum ) + hyperscore = calculate_hyperscore( + psm=psm, observed_mz=mz_list, observed_intensity=intensity_list + ) + if hyperscore is None: + continue psm.rescoring_features.update( { - "hyperscore": calculate_hyperscore( - psm=psm, observed_mz=mz_list, observed_intensity=intensity_list - ) + "hyperscore": hyperscore } ) - else: - psm.rescoring_features.update({"hyperscore": np.nan}) @staticmethod - def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: + def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: if spectrum_filepath.suffix.lower() == ".mzml": return mzml.PreIndexedMzML(str(spectrum_filepath)) @@ 
-235,8 +240,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): intensity_array=pyteomics_spectrum["intensity array"], ) try: + linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( - peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]), + peptide=linear_peptide, model=self.fragmentation_model, mode=self.mass_mode, ) @@ -246,6 +252,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum +############################ +### HYPERSCORE FUNCTIONS ### +############################ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): mz_list = [] @@ -268,35 +277,87 @@ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: """ - Parse a peptidoform object to pyOpenMS compatible format. + Parse a peptide sequence in proforma format to pyOpenMS compatible format. + + Only supports UNIMOD format. Parameter --------- - Peptidoform: Peptidoform - Peptide string in Peptidoform format + peptide: str + Peptide string in proforma format Returns ------- AASequence (pyOpenMS): A peptide sequence in pyOpenMS format - int: - charge of the peptide """ + peptide = peptidoform.proforma + + # Reformat unimod modifications + pattern_unimod = r"\[UNIMOD:(\d+)\]" + + def replace_unimod(match): + return f"(UniMod:{match.group(1)})" + + peptide_oms_str = re.sub( + pattern=pattern_unimod, repl=replace_unimod, string=peptide + ) + + # Parse N-terminal modifications + if ")-" in peptide_oms_str: + peptide_oms_list = peptide_oms_str.split(")-") + nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] + nterm_modification += ")" + peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." + elif "]-" in peptide_oms_str: + peptide_oms_list = peptide_oms_str.split("]-") + nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] + nterm_modification += "]" + peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - n_term = peptidoform.properties["n_term"] - peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + # Split the charge from the peptide string + if "/" in peptide_oms_str: + peptide_oms_str, _ = peptide_oms_str.split("/") - for aa, mods in peptidoform.parsed_sequence: - peptide_oms_str += aa - if isinstance(mods, list): - peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + try: + peptide_oms = oms.AASequence.fromString(peptide_oms_str) + return peptide_oms - c_term = peptidoform.properties["c_term"] - peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + except: + return - peptide_oms = oms.AASequence.fromString(peptide_oms_str) - return peptide_oms +# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: +# """ +# Parse a peptidoform object to pyOpenMS compatible format. 
+ +# Parameter +# --------- +# Peptidoform: Peptidoform +# Peptide string in Peptidoform format + +# Returns +# ------- +# AASequence (pyOpenMS): +# A peptide sequence in pyOpenMS format +# int: +# charge of the peptide +# """ + +# n_term = peptidoform.properties["n_term"] +# peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + +# for aa, mods in peptidoform.parsed_sequence: +# peptide_oms_str += aa +# if isinstance(mods, list): +# peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + +# c_term = peptidoform.properties["c_term"] +# peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + +# peptide_oms = oms.AASequence.fromString(peptide_oms_str) + +# return peptide_oms def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: @@ -318,6 +379,8 @@ def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: """ # Reformat peptide sequence in pyOpenMS format peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) + if peptide_oms is None: + return # Initialize the required objects to create the spectrum spectrum = oms.MSSpectrum() @@ -375,9 +438,16 @@ def calculate_hyperscore( ) if len(observed_intensity) == 0: logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") - return 0.0 + return theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) + # This is mainly the cause of the modification not being allowed according to pyopenms + # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids + # according to the unimod database + if theoretical_spectrum is None: + logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}') + return + observed_spectrum_oms = oms.MSSpectrum() observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) hyperscore = oms.HyperScore() @@ -388,3 +458,8 @@ def calculate_hyperscore( theo_spectrum=theoretical_spectrum, ) return result + + +########################### +### FRAG SITE FUNCTIONS ### +########################### From c5902c21c0c5c72e71bb64cafc17c15c28b50191 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 22 Nov 2024 13:36:41 +0100 Subject: [PATCH 21/54] add mumble psm filtering functionality --- ms2rescore/core.py | 10 +++++++++- ms2rescore/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 48e125bc..f314bef6 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -18,6 +18,7 @@ add_peptide_confidence, add_psm_confidence, ) +from ms2rescore.utils import filter_mumble_psms logger = logging.getLogger(__name__) @@ -102,6 +103,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) psm_list = psm_list[psms_with_features] + if "mumble" in config["psm_generator"]: + # Remove PSMS that have a less matched ions than the original hit + psm_list = filter_mumble_psms(psm_list) + # Write feature names to file _write_feature_names(feature_names, output_file_root) @@ -251,7 +256,10 @@ def _write_feature_names(feature_names, output_file_root): def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) + (psm_list["qvalue"] <= 0.01) + & (psm_list["rank"] <= max_rank) + & (~psm_list["is_decoy"]) + & 
([metadata.get("original_psm", True) for metadata in psm_list["metadata"]]) ).sum() logger.info( f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 3515893a..33b1f0a1 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,8 +4,10 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import numpy as np from ms2rescore.exceptions import MS2RescoreConfigurationError +from psm_utils import PSMList logger = logging.getLogger(__name__) @@ -93,3 +95,32 @@ def _is_minitdf(spectrum_file: str) -> bool: files = set(Path(spectrum_file).glob("*ms2spectrum.bin")) files.update(Path(spectrum_file).glob("*ms2spectrum.parquet")) return len(files) >= 2 + + +def filter_mumble_psms(psm_list: PSMList) -> PSMList: + """ + Filter out PSMs with mumble in the peptide sequence. + """ + keep = [None] * len(psm_list) + original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) + spectrum_indices = np.array([psm.spectrum for psm in psm_list]) + runs = np.array([psm.run for psm in psm_list]) + if "matched_ions_pct" in psm_list[0].rescoring_features: + matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] + else: + return psm_list + for i, psm in enumerate(psm_list): + if isinstance(keep[i], bool): + continue + elif original_hit[i]: + keep[i] = True + else: + original_matched_ions_pct = np.logical_and.reduce( + [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run] + ) + if original_matched_ions_pct > matched_ions[i]: + keep[i] = False + else: + keep[i] = True + logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.") + return psm_list[keep] From 5ce55f501d4a52fe6714a904d6402b90d8450613 Mon Sep 17 00:00:00 2001 From: SamvPy Date: Fri, 22 Nov 2024 15:05:03 +0100 Subject: [PATCH 22/54] remove pyopenms dependency for hyperscore calculation --- ms2rescore/feature_generators/ms2.py | 296 +++++++-------------------- 1 file changed, 75 insertions(+), 221 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index eb049ed8..df12ca95 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -138,20 +138,14 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: if self.calculate_hyperscore: # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) 
- mz_list, intensity_list = _annotated_spectrum_to_mzint( - annotated_spectrum=annotated_spectrum + b, y = get_by_fragments(annotated_spectrum) + hs = calculate_hyperscore( + n_y=len(y), + n_b=len(b), + y_ion_intensities=y, + b_ion_intensities=b ) - hyperscore = calculate_hyperscore( - psm=psm, observed_mz=mz_list, observed_intensity=intensity_list - ) - if hyperscore is None: - continue - psm.rescoring_features.update( - { - "hyperscore": hyperscore - } - ) - + psm.rescoring_features.update({'hyperscore': hs}) @staticmethod def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: @@ -251,215 +245,75 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - -############################ -### HYPERSCORE FUNCTIONS ### -############################ -def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): - - mz_list = [] - intensity_list = [] - - for peak in annotated_spectrum: - - annotations = peak.annotation - for fragment in annotations: - ion = fragment.ion - if ion[0] not in ion_types: - continue - - mz_list.append(peak.experimental_mz) - intensity_list.append(peak.intensity) - break - - return mz_list, intensity_list - - -def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: - """ - Parse a peptide sequence in proforma format to pyOpenMS compatible format. - - Only supports UNIMOD format. - - Parameter - --------- - peptide: str - Peptide string in proforma format - - Returns - ------- - AASequence (pyOpenMS): - A peptide sequence in pyOpenMS format - """ - peptide = peptidoform.proforma - - # Reformat unimod modifications - pattern_unimod = r"\[UNIMOD:(\d+)\]" - - def replace_unimod(match): - return f"(UniMod:{match.group(1)})" - - peptide_oms_str = re.sub( - pattern=pattern_unimod, repl=replace_unimod, string=peptide - ) - - # Parse N-terminal modifications - if ")-" in peptide_oms_str: - peptide_oms_list = peptide_oms_str.split(")-") - nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] - nterm_modification += ")" - peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - elif "]-" in peptide_oms_str: - peptide_oms_list = peptide_oms_str.split("]-") - nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] - nterm_modification += "]" - peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - - # Split the charge from the peptide string - if "/" in peptide_oms_str: - peptide_oms_str, _ = peptide_oms_str.split("/") - - try: - peptide_oms = oms.AASequence.fromString(peptide_oms_str) - return peptide_oms - - except: - return - - -# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: -# """ -# Parse a peptidoform object to pyOpenMS compatible format. 
- -# Parameter -# --------- -# Peptidoform: Peptidoform -# Peptide string in Peptidoform format - -# Returns -# ------- -# AASequence (pyOpenMS): -# A peptide sequence in pyOpenMS format -# int: -# charge of the peptide -# """ - -# n_term = peptidoform.properties["n_term"] -# peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" - -# for aa, mods in peptidoform.parsed_sequence: -# peptide_oms_str += aa -# if isinstance(mods, list): -# peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" - -# c_term = peptidoform.properties["c_term"] -# peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" - -# peptide_oms = oms.AASequence.fromString(peptide_oms_str) - -# return peptide_oms - - -def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: + +def factorial(n): """ - Create a theoretical spectrum from a peptide sequence. - - Parameter - --------- - peptide: str - Peptide sequence in proforma format - engine: str - The engine to use to create theoretical spectrum. - Can only be 'pyopenms' or 'spectrum-utils' (default) - - Return - ------ - MSSpectrum - Spectrum object in pyOpenMS format + Compute factorial of n using a loop. + Parameters: + n (int): Non-negative integer. + Returns: + int: Factorial of n. """ - # Reformat peptide sequence in pyOpenMS format - peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) - if peptide_oms is None: - return - - # Initialize the required objects to create the spectrum - spectrum = oms.MSSpectrum() - tsg = oms.TheoreticalSpectrumGenerator() - p = oms.Param() - - p.setValue("add_b_ions", "true") - p.setValue("add_metainfo", "true") - tsg.setParameters(param=p) - - # Create the theoretical spectrum - tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2) - return spectrum - - -def calculate_hyperscore( - psm: PSM, - observed_mz: List[float], - observed_intensity: List[float], - fragment_tol_mass=20, - fragment_tol_mode="ppm", -): + result = 1 + for i in range(1, n + 1): + result *= i + return result + +def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): """ - Calculate the hyperscore as defined in the X!Tandem search engine. - - It is a metric of how good two spectra match with each other (matching peaks). - - Parameters - ---------- - psm: psm_utils.PSM - The PSM used to extract 'spectrum_id' (for MGF spectrum extraction) - and 'Peptidoform' (the peptide sequence) - observed_mz: List[float] - List of observed mz values with matching order as observed intensity - observed_intensity: List[float] - List of observed intensity values - fragment_tol_mass: int - The allowed tolerance to match peaks - fragment_tol_mode: str - 'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton. - Return - ------ - int - The hyperscore + Calculate the hyperscore for a peptide-spectrum match. + Parameters: + n_y (int): Number of matched y-ions. + n_b (int): Number of matched b-ions. + y_ion_intensities (list): Intensities of matched y-ions. + b_ion_intensities (list): Intensities of matched b-ions. + Returns: + float: Calculated hyperscore. """ - if fragment_tol_mode == "ppm": - fragment_mass_tolerance_unit_ppm = True - elif fragment_tol_mode == "Da": - fragment_mass_tolerance_unit_ppm = False - else: - raise Exception( - "fragment_tol_mode can only take 'Da' or 'ppm'. 
{} was provided.".format( - fragment_tol_mode - ) - ) - if len(observed_intensity) == 0: - logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") - return - - theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) - # This is mainly the cause of the modification not being allowed according to pyopenms - # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids - # according to the unimod database - if theoretical_spectrum is None: - logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}') - return - - observed_spectrum_oms = oms.MSSpectrum() - observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) - hyperscore = oms.HyperScore() - result = hyperscore.compute( - fragment_mass_tolerance=fragment_tol_mass, - fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm, - exp_spectrum=observed_spectrum_oms, - theo_spectrum=theoretical_spectrum, - ) - return result - - -########################### -### FRAG SITE FUNCTIONS ### -########################### + # Calculate the product of y-ion and b-ion intensities + product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1 + product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1 + + # Calculate factorial using custom function + factorial_y = factorial(n_y) + factorial_b = factorial(n_b) + + # Compute hyperscore + hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b)) + return hyperscore + +def infer_fragment_identity(frag, allow_ion_types=['b', 'y']): + ion = frag.ion + + is_allowed = False + for allowed_ion_type in allow_ion_types: + if allowed_ion_type in ion: + is_allowed=True + break + + if not is_allowed: + return False + # Radicals + if "·" in ion: + return False + if frag.neutral_loss is not None: + return False + if frag.charge > 2: + return False + + return ion[0] + +def get_by_fragments(annotated_spectrum): + b_intensities = [] + y_intensities = [] + for peak in annotated_spectrum: + + for fragment in peak.annotation: + + ion_type = infer_fragment_identity(fragment) + + if ion_type == 'b': + b_intensities.append(peak.intensity) + if ion_type == 'y': + y_intensities.append(peak.intensity) + return b_intensities, y_intensities \ No newline at end of file From 986c5f612bbd46d4f5f8f25e5d95425cde470e31 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 22 Nov 2024 16:00:42 +0100 Subject: [PATCH 23/54] fix spectrum_id accession --- ms2rescore/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 33b1f0a1..7897b924 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -103,7 +103,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList: """ keep = [None] * len(psm_list) original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) - spectrum_indices = np.array([psm.spectrum for psm in psm_list]) + spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) runs = np.array([psm.run for psm in psm_list]) if "matched_ions_pct" in psm_list[0].rescoring_features: matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] @@ -116,7 +116,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList: keep[i] = True else: original_matched_ions_pct = np.logical_and.reduce( - [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run] + [original_hit, spectrum_indices == psm.spectrum_id, runs == 
psm.run] ) if original_matched_ions_pct > matched_ions[i]: keep[i] = False From 5333e4601160a9d0ad77694671fe2c623fad3773 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 12:56:25 +0100 Subject: [PATCH 24/54] remove unused imports --- ms2rescore/feature_generators/ms2.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index df12ca95..65bb42a9 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -10,10 +10,7 @@ from typing import List, Optional, Union import numpy as np -import pyopenms as oms -from psm_utils import PSM, Peptidoform, PSMList -from psm_utils.peptidoform import PeptidoformException -from pyteomics.proforma import ProFormaError +from psm_utils import PSMList from pyteomics import mass, mgf, mzml from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum from pathlib import Path @@ -316,4 +313,4 @@ def get_by_fragments(annotated_spectrum): b_intensities.append(peak.intensity) if ion_type == 'y': y_intensities.append(peak.intensity) - return b_intensities, y_intensities \ No newline at end of file + return b_intensities, y_intensities From dd2259f90b14fe2de0bdaae249f38481089c5bc5 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 12:57:48 +0100 Subject: [PATCH 25/54] remove unused import in deeplc feature generator --- ms2rescore/feature_generators/deeplc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 287833f8..2ee6b063 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -25,7 +25,6 @@ import numpy as np from psm_utils import PSMList -from psm_utils.io import read_file from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType From d24ef309c68a166ac64f02bf8eb3291e40f8740a Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 13:06:42 +0100 Subject: [PATCH 26/54] add rustyms dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c1a1cbc8..595c9c37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", + "rustyms>=0.9.0a3", ] [project.optional-dependencies] From 21cafc7d2de05b3806ffa190ec09f9f39032a4ea Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 13:34:33 +0100 Subject: [PATCH 27/54] drop rustyms requirement to 0.8.3 rustyms 0.9.0a3 requires python 3.11, while we support 3.9. 
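If a newer rustyms is wanted again later, one possible approach (illustrative sketch only, not part of this change) is a conditional pin reusing the PEP 508 marker syntax already used for the tomli entry, e.g.:

    "rustyms>=0.8.3; python_version < '3.11'",
    "rustyms>=0.9.0a3; python_version >= '3.11'",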
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 595c9c37..d600a6cc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
     "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
-    "rustyms>=0.9.0a3",
+    "rustyms>=0.8.3",
 ]
 
 [project.optional-dependencies]

From ca9da7d9ffd5f147213d81ba48627c32dc4495fb Mon Sep 17 00:00:00 2001
From: ArthurDeclercq
Date: Fri, 17 Jan 2025 14:14:02 +0100
Subject: [PATCH 28/54] mumble related changes

---
 ms2rescore/core.py | 10 ++--
 ms2rescore/feature_generators/ms2.py | 76 ++++++++++++----------------
 ms2rescore/parse_psms.py | 13 +++++
 ms2rescore/utils.py | 67 ++++++++++++++++--------
 4 files changed, 97 insertions(+), 69 deletions(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index a75bf753..25eaf62b 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -111,8 +111,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         psm_list = psm_list[psms_with_features]
 
     if "mumble" in config["psm_generator"]:
-        # Remove PSMS that have a less matched ions than the original hit
-        psm_list = filter_mumble_psms(psm_list)
+        # Remove PSMs where matched_ions_pct drops 25% below the original hit
+        psm_list = filter_mumble_psms(psm_list, threshold=0.75)
+        # Currently replace the score with the hyperscore for Mumble
+        # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]]  # TODO: This is a temporary fix
 
     # Write feature names to file
     _write_feature_names(feature_names, output_file_root)
@@ -288,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList:
     )
 
     # Recalculate confidence
-    new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
+    new_confidence = lin_psm_data.assign_confidence(
+        scores=list(psm_list["score"])
+    )  # explicitly make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot
 
     # Add new confidence estimations to PSMList
     add_psm_confidence(psm_list, new_confidence)
diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index 65bb42a9..0b226a5d 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -7,13 +7,12 @@
 import re
 from collections import defaultdict
 from itertools import chain
-from typing import List, Optional, Union
+from typing import List, Optional
 
 import numpy as np
 from psm_utils import PSMList
-from pyteomics import mass, mgf, mzml
 from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum
-from pathlib import Path
+from ms2rescore_rs import get_ms2_spectra
 
 from ms2rescore.exceptions import ParseSpectrumError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
@@ -68,7 +67,6 @@ def __init__(
         self.processes = processes
         self.calculate_hyperscore = calculate_hyperscore
 
-
     @property
     def feature_names(self) -> List[str]:
         return [
@@ -109,26 +107,22 @@ def add_features(self, psm_list: PSMList) -> None:
 
     def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
         """Retrieve annotated spectra for all psms."""
-        spectrum_reader = self._read_spectrum_file(spectrum_file)
-
+        spectra = get_ms2_spectra(str(spectrum_file))
         spectrum_id_pattern = re.compile(
             self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
         )
-        try:
-            mapper = {
-                spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
-                for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys()
+
spectra_dict = { + spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum + for spectrum in spectra } except AttributeError: raise ParseSpectrumError( "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." ) - annotated_spectra = [ - self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id])) - for psm in psm_list - ] - for psm, annotated_spectrum in zip(psm_list, annotated_spectra): + + for psm in psm_list: + annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id]) psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) @@ -137,20 +131,9 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) b, y = get_by_fragments(annotated_spectrum) hs = calculate_hyperscore( - n_y=len(y), - n_b=len(b), - y_ion_intensities=y, - b_ion_intensities=b + n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b ) - psm.rescoring_features.update({'hyperscore': hs}) - - @staticmethod - def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: - - if spectrum_filepath.suffix.lower() == ".mzml": - return mzml.PreIndexedMzML(str(spectrum_filepath)) - elif spectrum_filepath.suffix.lower() == ".mgf": - return mgf.IndexedMGF(str(spectrum_filepath)) + psm.rescoring_features.update({"hyperscore": hs}) @staticmethod def _longest_ion_sequence(lst): @@ -219,16 +202,16 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): / (len(b_ions_matched) + len(y_ions_matched)), } - def _annotate_spectrum(self, psm, pyteomics_spectrum): + def _annotate_spectrum(self, psm, spectrum): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=mass.calculate_mass(psm.peptidoform.composition), - mz_array=pyteomics_spectrum["m/z array"], - intensity_array=pyteomics_spectrum["intensity array"], + precursor_mass=spectrum.precursor.mz, + mz_array=spectrum.mz, + intensity_array=spectrum.intensity, ) try: linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) @@ -242,7 +225,7 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - + def factorial(n): """ Compute factorial of n using a loop. @@ -255,7 +238,8 @@ def factorial(n): for i in range(1, n + 1): result *= i return result - + + def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): """ Calculate the hyperscore for a peptide-spectrum match. 
@@ -270,24 +254,25 @@ def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
     # Calculate the product of y-ion and b-ion intensities
     product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1
     product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1
-
+
     # Calculate factorial using custom function
     factorial_y = factorial(n_y)
     factorial_b = factorial(n_b)
-
+
     # Compute hyperscore
-    hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b))
+    hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b))
     return hyperscore
 
-def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):
+
+def infer_fragment_identity(frag, allow_ion_types=["b", "y"]):
     ion = frag.ion
 
     is_allowed = False
     for allowed_ion_type in allow_ion_types:
         if allowed_ion_type in ion:
-            is_allowed=True
+            is_allowed = True
             break
-
+
     if not is_allowed:
         return False
     # Radicals
@@ -297,20 +282,21 @@ def infer_fragment_identity(frag, allow_ion_types=["b", "y"]):
         return False
     if frag.charge > 2:
         return False
-
+
     return ion[0]
 
+
 def get_by_fragments(annotated_spectrum):
     b_intensities = []
     y_intensities = []
     for peak in annotated_spectrum:
-
+
         for fragment in peak.annotation:
 
             ion_type = infer_fragment_identity(fragment)
-
-            if ion_type == 'b':
+
+            if ion_type == "b":
                 b_intensities.append(peak.intensity)
-            if ion_type == 'y':
+            if ion_type == "y":
                 y_intensities.append(peak.intensity)
     return b_intensities, y_intensities
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 2f0ec3b0..cd69c6bb 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -94,6 +94,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     # Addition of Modifications for mass shifts in the PSMs with Mumble
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
+
+        # set include original psm to True and include decoy psm to true
+        if (
+            "include_original_psm" not in config["psm_generator"]["mumble"]
+            or not config["psm_generator"]["mumble"]
+        ):
+            config["psm_generator"]["mumble"]["include_original_psm"] = True
+        if (
+            "include_decoy_psm" not in config["psm_generator"]["mumble"]
+            or not config["psm_generator"]["mumble"]
+        ):
+            config["psm_generator"]["mumble"]["include_decoy_psm"] = True
+
         mumble_config = config["psm_generator"]["mumble"]
 
         # Check if psm_list[0].rescoring_features is empty or not
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index ca06db77..a64d6eca 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -95,30 +95,55 @@ def _is_minitdf(spectrum_file: str) -> bool:
     return len(files) >= 2
 
 
-def filter_mumble_psms(psm_list: PSMList) -> PSMList:
+def filter_mumble_psms(psm_list: PSMList, threshold=1) -> PSMList:
     """
-    Filter out PSMs with mumble in the peptide sequence.
+    Filter out mumble PSMs with `matched_ions_pct` lower than the original hit.
+ + Parameters + ---------- + psm_list : PSMList + List of PSMs to filter + threshold : float, optional + Threshold to lower the maximum matched_ions_pct of the original hit """ - keep = [None] * len(psm_list) - original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) + # Extract relevant fields from the PSM list + original_hit = np.array([metadata.get("original_psm") for metadata in psm_list["metadata"]]) spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) runs = np.array([psm.run for psm in psm_list]) - if "matched_ions_pct" in psm_list[0].rescoring_features: - matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] - else: + + # Check if matched_ions_pct exists + if "matched_ions_pct" not in psm_list[0].rescoring_features: return psm_list - for i, psm in enumerate(psm_list): - if isinstance(keep[i], bool): - continue - elif original_hit[i]: - keep[i] = True - else: - original_matched_ions_pct = np.logical_and.reduce( - [original_hit, spectrum_indices == psm.spectrum_id, runs == psm.run] - ) - if original_matched_ions_pct > matched_ions[i]: - keep[i] = False - else: - keep[i] = True - logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.") + + matched_ions = np.array([psm.rescoring_features["matched_ions_pct"] for psm in psm_list]) + + # Create unique keys for each (run, spectrum_id) + unique_keys = np.core.defchararray.add(runs.astype(str), spectrum_indices.astype(str)) + unique_keys_indices, inverse_indices = np.unique(unique_keys, return_inverse=True) + + # Initialize an array to store the `matched_ions_pct` of original hits per group + original_matched_ions_pct = np.full( + len(unique_keys_indices), -np.inf + ) # Default to -inf for groups without original hits + + # Assign the `matched_ions_pct` of original hits to their groups + np.maximum.at( + original_matched_ions_pct, inverse_indices[original_hit], matched_ions[original_hit] + ) + + # lower the maximum with the threshold + original_matched_ions_pct = original_matched_ions_pct * threshold + + # Broadcast the original `matched_ions_pct` back to all PSMs in each group + original_matched_ions_for_all = original_matched_ions_pct[inverse_indices] + + # Determine the filtering condition + keep = np.logical_or( + original_hit, # Always keep original hits + matched_ions + >= original_matched_ions_for_all, # Keep hits with `matched_ions_pct` >= the original + ) + + # Filter PSMs + logger.debug(f"Filtered out {len(psm_list) - np.sum(keep)} mumble PSMs.") return psm_list[keep] From c5b6eb02ca8345d4266bfedc1a889a4d19bf9ac8 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 17 Jan 2025 13:50:21 +0100 Subject: [PATCH 29/54] add mumble As mumble hasn't been published on pypi yet, use a git dependency for now. 
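For local installs before a PyPI release, the same pin can also be installed directly with pip as a PEP 508 direct reference (sketch; assumes a pip build with git support):

    pip install "mumble-mod @ git+https://github.com/compomics/mumble.git@f435e91"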
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index d600a6cc..f80647c8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
     "rustyms>=0.8.3",
+    "mumble-mod @ git+https://github.com/compomics/mumble.git@f435e91",
 ]
 
 [project.optional-dependencies]

From aee8ec76f983385b8da54ed196daf4201101e19a Mon Sep 17 00:00:00 2001
From: Kevin Velghe
Date: Tue, 21 Jan 2025 16:11:15 +0100
Subject: [PATCH 30/54] update mumble to use user cache dir

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f80647c8..7e2f587e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ dependencies = [
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
     "rustyms>=0.8.3",
-    "mumble-mod @ git+https://github.com/compomics/mumble.git@f435e91",
+    "mumble-mod @ git+https://github.com/compomics/mumble.git@114ad7d",
 ]
 
 [project.optional-dependencies]

From 7ce56c215bd530e61213a4d193955a96c008f61c Mon Sep 17 00:00:00 2001
From: Kevin Velghe
Date: Fri, 24 Jan 2025 14:34:08 +0100
Subject: [PATCH 31/54] bump im2deep dependency

im2deep.utils was introduced in 0.3.0.
https://github.com/compomics/IM2Deep/commit/0a4bc9de616ebefdf5d124e7625a5c1b86f588e5
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7e2f587e..96d0cec6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "customtkinter>=5,<6",
     "deeplc>=3.0,<3.1",
     "deeplcretrainer",
-    "im2deep>=0.1.3",
+    "im2deep>=0.3.0",
     "jinja2>=3",
     "lxml>=4.5",
     "mokapot>=0.10",

From 106ad8fd54cbed73d01a4fbfca437031fe17cda0 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq
Date: Fri, 14 Feb 2025 13:34:15 +0100
Subject: [PATCH 32/54] make mumble and rustyms optional dependency

---
 pyproject.toml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 96d0cec6..dc475819 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,13 +50,11 @@ dependencies = [
     "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
-    "rustyms>=0.8.3",
-    "mumble-mod @ git+https://github.com/compomics/mumble.git@114ad7d",
 ]
 
 [project.optional-dependencies]
 ionmob = ["ionmob>=0.2", "tensorflow"]
-ms2 = ["rustyms"]
+mumble = ["rustyms>=0.8.3", "mumble>=0.2.0"]
 dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"]
 docs = [
     "sphinx",

From 29aac8a224460fab9d0b16b5bfca1d1f6026da32 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq
Date: Tue, 23 Sep 2025 14:45:03 +0200
Subject: [PATCH 33/54] set defaults in mumble config

---
 ms2rescore/parse_psms.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index cd69c6bb..3c865074 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -94,19 +94,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     # Addition of Modifications for mass shifts in the PSMs with Mumble
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
-
-        # set include original psm to True and include decoy psm to true
-        if (
-            "include_original_psm" not in config["psm_generator"]["mumble"]
-            or not config["psm_generator"]["mumble"]
-        ):
-            config["psm_generator"]["mumble"]["include_original_psm"] = True
-        if (
-            "include_decoy_psm" not in config["psm_generator"]["mumble"]
-            or
not config["psm_generator"]["mumble"] - ): - config["psm_generator"]["mumble"]["include_decoy_psm"] = True - + config["psm_generator"]["mumble"]["include_original_psm"] = True + config["psm_generator"]["mumble"]["include_decoy_psm"] = True mumble_config = config["psm_generator"]["mumble"] # Check if psm_list[0].rescoring_features is empty or not From 487f66112d7f5e79152af4545c52578e2a1bb47e Mon Sep 17 00:00:00 2001 From: SamvPy Date: Wed, 3 Dec 2025 14:47:10 +0100 Subject: [PATCH 34/54] fix rustyms 0.8.0 -> 0.10.0 --- ms2rescore/feature_generators/ms2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 0b226a5d..121a6bcb 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -11,7 +11,7 @@ import numpy as np from psm_utils import PSMList -from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from rustyms import FragmentationModel, CompoundPeptidoformIon, MassMode, RawSpectrum from ms2rescore_rs import get_ms2_spectra from ms2rescore.exceptions import ParseSpectrumError @@ -214,7 +214,7 @@ def _annotate_spectrum(self, psm, spectrum): intensity_array=spectrum.intensity, ) try: - linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) + linear_peptide = CompoundPeptidoformIon(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( peptide=linear_peptide, model=self.fragmentation_model, From 05078cdf94648fcafc3eaffd856270a5689d7b5b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 22 Dec 2025 14:22:18 +0100 Subject: [PATCH 35/54] moved maxquant features to ms2 --- ms2rescore/feature_generators/maxquant.py | 208 ---------------------- ms2rescore/feature_generators/ms2.py | 52 ++++++ 2 files changed, 52 insertions(+), 208 deletions(-) delete mode 100644 ms2rescore/feature_generators/maxquant.py diff --git a/ms2rescore/feature_generators/maxquant.py b/ms2rescore/feature_generators/maxquant.py deleted file mode 100644 index 45205c2a..00000000 --- a/ms2rescore/feature_generators/maxquant.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Feature generator for PSMs from the MaxQuant search engine. - -MaxQuant msms.txt files contain various metrics from peptide-spectrum matching that can be used -to generate rescoring features. These include features related to the mass errors of the seven -fragment ions with the highest intensities, and features related to the ion current of the -identified fragment ions. - -""" - -import logging -from typing import List, Tuple - -import numpy as np -from psm_utils import PSMList - -from ms2rescore.exceptions import MS2RescoreError -from ms2rescore.feature_generators.base import FeatureGeneratorBase - -logger = logging.getLogger(__name__) - - -class MaxQuantFeatureGenerator(FeatureGeneratorBase): - """Generate MaxQuant-derived features.""" - - available_features = [ - "mean_error_top7", - "sq_mean_error_top7", - "stdev_error_top7", - "ln_explained_ion_current", - "ln_nterm_ion_current_ratio", - "ln_cterm_ion_current_ratio", - "ln_ms2_ion_current", - ] - - def __init__(self, *args, **kwargs) -> None: - """ - Generate MaxQuant-derived features. - - Attributes - ---------- - feature_names: list[str] - Names of the features that will be added to the PSMs. - - Raises - ------ - MissingMetadataError - If the required metadata entries are not present in the PSMs. 
- - """ - super().__init__(*args, **kwargs) - self._feature_names = self.available_features.copy() - - @property - def feature_names(self) -> List[str]: - return self._feature_names - - def add_features(self, psm_list: PSMList): - """ - Add MaxQuant-derived features to PSMs. - - Parameters - ---------- - psm_list - PSMs to add features to. - - """ - # Check if all PSMs are from MaxQuant - if not self._all_psms_from_maxquant(psm_list): - self._feature_names = [] # Set feature names to empty list to indicate none added - logger.warning("Not all PSMs are from MaxQuant. Skipping MaxQuant feature generation.") - return - else: - self._feature_names = self.available_features # Reset feature names - logger.info("Adding MaxQuant-derived features to PSMs.") - - # Infer mass deviations column name - for column_name in [ - "Mass deviations [Da]", - "Mass Deviations [Da]", - "Mass deviations [ppm]", - "Mass Deviations [ppm]", - ]: - if column_name in psm_list[0]["metadata"].keys(): - self._mass_deviations_key = column_name - break - else: - raise MissingMetadataError( - "No mass deviations entry in PSM metadata. Cannot compute MaxQuant features." - ) - - # Check other columns - for column_name in ["Intensities", "Matches", "Intensity coverage"]: - if column_name not in psm_list[0]["metadata"].keys(): - raise MissingMetadataError( - f"Missing {column_name} entry in PSM metadata. Cannot compute MaxQuant features." - ) - - # Add features to PSMs - for psm in psm_list: - psm["rescoring_features"].update(self._compute_features(psm["metadata"])) - - @staticmethod - def _all_psms_from_maxquant(psm_list): - """Check if the PSMs are from MaxQuant.""" - return (psm_list["source"] == "msms").all() - - def _compute_features(self, psm_metadata): - """Compute features from derived from intensities and mass errors.""" - features = {} - if all(k in psm_metadata.keys() for k in ["Intensities", self._mass_deviations_key]): - ( - features["mean_error_top7"], - features["sq_mean_error_top7"], - features["stdev_error_top7"], - ) = self._calculate_top7_peak_features( - psm_metadata["Intensities"], psm_metadata[self._mass_deviations_key] - ) - - if all(k in psm_metadata.keys() for k in ["Intensities", "Matches", "Intensity coverage"]): - ( - features["ln_explained_ion_current"], - features["ln_nterm_ion_current_ratio"], - features["ln_cterm_ion_current_ratio"], - features["ln_ms2_ion_current"], - ) = self._calculate_ion_current_features( - psm_metadata["Matches"], - psm_metadata["Intensities"], - psm_metadata["Intensity coverage"], - ) - - return features - - @staticmethod - def _calculate_top7_peak_features(intensities: str, mass_errors: str) -> Tuple[np.ndarray]: - """ - Calculate "top 7 peak"-related search engine features. 
- The following features are calculated: - - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - highest intensities - - sq_mean_error_top7: Squared MeanErrorTop7 - - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - peaks with the highest intensities - """ - try: - intensities = [float(i) for i in intensities.split(";")] - mass_errors = [float(i) for i in mass_errors.split(";")] - except ValueError: - return 0.0, 0.0, 0.0 - - indices_most_intens = np.array(intensities).argsort()[-1:-8:-1] - mass_errors_top7 = [(mass_errors[i]) for i in indices_most_intens] - mean_error_top7 = np.mean(mass_errors_top7) - sq_mean_error_top7 = mean_error_top7**2 - stdev_error_top7 = np.std(mass_errors_top7) - - return mean_error_top7, sq_mean_error_top7, stdev_error_top7 - - @staticmethod - def _calculate_ion_current_features( - matches: str, intensities: str, intensity_coverage: str - ) -> Tuple[np.ndarray]: - """ - Calculate ion current related search engine features. - The following features are calculated: - - ln_explained_ion_current: Summed intensity of identified fragment ions, - divided by that of all fragment ions, logged - - ln_nterm_ion_current_ratio: Summed intensity of identified N-terminal - fragments, divided by that of all identified fragments, logged - - ln_cterm_ion_current_ratio: Summed intensity of identified N-terminal - fragments, divided by that of all identified fragments, logged - - ln_ms2_ion_current: Summed intensity of all observed fragment ions, logged - """ - pseudo_count = 0.00001 - try: - ln_explained_ion_current = float(intensity_coverage) + pseudo_count - summed_intensities = sum([float(i) for i in intensities.split(";")]) - except ValueError: - return 0.0, 0.0, 0.0, 0.0 - - # Calculate ratio between matched b- and y-ion intensities - y_ion_int = sum( - [ - float(intensities.split(";")[i]) - for i, m in enumerate(matches.split(";")) - if m.startswith("y") - ] - ) - y_int_ratio = y_ion_int / summed_intensities - - ln_nterm_ion_current_ratio = (y_int_ratio + pseudo_count) * ln_explained_ion_current - ln_cterm_ion_current_ratio = (1 - y_int_ratio + pseudo_count) * ln_explained_ion_current - ln_ms2_ion_current = summed_intensities / ln_explained_ion_current - - out = [ - ln_explained_ion_current, - ln_nterm_ion_current_ratio, - ln_cterm_ion_current_ratio, - ln_ms2_ion_current, - ] - - return tuple([np.log(x) for x in out]) - - -class MissingMetadataError(MS2RescoreError): - """Exception raised when a required metadata entry is missing.""" - - pass diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 121a6bcb..c5ae3ba9 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -83,6 +83,9 @@ def feature_names(self) -> List[str]: "matched_y_ions_pct", "matched_ions_pct", "hyperscore", + "mean_error_top7", + "sq_mean_error_top7", + "stdev_error_top7", ] def add_features(self, psm_list: PSMList) -> None: @@ -146,6 +149,47 @@ def _longest_ion_sequence(lst): return max_sequence + @staticmethod + def _calculate_top7_peak_features(annotated_spectrum): + """ + Calculate "top 7 peak"-related features using mass errors. 
+ The following features are calculated: + - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the + highest intensities + - sq_mean_error_top7: Squared MeanErrorTop7 + - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion + peaks with the highest intensities + """ + if not annotated_spectrum: + return 0.0, 0.0, 0.0 + + # Collect peaks with annotations (matched peaks) and their mass errors + peak_data = [] + for peak in annotated_spectrum: + if peak.annotation: + for matched_ion in peak.annotation: + # Calculate mass error (ppm) between observed and theoretical m/z + theoretical_mz = matched_ion.mz + observed_mz = peak.mz + mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 + peak_data.append((peak.intensity, mass_error)) + + if len(peak_data) == 0: + return 0.0, 0.0, 0.0 + + # Sort by intensity and get top 7 + peak_data.sort(key=lambda x: x[0], reverse=True) + top7_errors = [error for _, error in peak_data[:7]] + + if len(top7_errors) == 0: + return 0.0, 0.0, 0.0 + + mean_error_top7 = np.mean(top7_errors) + sq_mean_error_top7 = mean_error_top7**2 + stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 + + return mean_error_top7, sq_mean_error_top7, stdev_error_top7 + def _calculate_spectrum_features(self, psm, annotated_spectrum): if not annotated_spectrum: @@ -180,6 +224,11 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): b_ion_matched_sum = np.sum(features["b_ion_matched"]) y_ion_matched_sum = np.sum(features["y_ion_matched"]) + # Calculate top 7 peak features (MaxQuant-derived) + mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( + annotated_spectrum + ) + return { "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), @@ -200,6 +249,9 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) / (len(b_ions_matched) + len(y_ions_matched)), + "mean_error_top7": mean_error_top7, + "sq_mean_error_top7": sq_mean_error_top7, + "stdev_error_top7": stdev_error_top7, } def _annotate_spectrum(self, psm, spectrum): From 2011241bcfbedb6a200815c44a65f020ca4f2f31 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:45:13 +0100 Subject: [PATCH 36/54] im2deep refactoring --- ms2rescore/feature_generators/im2deep.py | 147 ++++++++++++----------- 1 file changed, 79 insertions(+), 68 deletions(-) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index b48bbd96..b42fabc9 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -8,23 +8,22 @@ """ -import contextlib import logging import os from inspect import getfullargspec -from itertools import chain from typing import List import numpy as np import pandas as pd from im2deep.utils import im2ccs -from im2deep.im2deep import predict_ccs -from psm_utils import PSMList +from im2deep.im2deep import predict_ccs, REFERENCE_DATASET_PATH +from im2deep.calibrate import calculate_ccs_shift +from psm_utils import PSMList, Peptidoform from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" logger = logging.getLogger(__name__) @@ -75,69 +74,77 @@ def 
add_features(self, psm_list: PSMList) -> None: """Add IM2Deep-derived features to PSMs""" logger.info("Adding IM2Deep-derived features to PSMs") - # Get easy-access nested version of PSMlist - psm_dict = psm_list.get_psm_dict() - - # Run IM2Deep for each spectrum file - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - # Reset IM2Deep predictor for each collection of runs - for run, psms in runs.items(): - logger.info( - f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..." - ) - - # Disable wild logging to stdout by TensorFlow, unless in debug mode - with ( - contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) - if not self._verbose - else contextlib.nullcontext() - ): - # Make new PSM list for this run (chain PSMs per spectrum to flat list) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - logger.debug("Calibrating IM2Deep...") - - # Convert ion mobility to CCS and calibrate CCS values - psm_list_run_df = psm_list_run.to_dataframe() - psm_list_run_df["charge"] = [ - pep.precursor_charge for pep in psm_list_run_df["peptidoform"] - ] - psm_list_run_df["ccs_observed"] = im2ccs( - psm_list_run_df["ion_mobility"], - psm_list_run_df["precursor_mz"], - psm_list_run_df["charge"], - ) - - # Create dataframe with high confidence hits for calibration - cal_psm_df = self.make_calibration_df(psm_list_run_df) - - # Make predictions with IM2Deep - logger.debug("Predicting CCS values...") - predictions = predict_ccs( - psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs - ) - - # Add features to PSMs - logger.debug("Adding features to PSMs...") - observations = psm_list_run_df["ccs_observed"] - ccs_diffs_run = np.abs(predictions - observations) - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "ccs_observed_im2deep": observations[i], - "ccs_predicted_im2deep": predictions[i], - "ccs_error_im2deep": ccs_diffs_run[i], - "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]), - "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i]) - / observations[i] - * 100, - } - ) - - current_run += 1 + # Convert ion mobility to CCS + psm_list_df = psm_list.to_dataframe() + # Remove unnecessary columns to save memory #TODO: optimize further? 
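A minimal pandas sketch of the calibration pattern used in the rest of this hunk: one CCS shift is derived per run and charge state from high-confidence target PSMs against a reference table, merged back onto all PSMs, and missing combinations fall back to a shift of 0 before the shift is added to the predictions. This is only an illustration of the pattern; im2deep's calculate_ccs_shift (called with per_charge=True further down) is the actual implementation, and the reference-table column layout and the use of a median offset here are assumptions of the sketch.

import pandas as pd

def per_run_charge_shift(psms: pd.DataFrame, reference: pd.DataFrame) -> pd.DataFrame:
    # psms: assumed columns [run, charge, sequence, qvalue, is_decoy, ccs_observed, ccs_predicted]
    # reference: assumed columns [sequence, charge, ccs_ref]
    # Keep confident target hits only (mirrors the q-value filtering in make_calibration_df).
    cal = psms[(~psms["is_decoy"]) & (psms["qvalue"] < 0.01)]
    # Compare observed CCS to the reference CCS for peptides present in both tables.
    merged = cal.merge(reference, on=["sequence", "charge"])
    # One shift per (run, charge): median offset between observed and reference CCS.
    shifts = (
        merged.assign(ccs_shift=merged["ccs_observed"] - merged["ccs_ref"])
        .groupby(["run", "charge"], as_index=False)["ccs_shift"]
        .median()
    )
    # Apply to all PSMs; runs/charges without calibration data get a shift of 0.
    out = psms.merge(shifts, on=["run", "charge"], how="left")
    out["ccs_shift"] = out["ccs_shift"].fillna(0.0)
    out["ccs_calibrated"] = out["ccs_predicted"] + out["ccs_shift"]
    return out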
+ psm_list_df = psm_list_df[ + [ + "peptidoform", + "ion_mobility", + "precursor_mz", + "run", + "qvalue", + "is_decoy", + "metadata", + ] + ] + + psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply( + lambda x: x.proforma.split("/")[0] + ) + psm_list_df["ccs_observed"] = im2ccs( + psm_list_df["ion_mobility"], + psm_list_df["precursor_mz"], + psm_list_df["charge"], + ) + + # Make predictions with IM2Deep + logger.debug("Predicting CCS values...") + predictions = predict_ccs(psm_list, write_output=False, **self.im2deep_kwargs) + psm_list_df["ccs_predicted"] = predictions + + # Create dataframe with high confidence hits for calibration + logger.debug("Calibrating IM2Deep...") + reference_dataset = pd.read_csv(REFERENCE_DATASET_PATH) + reference_dataset["charge"] = reference_dataset["peptidoform"].apply( + lambda x: int(x.split("/")[1]) if isinstance(x, str) else x.precursor_charge + ) + logger.debug(f"Loaded reference dataset with {len(reference_dataset)} entries") + + run_shift_dict = {} + for run in psm_list_df["run"].unique(): + cal_run_psm_df = self.make_calibration_df(psm_list_df[psm_list_df["run"] == run]) + # Extract sequence from peptidoform for calculate_ccs_shift + shift = calculate_ccs_shift( + cal_df=cal_run_psm_df, reference_dataset=reference_dataset, per_charge=True + ) + run_shift_dict[run] = shift + shift_df = pd.DataFrame.from_dict(run_shift_dict, orient="index").stack().reset_index() + shift_df.columns = ["run", "charge", "ccs_shift"] + + # Apply calibration shifts + psm_list_df = psm_list_df.merge(shift_df, on=["run", "charge"], how="left") + psm_list_df["ccs_shift"] = psm_list_df["ccs_shift"].fillna( + 0 + ) # Fill missing shifts with 0 (no calibration for that run/charge) #TODO check with ROBBE + psm_list_df["ccs_predicted_im2deep"] = ( + psm_list_df["ccs_predicted"] + psm_list_df["ccs_shift"] + ) + psm_list_df.rename(columns={"ccs_predicted": "ccs_predicted_im2deep"}, inplace=True) + psm_list_df["ccs_error_im2deep"] = ( + psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed"] + ) + psm_list_df["abs_ccs_error_im2deep"] = np.abs(psm_list_df["ccs_error_im2deep"]) + psm_list_df["perc_ccs_error_im2deep"] = ( + np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed"] * 100 + ) + + psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") + # Add features to PSMs + logger.debug("Adding features to PSMs...") + for psm, features in zip(psm_list, psm_list_feature_dicts): + psm.rescoring_features.update(features) @staticmethod def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame: @@ -167,6 +174,10 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p calibration_psms = identified_psms[ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) ] + if isinstance(calibration_psms["peptidoform"].iloc[0], Peptidoform): + calibration_psms["peptidoform"] = calibration_psms["peptidoform"].apply( + lambda x: x.proforma + ) logger.debug( f"Number of high confidence hits for calculating shift: {len(calibration_psms)}" ) From 36750b444d2a48a67b9335a39dc3f581bf1b0bcd Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:45:25 +0100 Subject: [PATCH 37/54] ms2pip refactoring --- ms2rescore/feature_generators/ms2pip.py | 50 ++++++------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git 
a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index c830a79e..67ea48d0 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -25,22 +25,20 @@ import logging import multiprocessing -import os import warnings -from itertools import chain from typing import List, Optional, Union import numpy as np import pandas as pd -from ms2pip import correlate +from ms2pip import process_MS2_spectra from ms2pip.exceptions import NoMatchingSpectraFound from ms2pip.result import ProcessingResult + from psm_utils import PSMList from rich.progress import track -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -182,40 +180,14 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding MS²PIP-derived features to PSMs.") - psm_dict = psm_list.get_psm_dict() - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - for run, psms in runs.items(): - logger.info( - f"Running MS²PIP for PSMs from run ({current_run}/{total_runs}) `{run}`..." - ) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") - try: - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - ms2pip_results = correlate( - psms=psm_list_run, - spectrum_file=str(spectrum_filename), - spectrum_id_pattern=self.spectrum_id_pattern, - model=self.model, - ms2_tolerance=self.ms2_tolerance, - compute_correlations=False, - model_dir=self.model_dir, - processes=self.processes, - ) - except NoMatchingSpectraFound as e: - raise FeatureGeneratorException( - f"Could not find any matching spectra for PSMs from run `{run}`. " - "Please check that the `spectrum_id_pattern` and `psm_id_pattern` " - "options are configured correctly. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#mapping-psms-to-spectra" - " for more information." 
- ) from e - self._calculate_features(psm_list_run, ms2pip_results) - current_run += 1 + # temporarily remove spectrum from psm_utils before multiprocessing + ms2pip_results = process_MS2_spectra( + psms=psm_list, + model=self.model, + model_dir=self.model_dir, + processes=self.processes, + ) + self._calculate_features(psm_list, ms2pip_results) def _calculate_features( self, psm_list: PSMList, ms2pip_results: List[ProcessingResult] From 3091b0fad54f907651a354659aa1d46835362735 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:45:43 +0100 Subject: [PATCH 38/54] parsing spectra once and storing spectra objects --- ms2rescore/core.py | 2 - ms2rescore/parse_spectra.py | 164 ++++++++++++++++++------------------ 2 files changed, 80 insertions(+), 86 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 2c701ca4..4384c6f7 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -121,8 +121,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: if "mumble" in config["psm_generator"]: # Remove PSMs where matched_ions_pct drops 25% below the original hit psm_list = filter_mumble_psms(psm_list, threshold=0.75) - # Currently replace the score with the hyperscore for Mumble - # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix # Write feature names to file _write_feature_names(feature_names, output_file_root) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 103619c6..acff93e0 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -7,7 +7,8 @@ from typing import Optional, Set, Tuple import numpy as np -from ms2rescore_rs import Precursor, get_precursor_info +from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum +from ms2pip.spectrum import ObservedSpectrum from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError @@ -24,7 +25,6 @@ class MSDataType(str, Enum): precursor_mz = "precursor m/z" ms2_spectra = "MS2 spectra" - # Mimic behavior of StrEnum (Python >=3.11) def __str__(self): return self.value @@ -69,7 +69,7 @@ def add_precursor_values( # Check which data types are missing # Missing if: all values are 0, OR any values are None/NaN missing_data_types = set() - if spectrum_path is None: + if MSDataType.ms2_spectra in required_data_types: missing_data_types.add(MSDataType.ms2_spectra) rt_values = np.asarray(psm_list["retention_time"]) @@ -84,6 +84,8 @@ def add_precursor_values( if np.any(np.isnan(mz_values)) or np.all(mz_values == 0): missing_data_types.add(MSDataType.precursor_mz) + LOGGER.debug("Missing data types: %s", missing_data_types) + # Find data types that are both missing and required data_types_to_parse = missing_data_types & required_data_types @@ -104,36 +106,59 @@ def add_precursor_values( # Get precursor values from spectrum files LOGGER.info("Parsing precursor info from spectrum files...") - mz, rt, im = _get_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + _add_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + + # Extract precursor values from MS2 spectrum objects in a single pass + precursor_data = [ + (ms2.precursor.rt, ms2.precursor.im, ms2.precursor.mz) for ms2 in psm_list["spectrum"] + ] + rts, ims, mzs = map(np.array, zip(*precursor_data)) # Determine which data types were successfully found in spectrum files - # ms2rescore_rs always returns 0.0 for missing values - found_data_types = 
{MSDataType.ms2_spectra} # MS2 spectra available when processing files - if np.all(rt != 0.0): + found_data_types = {MSDataType.ms2_spectra} + + # Add found data types: if missing and all zeros, raise error + if not np.all(rts == 0.0): found_data_types.add(MSDataType.retention_time) - if np.all(im != 0.0): + if MSDataType.retention_time in data_types_to_parse: + LOGGER.debug( + "Missing retention time values in PSM list. Updating from spectrum files." + ) + psm_list["retention_time"] = rts + elif MSDataType.retention_time in data_types_to_parse: + raise SpectrumParsingError( + "Retention time values are required but not available in spectrum files " + "(all values are zero)." + ) + + if not np.all(ims == 0.0): found_data_types.add(MSDataType.ion_mobility) - if np.all(mz != 0.0): + if MSDataType.ion_mobility in data_types_to_parse: + LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") + psm_list["ion_mobility"] = ims + elif MSDataType.ion_mobility in data_types_to_parse: + raise SpectrumParsingError( + "Ion mobility values are required but not available in spectrum files " + "(all values are zero)." + ) + + if np.all(mzs == 0.0): found_data_types.add(MSDataType.precursor_mz) + if MSDataType.precursor_mz in data_types_to_parse: + LOGGER.debug("Missing precursor m/z values in PSM list. Updating from spectrum files.") + psm_list["precursor_mz"] = mzs + elif MSDataType.precursor_mz in data_types_to_parse: + raise SpectrumParsingError( + "Precursor m/z values are required but not available in spectrum files " + "(all values are zero)." + ) - # Update PSM list with missing precursor values that were found - update_types = data_types_to_parse & found_data_types - - if MSDataType.retention_time in update_types: - LOGGER.debug("Missing retention time values in PSM list. Updating from spectrum files.") - psm_list["retention_time"] = rt - if MSDataType.ion_mobility in update_types: - LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") - psm_list["ion_mobility"] = im - if MSDataType.precursor_mz in update_types: - LOGGER.debug("Missing precursor m/z values in PSM list. 
Updating from spectrum files.") - psm_list["precursor_mz"] = mz - elif ( + # Check if precursor m/z values are consistent between PSMs and spectrum files + if ( MSDataType.precursor_mz not in missing_data_types and MSDataType.precursor_mz in found_data_types ): - # Check if precursor m/z values are consistent between PSMs and spectrum files - mz_diff = np.abs(psm_list["precursor_mz"] - mz) + mz_diff = np.abs(psm_list["precursor_mz"] - mzs) if np.mean(mz_diff) > 1e-2: LOGGER.warning( "Mismatch between precursor m/z values in PSM list and spectrum files (mean " @@ -149,20 +174,22 @@ def add_precursor_values( return available_data_types -def _apply_spectrum_id_pattern( - precursors: dict[str, Precursor], pattern: str +def _acquire_observed_spectra_dict( + ms2: list[MS2Spectrum], pattern: str, spectrum_ids: list[str] ) -> dict[str, Precursor]: """Apply spectrum ID pattern to precursor IDs.""" # Map precursor IDs using regex pattern compiled_pattern = re.compile(pattern) - id_mapping = { - match.group(1): spectrum_id - for spectrum_id in precursors.keys() - if (match := compiled_pattern.search(spectrum_id)) is not None + + ms2_observed_spectra_mapping = { + match.group(1): ms2_spectrum + for ms2_spectrum in ms2 + if (match := compiled_pattern.search(str(ms2_spectrum.identifier))) is not None + and match.group(1) in spectrum_ids } # Validate that any IDs were matched - if not id_mapping: + if not ms2_observed_spectra_mapping: raise MS2RescoreConfigurationError( "'spectrum_id_pattern' did not match any spectrum-file IDs. Please check and try " "again. See " @@ -170,64 +197,33 @@ def _apply_spectrum_id_pattern( "for more information." ) - # Validate that the same number of unique IDs were matched - elif len(id_mapping) != len(precursors): - new_id, old_id = next(iter(id_mapping.items())) - raise MS2RescoreConfigurationError( - "'spectrum_id_pattern' resulted in a different number of unique spectrum IDs. This " - "indicates issues with the regex pattern. Please check and try again. " - f"Example old ID: '{old_id}' -> new ID: '{new_id}'. " - "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " - "for more information." - ) - - precursors = {new_id: precursors[orig_id] for new_id, orig_id in id_mapping.items()} - - return precursors + return ms2_observed_spectra_mapping -def _get_precursor_values( +def _add_precursor_values( psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: Optional[str] = None -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: +) -> None: """Get precursor m/z, RT, and IM from spectrum files.""" # Iterate over different runs in PSM list - precursor_dict = dict() - psm_dict = psm_list.get_psm_dict() - for runs in psm_dict.values(): - for run_name, psms in runs.items(): - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_file = infer_spectrum_path(spectrum_path, run_name) - - LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) - precursors: dict[str, Precursor] = get_precursor_info(str(spectrum_file)) - - # Parse spectrum IDs with regex pattern if provided - if spectrum_id_pattern: - precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern) - - # Ensure all PSMs have precursor values - for psm in psm_list_run: - if psm.spectrum_id not in precursors: - raise MS2RescoreConfigurationError( - "Mismatch between PSM and spectrum file IDs. 
Could not find precursor " - f"values for PSM with ID {psm.spectrum_id} in run {run_name}.\n" - "Please check that the `spectrum_id_pattern` and `psm_id_pattern` options " - "are configured correctly. See " - "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra" - " for more information.\n" - f"Example ID from PSM file: {psm.spectrum_id}\n" - f"Example ID from spectrum file: {list(precursors.keys())[0]}" - ) - - # Store precursor values in dictionary - precursor_dict[run_name] = precursors - - # Reshape precursor values into arrays matching PSM list - mzs = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list), float) - rts = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list), float) - ims = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list), float) - - return mzs, rts, ims + if spectrum_id_pattern is None: + spectrum_id_pattern = r"^(.*)$" # Match entire identifier if no pattern provided + + for run_name in set(psm_list["run"]): + run_mask = psm_list["run"] == run_name + psm_list_run = psm_list[run_mask] + spectrum_file = infer_spectrum_path(spectrum_path, run_name) + + LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) + ms2_spectra: list[MS2Spectrum] = get_ms2_spectra(str(spectrum_file)) + + # Parse spectrum IDs with regex pattern if provided + ms2_spectra_dict = _acquire_observed_spectra_dict( + ms2_spectra, spectrum_id_pattern, psm_list_run["spectrum_id"] + ) + + psm_list_run["spectrum"] = [ + ms2_spectra_dict[spec_id] for spec_id in psm_list_run["spectrum_id"] + ] class SpectrumParsingError(MS2RescoreError): From 5b3d4c466b5814a044f79b866696a74af53ec627 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:46:06 +0100 Subject: [PATCH 39/54] directly operate on spectra objects instead of reacquiring them --- ms2rescore/feature_generators/ms2.py | 148 ++++++++++----------------- 1 file changed, 56 insertions(+), 92 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index c5ae3ba9..ebeff27b 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -6,17 +6,13 @@ import logging import re from collections import defaultdict -from itertools import chain from typing import List, Optional import numpy as np from psm_utils import PSMList -from rustyms import FragmentationModel, CompoundPeptidoformIon, MassMode, RawSpectrum -from ms2rescore_rs import get_ms2_spectra +from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum -from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase -from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -83,49 +79,17 @@ def feature_names(self) -> List[str]: "matched_y_ions_pct", "matched_ions_pct", "hyperscore", - "mean_error_top7", - "sq_mean_error_top7", - "stdev_error_top7", + # "mean_error_top7", + # "sq_mean_error_top7", + # "stdev_error_top7", ] def add_features(self, psm_list: PSMList) -> None: """Add MS2-derived features to PSMs.""" logger.info("Adding MS2-derived features to PSMs.") - psm_dict = psm_list.get_psm_dict() - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - - for runs in psm_dict.values(): - for run, psms in runs.items(): - logger.info( - f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..." 
- ) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - - self._calculate_features(psm_list_run, spectrum_filename) - current_run += 1 - - def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: - """Retrieve annotated spectra for all psms.""" - - spectra = get_ms2_spectra(str(spectrum_file)) - spectrum_id_pattern = re.compile( - self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" - ) - try: - spectra_dict = { - spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum - for spectrum in spectra - } - except AttributeError: - raise ParseSpectrumError( - "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." - ) - for psm in psm_list: - annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id]) + annotated_spectrum = self._annotate_spectrum(psm) psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) @@ -149,46 +113,46 @@ def _longest_ion_sequence(lst): return max_sequence - @staticmethod - def _calculate_top7_peak_features(annotated_spectrum): - """ - Calculate "top 7 peak"-related features using mass errors. - The following features are calculated: - - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - highest intensities - - sq_mean_error_top7: Squared MeanErrorTop7 - - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - peaks with the highest intensities - """ - if not annotated_spectrum: - return 0.0, 0.0, 0.0 - - # Collect peaks with annotations (matched peaks) and their mass errors - peak_data = [] - for peak in annotated_spectrum: - if peak.annotation: - for matched_ion in peak.annotation: - # Calculate mass error (ppm) between observed and theoretical m/z - theoretical_mz = matched_ion.mz - observed_mz = peak.mz - mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 - peak_data.append((peak.intensity, mass_error)) - - if len(peak_data) == 0: - return 0.0, 0.0, 0.0 - - # Sort by intensity and get top 7 - peak_data.sort(key=lambda x: x[0], reverse=True) - top7_errors = [error for _, error in peak_data[:7]] - - if len(top7_errors) == 0: - return 0.0, 0.0, 0.0 - - mean_error_top7 = np.mean(top7_errors) - sq_mean_error_top7 = mean_error_top7**2 - stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 - - return mean_error_top7, sq_mean_error_top7, stdev_error_top7 + # @staticmethod + # def _calculate_top7_peak_features(annotated_spectrum): + # """ + # Calculate "top 7 peak"-related features using mass errors. 
+ # The following features are calculated: + # - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the + # highest intensities + # - sq_mean_error_top7: Squared MeanErrorTop7 + # - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion + # peaks with the highest intensities + # """ + # if not annotated_spectrum: + # return 0.0, 0.0, 0.0 + + # # Collect peaks with annotations (matched peaks) and their mass errors + # peak_data = [] + # for peak in annotated_spectrum: + # if peak.annotation: + # for matched_ion in peak.annotation: + # # Calculate mass error (ppm) between observed and theoretical m/z + # theoretical_mz = matched_ion.mz + # observed_mz = peak.mz + # mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 + # peak_data.append((peak.intensity, mass_error)) + + # if len(peak_data) == 0: + # return 0.0, 0.0, 0.0 + + # # Sort by intensity and get top 7 + # peak_data.sort(key=lambda x: x[0], reverse=True) + # top7_errors = [error for _, error in peak_data[:7]] + + # if len(top7_errors) == 0: + # return 0.0, 0.0, 0.0 + + # mean_error_top7 = np.mean(top7_errors) + # sq_mean_error_top7 = mean_error_top7**2 + # stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 + + # return mean_error_top7, sq_mean_error_top7, stdev_error_top7 def _calculate_spectrum_features(self, psm, annotated_spectrum): @@ -225,9 +189,9 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): y_ion_matched_sum = np.sum(features["y_ion_matched"]) # Calculate top 7 peak features (MaxQuant-derived) - mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( - annotated_spectrum - ) + # mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( + # annotated_spectrum + # ) return { "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), @@ -249,24 +213,24 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) / (len(b_ions_matched) + len(y_ions_matched)), - "mean_error_top7": mean_error_top7, - "sq_mean_error_top7": sq_mean_error_top7, - "stdev_error_top7": stdev_error_top7, + # "mean_error_top7": mean_error_top7, + # "sq_mean_error_top7": sq_mean_error_top7, + # "stdev_error_top7": stdev_error_top7, } - def _annotate_spectrum(self, psm, spectrum): + def _annotate_spectrum(self, psm): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=spectrum.precursor.mz, - mz_array=spectrum.mz, - intensity_array=spectrum.intensity, + precursor_mass=psm.spectrum.precursor.mz, + mz_array=psm.spectrum.mz, + intensity_array=psm.spectrum.intensity, ) try: - linear_peptide = CompoundPeptidoformIon(psm.peptidoform.proforma.split("/")[0]) + linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( peptide=linear_peptide, model=self.fragmentation_model, From a3cbb1b1e45fb8888a633f4e30f5f2b53b5f15f2 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:46:21 +0100 Subject: [PATCH 40/54] updated profiling --- ms2rescore/__main__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py index 85309ffc..52e377f3 100644 --- a/ms2rescore/__main__.py +++ b/ms2rescore/__main__.py @@ -6,6 +6,7 @@ import json 
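The profiling tweak in this hunk (body just below) appends a timestamp to the cProfile dump so repeated runs no longer overwrite each other. As a small usage sketch, such a dump can be inspected afterwards with the standard-library pstats module; the file name used here is a made-up example of the timestamped pattern.

import pstats

# Load a dump produced by profiler.dump_stats(...) and list the slowest calls.
stats = pstats.Stats("rescore_output.profile_20260112_104500.prof")
stats.sort_stats("cumulative").print_stats(20)  # top 20 entries by cumulative time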
import logging import sys +from datetime import datetime from pathlib import Path from typing import Union @@ -196,7 +197,13 @@ def profile(fnc, filepath): def inner(*args, **kwargs): with cProfile.Profile() as profiler: return_value = fnc(*args, **kwargs) - profiler.dump_stats(filepath + ".profile.prof") + + # Add timestamp to profiler output filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + profile_filename = f"{filepath}.profile_{timestamp}.prof" + profiler.dump_stats(profile_filename) + LOGGER.info(f"Profile data written to: {profile_filename}") + return return_value return inner @@ -248,6 +255,7 @@ def main(tims=False): # Run MS²Rescore try: if config["ms2rescore"]["profile"]: + LOGGER.info("Profiling enabled") profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"]) profiled_rescore(configuration=config) else: From a1df72d346b181b3ef1d1478cd919ea5f3cd787c Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 24 Dec 2025 16:46:37 +0100 Subject: [PATCH 41/54] removed maxquant generator from fg --- ms2rescore/feature_generators/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index ac09923e..4956868c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -7,7 +7,6 @@ from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator -from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator @@ -15,7 +14,6 @@ "basic": BasicFeatureGenerator, "ms2pip": MS2PIPFeatureGenerator, "deeplc": DeepLCFeatureGenerator, - "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, "ms2": MS2FeatureGenerator, From 2c7a09b7392895ab85eb29390aad5b456f1ee36e Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 5 Jan 2026 13:09:04 +0100 Subject: [PATCH 42/54] changes to column names --- ms2rescore/feature_generators/im2deep.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index b42fabc9..12094306 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -90,10 +90,8 @@ def add_features(self, psm_list: PSMList) -> None: ] psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] - psm_list_df["sequence"] = psm_list_df["peptidoform"].apply( - lambda x: x.proforma.split("/")[0] - ) - psm_list_df["ccs_observed"] = im2ccs( + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.proforma) + psm_list_df["ccs_observed_im2deep"] = im2ccs( psm_list_df["ion_mobility"], psm_list_df["precursor_mz"], psm_list_df["charge"], @@ -115,7 +113,10 @@ def add_features(self, psm_list: PSMList) -> None: run_shift_dict = {} for run in psm_list_df["run"].unique(): cal_run_psm_df = self.make_calibration_df(psm_list_df[psm_list_df["run"] == run]) - # Extract sequence from peptidoform for calculate_ccs_shift + # Rename for calculate_ccs_shift compatibility + cal_run_psm_df = cal_run_psm_df.rename( + columns={"ccs_observed_im2deep": "ccs_observed"} + ) shift = calculate_ccs_shift( 
cal_df=cal_run_psm_df, reference_dataset=reference_dataset, per_charge=True ) @@ -131,13 +132,12 @@ def add_features(self, psm_list: PSMList) -> None: psm_list_df["ccs_predicted_im2deep"] = ( psm_list_df["ccs_predicted"] + psm_list_df["ccs_shift"] ) - psm_list_df.rename(columns={"ccs_predicted": "ccs_predicted_im2deep"}, inplace=True) psm_list_df["ccs_error_im2deep"] = ( - psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed"] + psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed_im2deep"] ) psm_list_df["abs_ccs_error_im2deep"] = np.abs(psm_list_df["ccs_error_im2deep"]) psm_list_df["perc_ccs_error_im2deep"] = ( - np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed"] * 100 + np.abs(psm_list_df["ccs_error_im2deep"]) / psm_list_df["ccs_observed_im2deep"] * 100 ) psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") From e855abf42fa3c8662820f90571aeff0c94eeec54 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 5 Jan 2026 14:39:48 +0100 Subject: [PATCH 43/54] changes to avoid out of memory error due to multiprocessing --- ms2rescore/feature_generators/ms2pip.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 67ea48d0..3fdb62ad 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -194,13 +194,16 @@ def _calculate_features( ) -> None: """Calculate features from all MS²PIP results and add to PSMs.""" logger.debug("Calculating features from predicted spectra") - with multiprocessing.Pool(int(self.processes)) as pool: + # Use spawn context to avoid fork memory duplication issues + # maxtasksperchild recycles workers to free accumulated memory + ctx = multiprocessing.get_context("spawn") + with ctx.Pool(int(self.processes), maxtasksperchild=500) as pool: # Use imap, so we can use a progress bar counts_failed = 0 for result, features in zip( ms2pip_results, track( - pool.imap(self._calculate_features_single, ms2pip_results, chunksize=1000), + pool.imap(self._calculate_features_single, ms2pip_results, chunksize=250), total=len(ms2pip_results), description="Calculating features...", transient=True, From 577df191ddef858e33d4bef3bd6849b6fd4c8509 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 5 Jan 2026 14:40:14 +0100 Subject: [PATCH 44/54] replace list with set to reduce lookup time to O(1) --- ms2rescore/parse_spectra.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index acff93e0..9b58c1da 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -180,12 +180,13 @@ def _acquire_observed_spectra_dict( """Apply spectrum ID pattern to precursor IDs.""" # Map precursor IDs using regex pattern compiled_pattern = re.compile(pattern) + spectrum_ids_set = set(spectrum_ids) # For faster lookup ms2_observed_spectra_mapping = { match.group(1): ms2_spectrum for ms2_spectrum in ms2 if (match := compiled_pattern.search(str(ms2_spectrum.identifier))) is not None - and match.group(1) in spectrum_ids + and match.group(1) in spectrum_ids_set } # Validate that any IDs were matched From a80238b1dc105eda164df8ca5762304a78b5c7a0 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 10:50:04 +0100 Subject: [PATCH 45/54] remove unused imports --- ms2rescore/parse_spectra.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 9b58c1da..d897c40c 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -3,12 +3,11 @@ import logging import re from enum import Enum -from itertools import chain -from typing import Optional, Set, Tuple +from typing import Optional, Set import numpy as np from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum -from ms2pip.spectrum import ObservedSpectrum + from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError From abf66b4396a10ef4b150ab2031789ba99ffe73d7 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 10:50:23 +0100 Subject: [PATCH 46/54] migrate ms2 and ms2pip features to ms2rescore-rs --- ms2rescore/feature_generators/ms2.py | 271 ++---------------------- ms2rescore/feature_generators/ms2pip.py | 210 +++--------------- 2 files changed, 47 insertions(+), 434 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index ebeff27b..6ef573a2 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,29 +4,16 @@ """ import logging -import re -from collections import defaultdict + from typing import List, Optional import numpy as np from psm_utils import PSMList -from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum - +from ms2rescore_rs import batch_ms2_features_from_spectra from ms2rescore.feature_generators.base import FeatureGeneratorBase logger = logging.getLogger(__name__) -FRAGMENTATION_MODELS = { - "cidhcd": FragmentationModel.CidHcd, - "etd": FragmentationModel.Etd, - "ethcd": FragmentationModel.Ethcd, - "all": FragmentationModel.All, -} -MASS_MODES = { - "average": MassMode.Average, - "monoisotopic": MassMode.Monoisotopic, -} - class MS2FeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -36,10 +23,10 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", - fragmentation_model: str = "All", - mass_mode: str = "Monoisotopic", + fragmentation_model: str = "cidhcd", + mass_mode: str = "monoisotopic", processes: int = 1, - calculate_hyperscore: bool = True, # Allow optional ? + calculate_hyperscore: bool = True, **kwargs, ) -> None: """ @@ -58,8 +45,8 @@ def __init__( self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern - self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] - self.mass_mode = MASS_MODES[mass_mode.lower()] + self.fragmentation_model = fragmentation_model.lower() + self.mass_mode = mass_mode.lower() self.processes = processes self.calculate_hyperscore = calculate_hyperscore @@ -79,240 +66,24 @@ def feature_names(self) -> List[str]: "matched_y_ions_pct", "matched_ions_pct", "hyperscore", - # "mean_error_top7", - # "sq_mean_error_top7", - # "stdev_error_top7", ] def add_features(self, psm_list: PSMList) -> None: - """Add MS2-derived features to PSMs.""" - logger.info("Adding MS2-derived features to PSMs.") - for psm in psm_list: - annotated_spectrum = self._annotate_spectrum(psm) - psm.rescoring_features.update( - self._calculate_spectrum_features(psm, annotated_spectrum) - ) - - if self.calculate_hyperscore: - # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) 
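The hyperscore that the removed Python path below computed, and that the Rust-backed batch_ms2_features_from_spectra call is expected to keep providing, is log(n_b! * n_y! * (sum of matched b-ion intensities + sum of matched y-ion intensities)), where an empty ion series contributes 1 to the intensity term and only b/y ions up to charge 2 without neutral losses are counted. A minimal stand-alone sketch of that formula; expressing the factorials through math.lgamma is a choice made for numerical stability in this sketch, not something the original code did.

import math
from typing import Sequence

def hyperscore(b_intensities: Sequence[float], y_intensities: Sequence[float]) -> float:
    """log(n_b! * n_y! * (sum_b + sum_y)), written in log space to avoid huge intermediates."""
    n_b, n_y = len(b_intensities), len(y_intensities)
    # Empty ion series contribute 1 to the intensity term, as in the removed implementation.
    sum_b = sum(b_intensities) if b_intensities else 1.0
    sum_y = sum(y_intensities) if y_intensities else 1.0
    # lgamma(n + 1) == log(n!), so the product of factorials becomes a sum of log terms.
    return math.lgamma(n_b + 1) + math.lgamma(n_y + 1) + math.log(sum_b + sum_y)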
- b, y = get_by_fragments(annotated_spectrum) - hs = calculate_hyperscore( - n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b - ) - psm.rescoring_features.update({"hyperscore": hs}) - - @staticmethod - def _longest_ion_sequence(lst): - max_sequence = 0 - current_sequence = 0 - - for value in lst: - current_sequence = current_sequence + 1 if value else 0 - max_sequence = max(max_sequence, current_sequence) - - return max_sequence - - # @staticmethod - # def _calculate_top7_peak_features(annotated_spectrum): - # """ - # Calculate "top 7 peak"-related features using mass errors. - # The following features are calculated: - # - mean_error_top7: Mean of mass errors of the seven fragment ion peaks with the - # highest intensities - # - sq_mean_error_top7: Squared MeanErrorTop7 - # - stdev_error_top7: Standard deviation of mass errors of the seven fragment ion - # peaks with the highest intensities - # """ - # if not annotated_spectrum: - # return 0.0, 0.0, 0.0 - - # # Collect peaks with annotations (matched peaks) and their mass errors - # peak_data = [] - # for peak in annotated_spectrum: - # if peak.annotation: - # for matched_ion in peak.annotation: - # # Calculate mass error (ppm) between observed and theoretical m/z - # theoretical_mz = matched_ion.mz - # observed_mz = peak.mz - # mass_error = ((observed_mz - theoretical_mz) / theoretical_mz) * 1e6 - # peak_data.append((peak.intensity, mass_error)) - - # if len(peak_data) == 0: - # return 0.0, 0.0, 0.0 - - # # Sort by intensity and get top 7 - # peak_data.sort(key=lambda x: x[0], reverse=True) - # top7_errors = [error for _, error in peak_data[:7]] - - # if len(top7_errors) == 0: - # return 0.0, 0.0, 0.0 - - # mean_error_top7 = np.mean(top7_errors) - # sq_mean_error_top7 = mean_error_top7**2 - # stdev_error_top7 = np.std(top7_errors) if len(top7_errors) > 1 else 0.0 - - # return mean_error_top7, sq_mean_error_top7, stdev_error_top7 - - def _calculate_spectrum_features(self, psm, annotated_spectrum): - - if not annotated_spectrum: - return {} - - features = defaultdict(list) - b_ions_matched = [False] * (len(psm.peptidoform.sequence)) - y_ions_matched = [False] * (len(psm.peptidoform.sequence)) - - pseudo_count = 0.00001 - ion_fragment_regex = re.compile(r"\d+") - - for peak in annotated_spectrum: - features["total_intensity"].append(peak.intensity) - - if peak.annotation: - features["matched_intensity"].append(peak.intensity) - for matched_ion in peak.annotation: - if "y" in matched_ion.ion: - features["y_ion_matched"].append(peak.intensity) - y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( - True - ) - elif "b" in matched_ion.ion: - features["b_ion_matched"].append(peak.intensity) - b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( - True - ) - total_intensity_sum = np.sum(features["total_intensity"]) - matched_intensity_sum = np.sum(features["matched_intensity"]) - b_ion_matched_sum = np.sum(features["b_ion_matched"]) - y_ion_matched_sum = np.sum(features["y_ion_matched"]) - - # Calculate top 7 peak features (MaxQuant-derived) - # mean_error_top7, sq_mean_error_top7, stdev_error_top7 = self._calculate_top7_peak_features( - # annotated_spectrum - # ) - - return { - "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), - "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), - "ln_explained_intensity_ratio": np.log( - matched_intensity_sum / total_intensity_sum + pseudo_count - ), - "ln_explained_b_ion_ratio": np.log( - b_ion_matched_sum / 
matched_intensity_sum + pseudo_count - ), - "ln_explained_y_ion_ratio": np.log( - y_ion_matched_sum / matched_intensity_sum + pseudo_count - ), - "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), - "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), - "matched_b_ions": sum(b_ions_matched), - "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), - "matched_y_ions": sum(y_ions_matched), - "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), - "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) - / (len(b_ions_matched) + len(y_ions_matched)), - # "mean_error_top7": mean_error_top7, - # "sq_mean_error_top7": sq_mean_error_top7, - # "stdev_error_top7": stdev_error_top7, - } - - def _annotate_spectrum(self, psm): - - spectrum = RawSpectrum( - title=psm.spectrum_id, - num_scans=1, - rt=psm.retention_time, - precursor_charge=psm.get_precursor_charge(), - precursor_mass=psm.spectrum.precursor.mz, - mz_array=psm.spectrum.mz, - intensity_array=psm.spectrum.intensity, + spectra = psm_list["spectrum"] + # Keep parity with your current behavior: + proformas = [psm.peptidoform.proforma.split("/")[0] for psm in psm_list] + seq_lens = [len(psm.peptidoform.sequence) for psm in psm_list] + + feature_dicts = batch_ms2_features_from_spectra( + spectra=spectra, + proformas=proformas, + seq_lens=seq_lens, + fragmentation_model=self.fragmentation_model, + mass_mode=self.mass_mode, + calculate_hyperscore=self.calculate_hyperscore, ) - try: - linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) - annotated_spectrum = spectrum.annotate( - peptide=linear_peptide, - model=self.fragmentation_model, - mode=self.mass_mode, - ) - except: # noqa E722 - return [] - - return annotated_spectrum.spectrum - - -def factorial(n): - """ - Compute factorial of n using a loop. - Parameters: - n (int): Non-negative integer. - Returns: - int: Factorial of n. - """ - result = 1 - for i in range(1, n + 1): - result *= i - return result - - -def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): - """ - Calculate the hyperscore for a peptide-spectrum match. - Parameters: - n_y (int): Number of matched y-ions. - n_b (int): Number of matched b-ions. - y_ion_intensities (list): Intensities of matched y-ions. - b_ion_intensities (list): Intensities of matched b-ions. - Returns: - float: Calculated hyperscore. 
- """ - # Calculate the product of y-ion and b-ion intensities - product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1 - product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1 - - # Calculate factorial using custom function - factorial_y = factorial(n_y) - factorial_b = factorial(n_b) - - # Compute hyperscore - hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b)) - return hyperscore - - -def infer_fragment_identity(frag, allow_ion_types=["b", "y"]): - ion = frag.ion - - is_allowed = False - for allowed_ion_type in allow_ion_types: - if allowed_ion_type in ion: - is_allowed = True - break - - if not is_allowed: - return False - # Radicals - if "·" in ion: - return False - if frag.neutral_loss is not None: - return False - if frag.charge > 2: - return False - - return ion[0] - - -def get_by_fragments(annotated_spectrum): - b_intensities = [] - y_intensities = [] - for peak in annotated_spectrum: - - for fragment in peak.annotation: - - ion_type = infer_fragment_identity(fragment) - if ion_type == "b": - b_intensities.append(peak.intensity) - if ion_type == "y": - y_intensities.append(peak.intensity) - return b_intensities, y_intensities + for psm, feats in zip(psm_list, feature_dicts): + psm.rescoring_features.update(feats) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 3fdb62ad..dcb0100e 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -24,18 +24,13 @@ """ import logging -import multiprocessing -import warnings from typing import List, Optional, Union - import numpy as np -import pandas as pd + from ms2pip import process_MS2_spectra -from ms2pip.exceptions import NoMatchingSpectraFound -from ms2pip.result import ProcessingResult +from ms2rescore_rs import batch_ms2pip_features_numpy from psm_utils import PSMList -from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType @@ -180,7 +175,6 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding MS²PIP-derived features to PSMs.") - # temporarily remove spectrum from psm_utils before multiprocessing ms2pip_results = process_MS2_spectra( psms=psm_list, model=self.model, @@ -189,181 +183,29 @@ def add_features(self, psm_list: PSMList) -> None: ) self._calculate_features(psm_list, ms2pip_results) - def _calculate_features( - self, psm_list: PSMList, ms2pip_results: List[ProcessingResult] - ) -> None: - """Calculate features from all MS²PIP results and add to PSMs.""" - logger.debug("Calculating features from predicted spectra") - # Use spawn context to avoid fork memory duplication issues - # maxtasksperchild recycles workers to free accumulated memory - ctx = multiprocessing.get_context("spawn") - with ctx.Pool(int(self.processes), maxtasksperchild=500) as pool: - # Use imap, so we can use a progress bar - counts_failed = 0 - for result, features in zip( - ms2pip_results, - track( - pool.imap(self._calculate_features_single, ms2pip_results, chunksize=250), - total=len(ms2pip_results), - description="Calculating features...", - transient=True, - ), - ): - if features: - # Cannot use result.psm directly, as it is a copy from MS²PIP multiprocessing - try: - psm_list[result.psm_index]["rescoring_features"].update(features) - except (AttributeError, TypeError): - psm_list[result.psm_index]["rescoring_features"] = features - else: - counts_failed += 1 - - if counts_failed > 0: - 
logger.warning(f"Failed to calculate features for {counts_failed} PSMs") - - def _calculate_features_single(self, processing_result: ProcessingResult) -> Union[dict, None]: - """Calculate MS²PIP-based features for single PSM.""" - if ( - processing_result.observed_intensity is None - or processing_result.predicted_intensity is None - ): - return None - - # Suppress RuntimeWarnings about invalid values - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - - # Convert intensities to arrays - target_b = processing_result.predicted_intensity["b"].clip(np.log2(0.001)) - target_y = processing_result.predicted_intensity["y"].clip(np.log2(0.001)) - target_all = np.concatenate([target_b, target_y]) - prediction_b = processing_result.observed_intensity["b"].clip(np.log2(0.001)) - prediction_y = processing_result.observed_intensity["y"].clip(np.log2(0.001)) - prediction_all = np.concatenate([prediction_b, prediction_y]) - - # Prepare 'unlogged' intensity arrays - target_b_unlog = 2**target_b - 0.001 - target_y_unlog = 2**target_y - 0.001 - target_all_unlog = 2**target_all - 0.001 - prediction_b_unlog = 2**prediction_b - 0.001 - prediction_y_unlog = 2**prediction_y - 0.001 - prediction_all_unlog = 2**prediction_all - 0.001 - - # Calculate absolute differences - abs_diff_b = np.abs(target_b - prediction_b) - abs_diff_y = np.abs(target_y - prediction_y) - abs_diff_all = np.abs(target_all - prediction_all) - abs_diff_b_unlog = np.abs(target_b_unlog - prediction_b_unlog) - abs_diff_y_unlog = np.abs(target_y_unlog - prediction_y_unlog) - abs_diff_all_unlog = np.abs(target_all_unlog - prediction_all_unlog) - - # Compute features - feature_values = [ - # Features between spectra in log space - np.corrcoef(target_all, prediction_all)[0][1], # Pearson all ions - np.corrcoef(target_b, prediction_b)[0][1], # Pearson b ions - np.corrcoef(target_y, prediction_y)[0][1], # Pearson y ions - _mse(target_all, prediction_all), # MSE all ions - _mse(target_b, prediction_b), # MSE b ions - _mse(target_y, prediction_y), # MSE y ions - np.min(abs_diff_all), # min_abs_diff_norm - np.max(abs_diff_all), # max_abs_diff_norm - np.quantile(abs_diff_all, 0.25), # abs_diff_Q1_norm - np.quantile(abs_diff_all, 0.5), # abs_diff_Q2_norm - np.quantile(abs_diff_all, 0.75), # abs_diff_Q3_norm - np.mean(abs_diff_all), # mean_abs_diff_norm - np.std(abs_diff_all), # std_abs_diff_norm - np.min(abs_diff_b), # ionb_min_abs_diff_norm - np.max(abs_diff_b), # ionb_max_abs_diff_norm - np.quantile(abs_diff_b, 0.25), # ionb_abs_diff_Q1_norm - np.quantile(abs_diff_b, 0.5), # ionb_abs_diff_Q2_norm - np.quantile(abs_diff_b, 0.75), # ionb_abs_diff_Q3_norm - np.mean(abs_diff_b), # ionb_mean_abs_diff_norm - np.std(abs_diff_b), # ionb_std_abs_diff_norm - np.min(abs_diff_y), # iony_min_abs_diff_norm - np.max(abs_diff_y), # iony_max_abs_diff_norm - np.quantile(abs_diff_y, 0.25), # iony_abs_diff_Q1_norm - np.quantile(abs_diff_y, 0.5), # iony_abs_diff_Q2_norm - np.quantile(abs_diff_y, 0.75), # iony_abs_diff_Q3_norm - np.mean(abs_diff_y), # iony_mean_abs_diff_norm - np.std(abs_diff_y), # iony_std_abs_diff_norm - np.dot(target_all, prediction_all), # Dot product all ions - np.dot(target_b, prediction_b), # Dot product b ions - np.dot(target_y, prediction_y), # Dot product y ions - _cosine_similarity(target_all, prediction_all), # Cos similarity all ions - _cosine_similarity(target_b, prediction_b), # Cos similarity b ions - _cosine_similarity(target_y, prediction_y), # Cos similarity y ions - # Same features in normal space - 
np.corrcoef(target_all_unlog, prediction_all_unlog)[0][1], # Pearson all - np.corrcoef(target_b_unlog, prediction_b_unlog)[0][1], # Pearson b - np.corrcoef(target_y_unlog, prediction_y_unlog)[0][1], # Pearson y - _spearman(target_all_unlog, prediction_all_unlog), # Spearman all ions - _spearman(target_b_unlog, prediction_b_unlog), # Spearman b ions - _spearman(target_y_unlog, prediction_y_unlog), # Spearman y ions - _mse(target_all_unlog, prediction_all_unlog), # MSE all ions - _mse(target_b_unlog, prediction_b_unlog), # MSE b ions - _mse(target_y_unlog, prediction_y_unlog), # MSE y ions, - # Ion type with min absolute difference - 0 if np.min(abs_diff_b_unlog) <= np.min(abs_diff_y_unlog) else 1, - # Ion type with max absolute difference - 0 if np.max(abs_diff_b_unlog) >= np.max(abs_diff_y_unlog) else 1, - np.min(abs_diff_all_unlog), # min_abs_diff - np.max(abs_diff_all_unlog), # max_abs_diff - np.quantile(abs_diff_all_unlog, 0.25), # abs_diff_Q1 - np.quantile(abs_diff_all_unlog, 0.5), # abs_diff_Q2 - np.quantile(abs_diff_all_unlog, 0.75), # abs_diff_Q3 - np.mean(abs_diff_all_unlog), # mean_abs_diff - np.std(abs_diff_all_unlog), # std_abs_diff - np.min(abs_diff_b_unlog), # ionb_min_abs_diff - np.max(abs_diff_b_unlog), # ionb_max_abs_diff_norm - np.quantile(abs_diff_b_unlog, 0.25), # ionb_abs_diff_Q1 - np.quantile(abs_diff_b_unlog, 0.5), # ionb_abs_diff_Q2 - np.quantile(abs_diff_b_unlog, 0.75), # ionb_abs_diff_Q3 - np.mean(abs_diff_b_unlog), # ionb_mean_abs_diff - np.std(abs_diff_b_unlog), # ionb_std_abs_diff - np.min(abs_diff_y_unlog), # iony_min_abs_diff - np.max(abs_diff_y_unlog), # iony_max_abs_diff - np.quantile(abs_diff_y_unlog, 0.25), # iony_abs_diff_Q1 - np.quantile(abs_diff_y_unlog, 0.5), # iony_abs_diff_Q2 - np.quantile(abs_diff_y_unlog, 0.75), # iony_abs_diff_Q3 - np.mean(abs_diff_y_unlog), # iony_mean_abs_diff - np.std(abs_diff_y_unlog), # iony_std_abs_diff - np.dot(target_all_unlog, prediction_all_unlog), # Dot product all ions - np.dot(target_b_unlog, prediction_b_unlog), # Dot product b ions - np.dot(target_y_unlog, prediction_y_unlog), # Dot product y ions - _cosine_similarity(target_all_unlog, prediction_all_unlog), # Cos similarity all - _cosine_similarity(target_b_unlog, prediction_b_unlog), # Cos similarity b ions - _cosine_similarity(target_y_unlog, prediction_y_unlog), # Cos similarity y ions - ] - - features = dict( - zip( - self.feature_names, - [0.0 if np.isnan(ft) else ft for ft in feature_values], - ) + def _calculate_features(self, psm_list, ms2pip_results): + idx = [] + pred_b = [] + pred_y = [] + obs_b = [] + obs_y = [] + + for r in ms2pip_results: + if r.observed_intensity is None or r.predicted_intensity is None: + continue + idx.append(r.psm_index) + pred_b.append(r.predicted_intensity["b"]) + pred_y.append(r.predicted_intensity["y"]) + obs_b.append(r.observed_intensity["b"]) + obs_y.append(r.observed_intensity["y"]) + + results = batch_ms2pip_features_numpy( + idx, pred_b, pred_y, obs_b, obs_y ) - return features - - -def _spearman(x: np.ndarray, y: np.ndarray) -> float: - """Spearman rank correlation.""" - x = np.array(x) - y = np.array(y) - x_rank = pd.Series(x).rank() - y_rank = pd.Series(y).rank() - return np.corrcoef(x_rank, y_rank)[0][1] - - -def _mse(x: np.ndarray, y: np.ndarray) -> float: - """Mean squared error""" - x = np.array(x) - y = np.array(y) - return np.mean((x - y) ** 2) - - -def _cosine_similarity(x: np.ndarray, y: np.ndarray) -> float: - """Cosine similarity""" - x = np.array(x) - y = np.array(y) - return np.dot(x, y) / 
(np.linalg.norm(x, 2) * np.linalg.norm(y, 2)) + for psm_index, feats in results: + if feats: + try: + psm_list[psm_index]["rescoring_features"].update(feats) + except (AttributeError, TypeError): + psm_list[psm_index]["rescoring_features"] = feats \ No newline at end of file From a9108b9633af7f4f8ac0580692e71052933b65d3 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 10:50:45 +0100 Subject: [PATCH 47/54] reimplement deeplc feature calculation --- ms2rescore/feature_generators/deeplc.py | 381 +++++++++++++++--------- 1 file changed, 241 insertions(+), 140 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 3506d3ea..2e1db888 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -15,23 +15,23 @@ """ -import contextlib import logging -import os -from collections import defaultdict -from inspect import getfullargspec -from itertools import chain from typing import List, Union import numpy as np from psm_utils import PSMList +from deeplc.core import predict, finetune +from deeplc.calibration import SplineTransformerCalibration + from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) +# Suppress verbose onnx2torch logging +logging.getLogger("onnx2torch").setLevel(logging.WARNING) + class DeepLCFeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -41,9 +41,7 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase): def __init__( self, *args, - lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, - calibration_set: Union[str, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -55,8 +53,7 @@ def __init__( Parameters ---------- - lower_score_is_better - Whether a lower PSM score denotes a better matching PSM. Default: False + calibration_set_size: int or float Amount of best PSMs to use for DeepLC calibration. If this value is lower than the number of available PSMs, all PSMs will be used. 
(default: 0.15) @@ -73,35 +70,42 @@ def __init__( """ super().__init__(*args, **kwargs) - self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size - self.calibration_set = calibration_set - self.processes = processes self.deeplc_kwargs = kwargs or {} self._verbose = logger.getEffectiveLevel() <= logging.DEBUG - # Lazy-load DeepLC - from deeplc import DeepLC + self.model = self.deeplc_kwargs.get("model", None) - self.DeepLC = DeepLC + self.calibration = None - # Remove any kwargs that are not DeepLC arguments - self.deeplc_kwargs = { - k: v for k, v in self.deeplc_kwargs.items() if k in getfullargspec(DeepLC).args - } - self.deeplc_kwargs.update({"config_file": None}) + # Prepare DeepLC predict kwargs + self.predict_kwargs = { + k: v for k, v in self.deeplc_kwargs.items() if k in ["device", "batch_size"] + } # getfullargspec(predict).args does not work on this outer predict function + self.predict_kwargs["num_workers"] = processes - # Set default DeepLC arguments + # Prepare DeepLC finetune kwargs if "deeplc_retrain" not in self.deeplc_kwargs: self.deeplc_kwargs["deeplc_retrain"] = False - - self.deeplc_predictor = None - if "path_model" in self.deeplc_kwargs: - self.user_model = self.deeplc_kwargs.pop("path_model") - logging.debug(f"Using user-provided DeepLC model {self.user_model}.") - else: - self.user_model = None + return # skip the rest of the init if no retraining + + if self.deeplc_kwargs["deeplc_retrain"]: + self.finetune_kwargs = { + k: v + for k, v in self.deeplc_kwargs.items() + if k + in [ + "epochs", + "device", + "batch_size", + "learning_rate", + "patience", + "trainable_layers", + "validation_split", + ] + } + self.finetune_kwargs["num_workers"] = processes @property def feature_names(self) -> List[str]: @@ -118,131 +122,228 @@ def add_features(self, psm_list: PSMList) -> None: """Add DeepLC-derived features to PSMs.""" logger.info("Adding DeepLC-derived features to PSMs.") + psm_list_df = psm_list.to_dataframe() + psm_list_df = psm_list_df[ + [ + "peptidoform", + "retention_time", + "run", + "qvalue", + "is_decoy", + ] + ] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.modified_sequence) - # Get easy-access nested version of PSMList - psm_dict = psm_list.get_psm_dict() - - # Run DeepLC for each spectrum file - current_run = 1 - total_runs = sum(len(runs) for runs in psm_dict.values()) - for runs in psm_dict.values(): - # Reset DeepLC predictor for each collection of runs - self.deeplc_predictor = None - self.selected_model = None - for run, psms in runs.items(): - peptide_rt_diff_dict = defaultdict( - lambda: { - "observed_retention_time_best": np.inf, - "predicted_retention_time_best": np.inf, - "rt_diff_best": np.inf, - } - ) - logger.info( - f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." - ) + if self.deeplc_kwargs["deeplc_retrain"]: + # Filter high-confidence target PSMs once for transfer learning + target_mask = (psm_list["qvalue"] <= 0.01) & (~psm_list["is_decoy"]) + target_psms = psm_list[target_mask] + + # Determine best run for transfer learning + best_run = self._best_run_by_shared_proteoforms( + target_psms["run"], + target_psms["peptidoform"], + ) + + # Fine-tune on best run + best_run_psms = target_psms[target_psms["run"] == best_run] + logger.debug( + f"Fine-tuning DeepLC on run '{best_run}'... 
with {len(best_run_psms)} PSMs" + ) + self.model = finetune( + best_run_psms, + model=self.model, + train_kwargs=self.finetune_kwargs, + ) + + # Predict retention times for all PSMs + logger.debug("Predicting retention times with DeepLC...") + psm_list_df["predicted_retention_time"] = predict( + psm_list, model=self.model, predict_kwargs=self.predict_kwargs + ) + + # Calibrate predictions per run + for run in psm_list_df["run"].unique(): + run_df = psm_list_df[psm_list_df["run"] == run].copy() + + # Get calibration data (target PSMs only) + observed_rt_calibration, predicted_rt_calibration = self._get_calibration_data(run_df) + + # Fit calibration and transform all predictions for this run + calibration = SplineTransformerCalibration() + calibration.fit( + target=observed_rt_calibration, + source=predicted_rt_calibration, + ) - # Disable wild logging to stdout by Tensorflow, unless in debug mode - with ( - contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) - if not self._verbose - else contextlib.nullcontext() - ): - # Make new PSM list for this run (chain PSMs per spectrum to flat list) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - psm_list_calibration = self._get_calibration_psms(psm_list_run) - logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") - self.deeplc_predictor = self.DeepLC( - n_jobs=self.processes, - verbose=self._verbose, - path_model=self.selected_model or self.user_model, - **self.deeplc_kwargs, - ) - self.deeplc_predictor.calibrate_preds(psm_list_calibration) - # Still calibrate for each run, but do not try out all model options. - # Just use model that was selected based on first run - if not self.selected_model: - self.selected_model = list(self.deeplc_predictor.model.keys()) - self.deeplc_kwargs["deeplc_retrain"] = False - logger.debug( - f"Selected DeepLC model {self.selected_model} based on " - "calibration of first run. Using this model (after new " - "calibrations) for the remaining runs." 
- ) - - logger.debug("Predicting retention times...") - predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run)) - observations = psm_list_run["retention_time"] - rt_diffs_run = np.abs(predictions - observations) - - logger.debug("Adding features to PSMs...") - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "observed_retention_time": observations[i], - "predicted_retention_time": predictions[i], - "rt_diff": rt_diffs_run[i], - } - ) - peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge - if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]: - peptide_rt_diff_dict[peptide] = { - "observed_retention_time_best": observations[i], - "predicted_retention_time_best": predictions[i], - "rt_diff_best": rt_diffs_run[i], - } - for psm in psm_list_run: - psm["rescoring_features"].update( - peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] - ) - current_run += 1 - - def _get_calibration_psms(self, psm_list: PSMList): - """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[ - ~psm_list["is_decoy"] - & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]] + calibrated_rt = calibration.transform(run_df["predicted_retention_time"].values) + + # Update predictions with calibrated values + psm_list_df.loc[psm_list_df["run"] == run, "predicted_retention_time"] = calibrated_rt + + psm_list_df["observed_retention_time"] = psm_list_df["retention_time"] + psm_list_df["rt_diff"] = ( + psm_list_df["observed_retention_time"] - psm_list_df["predicted_retention_time"] + ).abs() + psm_list_df_best = psm_list_df.loc[ + psm_list_df.groupby(["run", "sequence"])["rt_diff"].idxmin() ] - if self.calibration_set_size: - n_psms = self._get_number_of_calibration_psms(psm_list_targets) - indices = np.argsort(psm_list_targets["score"]) - indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:] - return psm_list_targets[indices] - else: - identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01] - if len(identified_psms) == 0: - raise ValueError( - "No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc." - ) - elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]): - logger.warning( - " Less than 500 target PSMs with q-value <= 0.01 found for retraining. Consider turning of deeplc_retrain, as this is likely not enough data for retraining." - ) - return identified_psms - def _get_number_of_calibration_psms(self, psm_list): - """Get number of calibration PSMs given `calibration_set_size` and total number of PSMs.""" + psm_list_df = psm_list_df.merge( + psm_list_df_best[ + [ + "run", + "sequence", + "observed_retention_time", + "predicted_retention_time", + "rt_diff", + ] + ], + on=["run", "sequence"], + how="left", + suffixes=("", "_best"), + ) + + psm_list_feature_dicts = psm_list_df[self.feature_names].to_dict(orient="records") + # Add features to PSMs + logger.debug("Adding features to PSMs...") + for psm, features in zip(psm_list, psm_list_feature_dicts): + psm.rescoring_features.update(features) + + def _get_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: + """Get calibration data (observed and predicted RTs) from run dataframe. + + Only target (non-decoy) PSMs are used for calibration. 
+ + Parameters + ---------- + run_df : pd.DataFrame + Dataframe containing PSMs for a single run, with columns: + 'retention_time', 'predicted_retention_time', 'qvalue', 'is_decoy' + + Returns + ------- + tuple[np.ndarray, np.ndarray] + Observed and predicted retention times for calibration + """ + # Filter to target PSMs only + target_df = run_df[~run_df["is_decoy"]].copy() + + # Determine number of calibration PSMs if isinstance(self.calibration_set_size, float): if not 0 < self.calibration_set_size <= 1: raise ValueError( "If `calibration_set_size` is a float, it cannot be smaller than " "or equal to 0 or larger than 1." ) - else: - num_calibration_psms = round(len(psm_list) * self.calibration_set_size) + num_calibration_psms = round(len(target_df) * self.calibration_set_size) elif isinstance(self.calibration_set_size, int): - if self.calibration_set_size > len(psm_list): + if self.calibration_set_size > len(target_df): logger.warning( - f"Requested number of calibration PSMs ({self.calibration_set_size}" - f") is larger than total number of PSMs ({len(psm_list)}). Using " - "all PSMs for calibration." + f"Requested number of calibration PSMs ({self.calibration_set_size}) " + f"is larger than total number of target PSMs ({len(target_df)}). Using " + "all target PSMs for calibration." ) - num_calibration_psms = len(psm_list) + num_calibration_psms = len(target_df) else: num_calibration_psms = self.calibration_set_size else: - raise TypeError( - "Expected float or int for `calibration_set_size`. Got " - f"{type(self.calibration_set_size)} instead. " + # Use PSMs with q-value <= 0.01 + num_calibration_psms = (target_df["qvalue"] <= 0.01).sum() + + logger.debug(f"Using {num_calibration_psms} target PSMs for calibration") + + # Select calibration PSMs (assuming they are sorted by q-value) + calibration_df = target_df.head(num_calibration_psms) + + return ( + calibration_df["retention_time"].values, + calibration_df["predicted_retention_time"].values, + ) + + @staticmethod + def _best_run_by_shared_proteoforms(runs, proteoforms): + """ + Return the run whose proteoform set has the largest total overlap with all other runs: + score(run_i) = sum_{j != i} |P_i ∩ P_j| + + Tie / degenerate handling (per request): + - If no run shares anything (all scores == 0): warn and return the first run (by first appearance). + - If multiple runs are tied for best score: return the first among them (by first appearance). 
+ """ + runs = np.asarray(runs) + proteoforms = np.asarray(proteoforms) + if runs.shape[0] != proteoforms.shape[0]: + raise ValueError("runs and proteoforms must have the same length.") + if runs.size == 0: + raise ValueError("Empty input: runs/proteoforms must contain at least one entry.") + + # Preserve run order by first appearance + run_to_idx = {} + run_order = [] + run_idx = np.empty(runs.shape[0], dtype=np.int64) + for i, r in enumerate(runs): + if r not in run_to_idx: + run_to_idx[r] = len(run_order) + run_order.append(r) + run_idx[i] = run_to_idx[r] + + # Fast path: sparse incidence matrix + try: + from scipy.sparse import coo_matrix + except ImportError: + # Fallback (slower): set-based + run_sets = {} + for r, p in zip(runs, proteoforms): + run_sets.setdefault(r, set()).add(p) + + scores = [] + for r in run_order: + Pi = run_sets[r] + s = 0 + for r2 in run_order: + if r2 is r: + continue + s += len(Pi & run_sets[r2]) + scores.append(s) + + scores = np.asarray(scores, dtype=np.int64) + max_score = scores.max() + best_candidates = np.flatnonzero(scores == max_score) + + if max_score == 0: + logger.warning( + "No runs share any identified proteoforms with other runs; transfer learning might not be as effective." + ) + return run_order[0] + + return run_order[int(best_candidates[0])] + + # Encode proteoforms (order does not matter for correctness) + _, prot_idx = np.unique(proteoforms, return_inverse=True) + + # De-duplicate (run, proteoform) pairs + pairs = np.unique(np.stack([run_idx, prot_idx], axis=1), axis=0) + r = pairs[:, 0] + p = pairs[:, 1] + + M = coo_matrix( + (np.ones(len(r), dtype=np.int8), (r, p)), + shape=(len(run_order), int(prot_idx.max()) + 1), + ).tocsr() + + # Overlap matrix O[i,j] = |P_i ∩ P_j| + overlap = (M @ M.T).toarray() + np.fill_diagonal(overlap, 0) + + scores = overlap.sum(axis=1).astype(np.int64) + max_score = int(scores.max()) + best_candidates = np.flatnonzero(scores == max_score) + + if max_score == 0: + logger.warning( + "No runs share any identified proteoforms with other runs; transfer learning might not be as effective." 
) - logger.debug(f"Using {num_calibration_psms} PSMs for calibration") - return num_calibration_psms + return run_order[0] + + return run_order[int(best_candidates[0])] From 698dd5e2a1f19551356ab18ca789e18a9325163a Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 12 Jan 2026 20:17:59 +0100 Subject: [PATCH 48/54] change logging --- ms2rescore/feature_generators/deeplc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 2e1db888..b14094ef 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -23,7 +23,6 @@ from deeplc.core import predict, finetune from deeplc.calibration import SplineTransformerCalibration - from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType @@ -157,12 +156,13 @@ def add_features(self, psm_list: PSMList) -> None: ) # Predict retention times for all PSMs - logger.debug("Predicting retention times with DeepLC...") + logger.info("Predicting retention times with DeepLC...") psm_list_df["predicted_retention_time"] = predict( psm_list, model=self.model, predict_kwargs=self.predict_kwargs ) # Calibrate predictions per run + logger.info("Calibrating predicted retention times per run...") for run in psm_list_df["run"].unique(): run_df = psm_list_df[psm_list_df["run"] == run].copy() @@ -228,6 +228,7 @@ def _get_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: """ # Filter to target PSMs only target_df = run_df[~run_df["is_decoy"]].copy() + target_df = target_df.sort_values("qvalue", ascending=True) # Determine number of calibration PSMs if isinstance(self.calibration_set_size, float): From 862f9be27bcbc13d65737de9ef1f16ae6674f208 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:41:43 +0100 Subject: [PATCH 49/54] fix im2deep fg --- ms2rescore/feature_generators/im2deep.py | 162 +++++++++++++---------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 12094306..23800602 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -9,21 +9,17 @@ """ import logging -import os -from inspect import getfullargspec -from typing import List +from typing import List, Union import numpy as np -import pandas as pd +from psm_utils import PSMList +from im2deep.core import predict +from im2deep.calibration import LinearCCSCalibration, get_default_reference from im2deep.utils import im2ccs -from im2deep.im2deep import predict_ccs, REFERENCE_DATASET_PATH -from im2deep.calibrate import calculate_ccs_shift -from psm_utils import PSMList, Peptidoform from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.parse_spectra import MSDataType -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" logger = logging.getLogger(__name__) @@ -34,6 +30,8 @@ class IM2DeepFeatureGenerator(FeatureGeneratorBase): def __init__( self, + multi: bool = False, + calibration_set_size: Union[int, float] = None, *args, processes: int = 1, **kwargs, @@ -51,14 +49,24 @@ def __init__( """ super().__init__(*args, **kwargs) - self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + self.multi = multi + if self.multi: + raise NotImplementedError( + "Multi-IM mode is not yet implemented for IM2DeepFeatureGenerator." 
+ ) + + self.calibration_set_size = calibration_set_size - # Remove any kwargs that are not IM2Deep arguments self.im2deep_kwargs = kwargs or {} - self.im2deep_kwargs = { - k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args + self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + + self.model = self.im2deep_kwargs.get("model", None) + + # Prepare IM2Deep predict kwargs + self.predict_kwargs = { + k: v for k, v in self.im2deep_kwargs.items() if k in ["device", "batch_size"] } - self.im2deep_kwargs["n_jobs"] = processes + self.predict_kwargs["num_workers"] = processes @property def feature_names(self) -> List[str]: @@ -72,11 +80,10 @@ def feature_names(self) -> List[str]: def add_features(self, psm_list: PSMList) -> None: """Add IM2Deep-derived features to PSMs""" + logger.info("Adding IM2Deep-derived features to PSMs") - # Convert ion mobility to CCS psm_list_df = psm_list.to_dataframe() - # Remove unnecessary columns to save memory #TODO: optimize further? psm_list_df = psm_list_df[ [ "peptidoform", @@ -89,8 +96,8 @@ def add_features(self, psm_list: PSMList) -> None: ] ] + psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.modified_sequence) psm_list_df["charge"] = [pep.precursor_charge for pep in psm_list_df["peptidoform"]] - psm_list_df["sequence"] = psm_list_df["peptidoform"].apply(lambda x: x.proforma) psm_list_df["ccs_observed_im2deep"] = im2ccs( psm_list_df["ion_mobility"], psm_list_df["precursor_mz"], @@ -98,39 +105,39 @@ def add_features(self, psm_list: PSMList) -> None: ) # Make predictions with IM2Deep - logger.debug("Predicting CCS values...") - predictions = predict_ccs(psm_list, write_output=False, **self.im2deep_kwargs) - psm_list_df["ccs_predicted"] = predictions - - # Create dataframe with high confidence hits for calibration - logger.debug("Calibrating IM2Deep...") - reference_dataset = pd.read_csv(REFERENCE_DATASET_PATH) - reference_dataset["charge"] = reference_dataset["peptidoform"].apply( - lambda x: int(x.split("/")[1]) if isinstance(x, str) else x.precursor_charge + logger.info("Predicting CCS values with IM2Deep...") + psm_list_df["predicted_CCS_uncalibrated"] = predict( + psm_list, model=self.model, predict_kwargs=self.predict_kwargs ) - logger.debug(f"Loaded reference dataset with {len(reference_dataset)} entries") - run_shift_dict = {} + # getting reference CCS values for calibration + source_dataframe = get_default_reference(multi=self.multi) + + # Create dataframe with high confidence hits for calibration + logger.info("Calibrating predicted CCS values per run...") for run in psm_list_df["run"].unique(): - cal_run_psm_df = self.make_calibration_df(psm_list_df[psm_list_df["run"] == run]) - # Rename for calculate_ccs_shift compatibility - cal_run_psm_df = cal_run_psm_df.rename( - columns={"ccs_observed_im2deep": "ccs_observed"} + run_df = psm_list_df[psm_list_df["run"] == run].copy() + + calibration_df = self._get_im_calibration_data(run_df) + + calibration = LinearCCSCalibration() + calibration.fit( + psm_df_target=calibration_df, + psm_df_source=source_dataframe, + ) + + calibrated_im = calibration.transform( + run_df[["peptidoform", "predicted_CCS_uncalibrated"]] ) - shift = calculate_ccs_shift( - cal_df=cal_run_psm_df, reference_dataset=reference_dataset, per_charge=True + + # Update predictions with calibrated values + psm_list_df.loc[psm_list_df["run"] == run, "predicted_CCS_uncalibrated"] = ( + calibrated_im ) - run_shift_dict[run] = shift - shift_df = pd.DataFrame.from_dict(run_shift_dict, 
orient="index").stack().reset_index() - shift_df.columns = ["run", "charge", "ccs_shift"] # Apply calibration shifts - psm_list_df = psm_list_df.merge(shift_df, on=["run", "charge"], how="left") - psm_list_df["ccs_shift"] = psm_list_df["ccs_shift"].fillna( - 0 - ) # Fill missing shifts with 0 (no calibration for that run/charge) #TODO check with ROBBE - psm_list_df["ccs_predicted_im2deep"] = ( - psm_list_df["ccs_predicted"] + psm_list_df["ccs_shift"] + psm_list_df.rename( + columns={"predicted_CCS_uncalibrated": "ccs_predicted_im2deep"}, inplace=True ) psm_list_df["ccs_error_im2deep"] = ( psm_list_df["ccs_predicted_im2deep"] - psm_list_df["ccs_observed_im2deep"] @@ -146,39 +153,52 @@ def add_features(self, psm_list: PSMList) -> None: for psm, features in zip(psm_list, psm_list_feature_dicts): psm.rescoring_features.update(features) - @staticmethod - def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame: - """ - Make dataframe for calibration of IM2Deep predictions. + def _get_im_calibration_data(self, run_df) -> tuple[np.ndarray, np.ndarray]: + """Get calibration data (observed and predicted CCS values) from run dataframe. + + Only target (non-decoy) PSMs are used for calibration. Parameters ---------- - psm_list_df - DataFrame with PSMs. - threshold - Percentage of highest scoring identified target PSMs to use for calibration, - default 0.25. - + run_df : pd.DataFrame + Dataframe containing PSMs for a single run, with columns: + 'ccs_observed_im2deep', 'ccs_predicted_im2deep', 'qvalue', 'is_decoy' Returns ------- - pd.DataFrame - DataFrame with high confidence hits for calibration. - + tuple[np.ndarray, np.ndarray] + Observed and predicted CCS values for calibration """ - identified_psms = psm_list_df[ - (psm_list_df["qvalue"] < 0.01) - & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep - & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]]) - ] - calibration_psms = identified_psms[ - identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) - ] - if isinstance(calibration_psms["peptidoform"].iloc[0], Peptidoform): - calibration_psms["peptidoform"] = calibration_psms["peptidoform"].apply( - lambda x: x.proforma - ) - logger.debug( - f"Number of high confidence hits for calculating shift: {len(calibration_psms)}" + # Filter to target PSMs only + target_df = run_df[~run_df["is_decoy"]].copy() + target_df = target_df.sort_values("qvalue", ascending=True) + + # Determine number of calibration PSMs + if isinstance(self.calibration_set_size, float): + if not 0 < self.calibration_set_size <= 1: + raise ValueError( + "If `calibration_set_size` is a float, it cannot be smaller than " + "or equal to 0 or larger than 1." + ) + num_calibration_psms = round(len(target_df) * self.calibration_set_size) + elif isinstance(self.calibration_set_size, int): + if self.calibration_set_size > len(target_df): + logger.warning( + f"Requested number of calibration PSMs ({self.calibration_set_size}) " + f"is larger than total number of target PSMs ({len(target_df)}). Using " + "all target PSMs for calibration." 
+ ) + num_calibration_psms = len(target_df) + else: + num_calibration_psms = self.calibration_set_size + else: + # Use PSMs with q-value <= 0.01 + num_calibration_psms = (target_df["qvalue"] <= 0.01).sum() + + logger.debug(f"Using {num_calibration_psms} target PSMs for calibration") + + # Select calibration PSMs (assuming they are sorted by q-value) + calibration_df = target_df.head(num_calibration_psms) + + return calibration_df[["peptidoform", "ccs_observed_im2deep"]].rename( + columns={"ccs_observed_im2deep": "CCS"} ) - return calibration_psms From 889b42d93e260765f912b054f83411b2d45cacae Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:42:41 +0100 Subject: [PATCH 50/54] add support for continue runs and writing intermediate file on error to continue from --- ms2rescore/core.py | 35 +++++++++++++++++++++++++++++------ ms2rescore/parse_psms.py | 2 +- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 4384c6f7..8b3c64ef 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -63,6 +63,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.debug( f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) + # ckeck if all features are already present + for fgen_name, fgen_config in list(config["feature_generators"].items()): + # conf = config.copy() + # conf.update(fgen_config) + fgen_features = FEATURE_GENERATORS[fgen_name]().feature_names + if set(fgen_features).issubset(psm_list_feature_names): + logger.debug( + f"Skipping feature generator {fgen_name} because all features are already " + "present in the PSM file." + ) + feature_names[fgen_name] = set(fgen_features) + feature_names["psm_file"] = psm_list_feature_names - set(fgen_features) + del config["feature_generators"][fgen_name] # Add missing precursor info from spectrum file if needed required_ms_data = { @@ -93,9 +106,17 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: "files or disable the feature generator." ) continue - - # Add features - fgen.add_features(psm_list) + try: + fgen.add_features(psm_list) + except (Exception, KeyboardInterrupt) as e: + logger.error( + f"Error while adding features from {fgen_name}: {e}, writing intermediary output..." 
+ ) + # Write intermediate TSV + psm_utils.io.write_file( + psm_list, output_file_root + ".intermediate.psms.tsv", filetype="tsv" + ) + raise e logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) @@ -168,10 +189,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: protein_kwargs=protein_kwargs, **config["rescoring_engine"]["mokapot"], ) - except exceptions.RescoringError as e: + except (Exception, KeyboardInterrupt) as e: # Write output - logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...") - psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") + logger.info(f"Writing intermediary output to {output_file_root}.intermediate.psms.tsv...") + psm_utils.io.write_file( + psm_list, output_file_root + ".intermediate.psms.tsv", filetype="tsv" + ) # Reraise exception raise e diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 8a61539c..c12f2f07 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -85,7 +85,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: { "before_rescoring_score": psm.score, "before_rescoring_qvalue": psm.qvalue, - "before_rescoring_pep": psm.pep, + "before_rescoring_pep": psm.pep if psm.pep is not None else float("nan"), "before_rescoring_rank": psm.rank, } ) From ba930b86881044a393637b6c149dc6c503b501ee Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:42:55 +0100 Subject: [PATCH 51/54] changes to default features sets instead of based on charge --- ms2rescore/feature_generators/basic.py | 48 ++++++++++++++------------ 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index df005167..3e9aa374 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -31,13 +31,24 @@ def __init__(self, *args, **kwargs) -> None: """ super().__init__(*args, **kwargs) - self._feature_names = None @property def feature_names(self) -> List[str]: - if self._feature_names is None: - raise ValueError("Feature names have not been set yet. 
First run `add_features`.") - return self._feature_names + return [ + "charge_n", + "charge_1", + "charge_2", + "charge_3", + "charge_4", + "charge_5", + "charge_6", + "abs_ms1_error_ppm", + "search_engine_score", + "theoretical_mass", + "experimental_mass", + "mass_error", + "pep_len", + ] def add_features(self, psm_list: PSMList) -> None: """ @@ -51,8 +62,6 @@ def add_features(self, psm_list: PSMList) -> None: """ logger.info("Adding basic features to PSMs.") - self._feature_names = [] # Reset feature names - charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] @@ -64,24 +73,16 @@ def add_features(self, psm_list: PSMList) -> None: if has_charge: charge_n = charge_states - charge_one_hot, one_hot_names = _one_hot_encode_charge(charge_states) - self._feature_names.extend(["charge_n"] + one_hot_names) + charge_one_hot, _ = _one_hot_encode_charge(charge_states) if has_mz: # Charge also required for theoretical m/z theo_mz = np.array([psm.peptidoform.theoretical_mz for psm in psm_list]) abs_ms1_error_ppm = np.abs((precursor_mzs - theo_mz) / theo_mz * 10**6) - self._feature_names.append("abs_ms1_error_ppm") - - if has_score: - self._feature_names.append("search_engine_score") if has_mz and has_charge: experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) mass_error = experimental_mass - theoretical_mass - self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"]) - - self._feature_names.append("pep_len") for i, psm in enumerate(psm_list): psm.rescoring_features.update( @@ -101,15 +102,18 @@ def add_features(self, psm_list: PSMList) -> None: def _one_hot_encode_charge( charge_states: np.ndarray, ) -> Tuple[Iterable[Dict[str, int]], List[str]]: - """One-hot encode charge states.""" + """One-hot encode charge states with fixed range 1-6.""" n_entries = len(charge_states) - min_charge = np.min(charge_states) - max_charge = np.max(charge_states) + heading = [f"charge_{i}" for i in range(1, 7)] - mask = np.zeros((n_entries, max_charge - min_charge + 1), dtype=bool) - mask[np.arange(n_entries), charge_states - min_charge] = 1 - one_hot = mask.view("i1") + # Create mask for charges 1-6 + mask = np.zeros((n_entries, 6), dtype=bool) + + # Set the appropriate charge position to 1 for each entry + for i, charge in enumerate(charge_states): + if charge is not None and 1 <= charge <= 6: + mask[i, int(charge) - 1] = 1 - heading = [f"charge_{i}" for i in range(min_charge, max_charge + 1)] + one_hot = mask.view("i1") return [dict(zip(heading, row)) for row in one_hot], heading From a3875da596c570c0b122b92d00cd503784b4e92b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:43:00 +0100 Subject: [PATCH 52/54] minor changes --- ms2rescore/feature_generators/ms2pip.py | 17 +++++++---------- pyproject.toml | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index dcb0100e..0cc6fd52 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -24,8 +24,7 @@ """ import logging -from typing import List, Optional, Union -import numpy as np +from typing import Optional from ms2pip import process_MS2_spectra from ms2rescore_rs import batch_ms2pip_features_numpy @@ -51,7 +50,7 @@ def __init__( spectrum_path: Optional[str] = None, spectrum_id_pattern: str 
= "(.*)", model_dir: Optional[str] = None, - processes: 1, + processes: int = 1, **kwargs, ) -> None: """ @@ -187,8 +186,8 @@ def _calculate_features(self, psm_list, ms2pip_results): idx = [] pred_b = [] pred_y = [] - obs_b = [] - obs_y = [] + obs_b = [] + obs_y = [] for r in ms2pip_results: if r.observed_intensity is None or r.predicted_intensity is None: @@ -198,14 +197,12 @@ def _calculate_features(self, psm_list, ms2pip_results): pred_y.append(r.predicted_intensity["y"]) obs_b.append(r.observed_intensity["b"]) obs_y.append(r.observed_intensity["y"]) - - results = batch_ms2pip_features_numpy( - idx, pred_b, pred_y, obs_b, obs_y - ) + + results = batch_ms2pip_features_numpy(idx, pred_b, pred_y, obs_b, obs_y) for psm_index, feats in results: if feats: try: psm_list[psm_index]["rescoring_features"].update(feats) except (AttributeError, TypeError): - psm_list[psm_index]["rescoring_features"] = feats \ No newline at end of file + psm_list[psm_index]["rescoring_features"] = feats diff --git a/pyproject.toml b/pyproject.toml index f36cd56b..059fb300 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,6 @@ dependencies = [ "click>=7", "customtkinter>=5,<6", "deeplc>=3.1", - "deeplcretrainer", "im2deep>=1.1", "jinja2>=3", "lxml>=4.5", From 8e793a00e62277050ef707e64634a4659af9428e Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 15:53:59 +0100 Subject: [PATCH 53/54] conditional import of mumble --- ms2rescore/core.py | 3 ++- ms2rescore/feature_generators/deeplc.py | 1 + ms2rescore/parse_psms.py | 11 ++++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 8b3c64ef..e72d1b05 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -17,7 +17,6 @@ add_peptide_confidence, add_psm_confidence, ) -from ms2rescore.utils import filter_mumble_psms logger = logging.getLogger(__name__) @@ -140,6 +139,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: psm_list = psm_list[psms_with_features] if "mumble" in config["psm_generator"]: + from ms2rescore.utils import filter_mumble_psms + # Remove PSMs where matched_ions_pct drops 25% below the original hit psm_list = filter_mumble_psms(psm_list, threshold=0.75) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index b14094ef..4c2f0b49 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -272,6 +272,7 @@ def _best_run_by_shared_proteoforms(runs, proteoforms): - If no run shares anything (all scores == 0): warn and return the first run (by first appearance). - If multiple runs are tied for best score: return the first among them (by first appearance). 
""" + logger.debug("Determining best run for transfer learning based on shared proteoforms.") runs = np.asarray(runs) proteoforms = np.asarray(proteoforms) if runs.shape[0] != proteoforms.shape[0]: diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index c12f2f07..0366d6b8 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -4,7 +4,6 @@ import numpy as np import psm_utils.io -from mumble import PSMHandler from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -121,6 +120,12 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: # Addition of Modifications for mass shifts in the PSMs with Mumble if "mumble" in config["psm_generator"]: + try: + from mumble import PSMHandler + except ImportError: + raise MS2RescoreConfigurationError( + "mumble is not installed. Please install it with: pip install ms2rescore[mumble]" + ) logger.debug("Applying modifications for mass shifts using Mumble...") # set inlcude original psm to True and include decoy psm to true config["psm_generator"]["mumble"]["include_original_psm"] = True @@ -150,7 +155,7 @@ def _read_psms(config, psm_list): psm_list = [] for current_file, psm_file in enumerate(config["psm_file"]): logger.info( - f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..." + f"Reading PSMs from PSM file ({current_file + 1}/{total_files}): '{psm_file}'..." ) psm_list.extend( psm_utils.io.read_file( @@ -216,7 +221,7 @@ def _parse_values_from_spectrum_id( ["retention_time", "ion_mobility"], ): if pattern: - logger.debug(f"Parsing {label} from spectrum_id with regex pattern " f"{pattern}") + logger.debug(f"Parsing {label} from spectrum_id with regex pattern {pattern}") try: pattern = re.compile(pattern) psm_list[key] = [ From 07b96e73e288ed7cead7b18a63af141e66b24fa2 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 14 Jan 2026 18:59:35 +0100 Subject: [PATCH 54/54] add tracking to spectrum file reading --- ms2rescore/parse_spectra.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index d897c40c..3b8df3e8 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -7,6 +7,7 @@ import numpy as np from ms2rescore_rs import Precursor, get_ms2_spectra, MS2Spectrum +from rich.progress import track from psm_utils import PSMList @@ -208,7 +209,7 @@ def _add_precursor_values( if spectrum_id_pattern is None: spectrum_id_pattern = r"^(.*)$" # Match entire identifier if no pattern provided - for run_name in set(psm_list["run"]): + for run_name in track(set(psm_list["run"])): run_mask = psm_list["run"] == run_name psm_list_run = psm_list[run_mask] spectrum_file = infer_spectrum_path(spectrum_path, run_name)