diff --git a/.gitignore b/.gitignore index 70cf3b4..5f582cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ -tests/test_data -tests/test_data/* -modification_cache/* +mumble/package_data/modifications_cache.pkl # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 6ea6311..4bd41a5 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,87 @@ pip install mumble ``` -### Basic Usage +## Usage -Here's a quick example of how to use the PSM Modification Handler for single PSMs: +### Command-Line Interface (CLI) Usage + +Mumble provides a command-line interface to modify PSMs based on mass shifts, as well as several parameters for customization. You can use the `mumble` command to interact with the tool. + +#### Basic Command Syntax + +To run the CLI, use the following command: + +```bash +mumble [OPTIONS] INPUT_FILE +``` + +Where `INPUT_FILE` is the path to the input file containing the PSM data. + +#### Parameters: + +Here are the available options you can pass when running the command: + +- **`--psm-list`**: (required) Path to the input file containing the PSM data. Must be provided if not already set via arguments. +- **`--modification-file`**: Path to a restriction list of modifications to use from Unimod. Defaults to `default_ptm_list.tsv` included with the package. +- **`--psm-file-type`**: Type of the input file to read with PSM_utils (e.g., `mzid`, `tsv`). Default is "infer". +- **`--aa-combinations`**: Number of amino acid combinations to add as modification. Requires a `fasta_file`. Default is `0`. +- **`--fasta-file`**: Path to a fasta file (for use with `aa_combinations`). +- **`--mass-error`**: Mass error for the mass shift, default is `0.02`. +- **`--output-file`**: Path to the output file to write modified PSMs. +- **`--filetype-write`**: Type of the output file to write with PSM_utils (e.g., `tsv`, `csv`). Default is `tsv`. 
+- **`--include-decoy-psm`**: Flag to parse modifications for decoys in the modified PSM list. +- **`--include-original-psm`**: Flag to keep the original PSMs in the modified PSM list. +- **`--combination-length`**: Maximum number of modifications per combination. All lower numbers will be included as well. Default is `1`. +- **`--exclude-mutations`**: If set, modifications with the classification 'AA substitution' will be excluded. +- **`--config-file`**: Path to a config file for additional configuration parameters (e.g., custom modification sets, advanced settings). +- **`--log-level`**: Set the logging level. Options: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. Default is `INFO`. +- **`--clear-cache`**: Remove the modification cache file and exit early. +- **`--all-unimod-modifications`**: Use all available modifications from Unimod instead of a subset. + +#### Examples: + +1. **Modify a single PSM**: +```bash +mumble --psm-list "path/to/psm_file.mzid" --mass-error 0.02 --output-file "modified_psms.tsv" +``` + +2. **Modify a list of PSMs with custom configurations**: +```bash +mumble --psm-list "path/to/psm_file.mzid" --fasta-file "path/to/proteins.fasta" --aa-combinations 5 --config-file "path/to/config_file.json" +``` + +3. **Clear the cache and exit**: +```bash +mumble --clear-cache +``` + +4. **Using a custom modification file**: +```bash +mumble --psm-list "path/to/psm_file.mzid" --modification-file "path/to/custom_ptm_list.tsv" +``` + +#### Config file usage + +You can also use a configuration file to specify options that will be loaded automatically when running the command. This allows you to store commonly used parameters without needing to pass them every time. 
+ +Example configuration file (`config_file.json`): + +```json +{"mass_error" : 0.05, +"aa_combinations" : 2, +"psm_file_type" : "mzid", +"output_file" : "output.tsv" +} +``` + +You can then specify the path to this file using the `--config-file` option: + +```bash +mumble --config-file "path/to/config_file.json" +``` + +### Python API +Here's a quick example of how to use the PSM Modification Handler through the Python API for single PSMs: ```python >>> from mumble import PSMHandler @@ -59,7 +137,7 @@ Here's a quick example of how to use the PSM Modification Handler for single PSM # ) # ] ``` -Here's a quick example of how to use the PSM Modification Handler for PSM lists: +Here's a quick example of how to use the PSM Modification Handler through the Python API for PSM lists: ```python >>> # Or load a PSM list (from a file or PSMList object) >>> psm_list = psm_handler.parse_psm_list("path/to/psm_file.mzid", psm_file_type="mzid") @@ -71,6 +149,7 @@ Here's a quick example of how to use the PSM Modification Handler for PSM lists: >>> psm_handler.write_modified_psm_list(modified_psm_list, output_file="modified_psms.tsv", psm_file_type="tsv") ``` For more information on PSM objects and PSM lists visit [psm_utils](https://github.com/compomics/psm_utils) + ## Testing The project includes unit tests using `pytest` to ensure code reliability. 
diff --git a/mumble/__init__.py b/mumble/__init__.py index caa3a3d..f0f7f79 100644 --- a/mumble/__init__.py +++ b/mumble/__init__.py @@ -1,4 +1,4 @@ __version__ = "0.2.0" -__all__ = ["PSMHandler"] +__all__ = ["PSMHandler", "remove_modification_cache"] -from mumble.mumble import PSMHandler +from mumble.mumble import PSMHandler, remove_modification_cache diff --git a/mumble/__main__.py b/mumble/__main__.py index a01e6d9..45c30d4 100644 --- a/mumble/__main__.py +++ b/mumble/__main__.py @@ -1,10 +1,10 @@ import click import logging - +import sys +import importlib from rich.logging import RichHandler -from mumble import PSMHandler - +from mumble import PSMHandler, remove_modification_cache # setup logging logging.basicConfig( @@ -20,9 +20,11 @@ "type": click.Path(exists=True), "help": "Path to the input file.", }, - "unimod_modification_file": { + "modification_file": { "type": click.Path(exists=True), "help": "Restriction list of modifications to use from Unimod.", + "default": str(importlib.resources.files("mumble.package_data") / "default_ptm_list.tsv"), + "show_default": True, }, "psm_file_type": { "type": click.STRING, @@ -57,13 +59,13 @@ "default": "tsv", "show_default": True, }, - "generate_modified_decoys": { + "include_decoy_psm": { "is_flag": True, "help": "Parse modifications for decoys in modified PSMlist", "default": False, "show_default": True, }, - "keep_original": { + "include_original_psm": { "is_flag": True, "help": "Keep the original PSMs in the modified PSMlist", "default": False, @@ -86,15 +88,43 @@ "help": "Path to a config file", "default": None, }, + "log_level": { + "type": click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), + "default": "INFO", + "help": "Set the logging level", + "show_default": True, + }, + "all_unimod_modifications": { + "is_flag": True, + "default": False, + "help": "Instead of using a subset of modifications from Unimod, use all available modifications.", + "show_default": True, + }, } @click.command("cli", 
context_settings={"show_default": True}) -@click.argument("input_file", type=click.Path(exists=True), default=None) -def main(**kwargs): +@click.argument("input_file", type=click.Path(exists=True), default=None, required=False) +@click.option( + "--clear-cache/--no-clear-cache", + is_flag=True, + default=False, + help="Remove the modification cache file and exit early.", +) +def main(clear_cache, **kwargs): """ Finding the perfect match for your mass shift. """ + # if the user just wants to clear the cache, do it and quit + if clear_cache: + remove_modification_cache() + logging.info("Exiting Mumble. You will find your match another time.") + sys.exit(0) + + # Set the logging level based on the CLI option + log_level = kwargs.get("log_level", "INFO").upper() + logging.getLogger().setLevel(log_level) + ctx = click.get_current_context() # Extract CLI-provided parameters diff --git a/mumble/mumble.py b/mumble/mumble.py index 73e7831..7bdf02e 100644 --- a/mumble/mumble.py +++ b/mumble/mumble.py @@ -1,4 +1,5 @@ from copy import deepcopy +import importlib.resources import logging import itertools import os @@ -6,6 +7,9 @@ from collections import namedtuple from pathlib import Path from functools import lru_cache +import hashlib +import importlib +import warnings import pandas as pd import pickle @@ -16,10 +20,16 @@ from pyteomics.mass import std_aa_mass, unimod from pyteomics.fasta import IndexedFASTA from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn +from rich.pretty import pretty_repr +from sqlalchemy import exc + # Add a logger logger = logging.getLogger(__name__) +# suppress warnings from sqlalchemy +warnings.filterwarnings("ignore", category=exc.SAWarning, message=".*will copy column.*") + class PSMHandler: """Class that contains all information about the input file""" @@ -42,7 +52,7 @@ def __init__(self, config_file: str = None, **kwargs): fasta_file=self.params["fasta_file"], 
combination_length=self.params["combination_length"], exclude_mutations=self.params["exclude_mutations"], - unimod_modification_file=self.params["unimod_modification_file"], + modification_file=self.params["modification_file"], ) self.psm_file_name = None @@ -65,21 +75,25 @@ def _load_parameters(self, overrides: dict) -> dict: "psm_list": None, "output_file": None, "write_filetype": "tsv", - "keep_original": False, - "generate_modified_decoys": False, + "include_original_psm": False, + "include_decoy_psm": False, "psm_file_type": "infer", - "unimod_modification_file": None, + "modification_file": str( + importlib.resources.files("mumble.package_data") / "default_ptm_list.tsv" + ), "modification_mapping": {}, + "all_unimod_modifications": False, } - # Use a single loop to consolidate parameters params = { key: overrides.get( key, self.config_loader.get(key, default) if self.config_loader else default ) for key, default in keys_with_defaults.items() } - logger.info(f"Mumble config: {params}") + if params["all_unimod_modifications"]: + params["modification_file"] = False + logger.info(f"Mumble config: {pretty_repr(params)}") return params @@ -195,20 +209,20 @@ def _create_new_psm(psm, new_peptidoform) -> PSM: copy_psm.peptidoform = new_peptidoform return copy_psm - def _get_modified_peptidoforms(self, psm, keep_original=False) -> list: + def _get_modified_peptidoforms(self, psm, include_original_psm=False) -> list: """ Get modified peptidoforms derived from a single PSM. Args: psm (psm_utils.PSM): Original PSM object. - keep_original (bool, optional): Whether to keep the original PSM alongside modified ones. Defaults to False. + include_original_psm (bool, optional): Whether to keep the original PSM alongside modified ones. Defaults to False. Returns: list: List of modified PSMs, or None if no modifications were applied. 
""" modified_peptidoforms = [] - if keep_original: + if include_original_psm: psm["metadata"]["original_psm"] = True modified_peptidoforms.append(psm) @@ -228,26 +242,28 @@ def _get_modified_peptidoforms(self, psm, keep_original=False) -> list: return modified_peptidoforms - def get_modified_peptidoforms_list(self, psm, keep_original=False) -> PSMList: + def get_modified_peptidoforms_list(self, psm, include_original_psm=False) -> PSMList: """ Get modified peptidoforms derived from 1 PSM in a PSMList. Args: psm (psm_utils.PSM): PSM object - keep_original (bool, optional): Keep the original PSM. Defaults to False. + include_original_psm (bool, optional): Keep the original PSM. Defaults to False. return: psm_utils.PSMList: PSMList object """ - modified_peptidoforms = self._get_modified_peptidoforms(psm, keep_original=keep_original) + modified_peptidoforms = self._get_modified_peptidoforms( + psm, include_original_psm=include_original_psm + ) return PSMList(psm_list=modified_peptidoforms) def add_modified_psms( self, psm_list=None, psm_file_type=None, - generate_modified_decoys=None, - keep_original=None, + include_decoy_psm=None, + include_original_psm=None, ) -> PSMList: """ Add modified PSMs to a PSMList based on open modification searches. @@ -255,8 +271,8 @@ def add_modified_psms( Args: psm_list (str, list, or PSMList): Path to a PSM file, list of PSMs, or a PSMList object. psm_file_type (str, optional): Type of PSM file to read, inferred automatically if not provided. Defaults to "infer". - generate_modified_decoys (bool, optional): Whether to generate decoys for the modified PSMs. Defaults to False. - keep_original (bool, optional): Whether to keep the original unmodified PSMs. Defaults to False. + include_decoy_psm (bool, optional): Whether to generate decoys for the modified PSMs. Defaults to False. + include_original_psm (bool, optional): Whether to keep the original unmodified PSMs. Defaults to False. 
Returns: psm_utils.PSMList: A new PSMList object containing the modified PSMs. @@ -266,15 +282,15 @@ def add_modified_psms( pass else: raise ValueError("No PSM list provided") - if not generate_modified_decoys: - generate_modified_decoys = self.params["generate_modified_decoys"] - if not keep_original: - keep_original = self.params["keep_original"] + if not include_decoy_psm: + include_decoy_psm = self.params["include_decoy_psm"] + if not include_original_psm: + include_original_psm = self.params["include_original_psm"] if not psm_file_type: psm_file_type = self.params["psm_file_type"] logger.info( - f"Adding modified PSMs to PSMlist {'WITH' if keep_original else 'WITHOUT'} originals, {'INCLUDING' if generate_modified_decoys else 'EXCLUDING'} modfied decoys" + f"Adding modified PSMs to PSMlist {'WITH' if include_original_psm else 'WITHOUT'} originals, {'INCLUDING' if include_decoy_psm else 'EXCLUDING'} modfied decoys" ) parsed_psm_list = self._parse_psm_list( @@ -295,12 +311,16 @@ def add_modified_psms( task = progress.add_task("Processing PSMs...", total=len(parsed_psm_list)) for psm in parsed_psm_list: - if (psm.is_decoy) & (not generate_modified_decoys): + if (psm.is_decoy) & (not include_decoy_psm): progress.update(task, advance=1) continue - new_psms = self._get_modified_peptidoforms(psm, keep_original=keep_original) + new_psms = self._get_modified_peptidoforms( + psm, include_original_psm=include_original_psm + ) if new_psms: - total_new_psms += len(new_psms) if not keep_original else len(new_psms) - 1 + total_new_psms += ( + len(new_psms) if not include_original_psm else len(new_psms) - 1 + ) mass_shifted_psms += 1 new_psm_list.extend(new_psms) progress.update(task, advance=1) @@ -392,7 +412,7 @@ def __init__( fasta_file=None, combination_length=1, exclude_mutations=False, - unimod_modification_file=None, + modification_file=None, ) -> None: """ Constructor of the class. 
@@ -408,13 +428,17 @@ def __init__( self.cache = _ModificationCache( combination_length=combination_length, exclude_mutations=exclude_mutations, - modification_file=unimod_modification_file, + modification_file=modification_file, ) + self.cache.load_cache() self.modification_df = self.cache.modification_df self.monoisotopic_masses = self.cache.monoisotopic_masses self.modifications_names = self.cache.modifications_names - + if len(self.modification_df["name"].unique()) == 0: + raise ValueError( + "No modifications found in the modification file. Please check fileformat." + ) logger.info( f'Including {len(self.modification_df["name"].unique())} unique modifications on {len(self.modification_df["name"])} sites' ) @@ -763,6 +787,9 @@ def __init__( self.combination_length = combination_length self.exclude_mutations = exclude_mutations self.modification_file = modification_file + self.modification_file_hash = ( + self._calculate_file_hash(modification_file) if modification_file else None + ) self.modification_inclusion_dict, self.filter_key = self._read_unimod_file( modification_file ) @@ -770,51 +797,82 @@ def __init__( self.modifications_names = [] self.modification_df = None - # Load or generate data - cache_file = self._get_cache_file_path() - self._load_or_generate_data(cache_file, force_reload=False) + # get cache file path + self.cache_file = self._get_cache_file_path() - def _get_cache_file_path(self): + @classmethod + def _remove_cache(cls): + """ + Remove the cache file for modifications. + """ + cache_file = cls._get_cache_file_path() + if os.path.exists(cache_file): + os.remove(cache_file) + logger.info("Modification cache removed.") + else: + logger.warning("Modification cache file does not exist.") + + def load_cache(self, force_reload=False): + """ + Load the cache or generate it if it doesn't exist. + + Args: + force_reload (bool, optional): If True, regenerate the cache even if it exists. Defaults to False. 
+ """ + self._load_or_generate_data(force_reload=force_reload) + + @classmethod + def _get_cache_file_path(cls): """ Get path to cache file for combinations of modifications. return: str: path to cache file """ - current_dir = os.path.dirname(os.path.realpath(__file__)) - parent_dir = os.path.dirname(current_dir) - cache_dir = os.path.join(parent_dir, "modification_cache") + return str(importlib.resources.files("mumble.package_data") / "modifications_cache.pkl") - # Create the cache directory if it doesn't exist - os.makedirs(cache_dir, exist_ok=True) + @staticmethod + def _calculate_file_hash(file_path: str) -> str: + """ + Calculate the SHA-256 hash of a file. - cache_file = os.path.join(cache_dir, "modification_cache.pkl") - return cache_file + Args: + file_path (str): Path to the file to hash. + + Returns: + str: SHA-256 hash of the file. + """ + sha256 = hashlib.sha256() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256.update(chunk) + return sha256.hexdigest() - def _load_or_generate_data(self, cache_file: str, force_reload: bool = False) -> None: + def _load_or_generate_data(self, force_reload: bool = False) -> None: """Load data from cache or generate and save it if cache doesn't exist.""" - if os.path.exists(cache_file) and not force_reload: + if os.path.exists(self.cache_file) and not force_reload: logger.info("Checking cache") - with open(cache_file, "rb") as f: + with open(self.cache_file, "rb") as f: cache_data = pickle.load(f) if cache_data["metadata"] == ( self.combination_length, self.exclude_mutations, - self.modification_file, + self.modification_file_hash, ): + logger.debug("Cache metadata matches current configuration") try: - logger.info("Loading cache data") + logger.info("Using cached modifcation data") self.modification_df = cache_data["modification_df"] self.monoisotopic_masses = cache_data["monoisotopic_masses"] self.modifications_names = cache_data["modifications_names"] except KeyError: - 
logger.info("Cache data missing") - self._regenerate_and_save_cache(cache_file) + logger.info("Cached data invalid or incomplete, regenerating cache") + self._regenerate_and_save_cache() else: - self._regenerate_and_save_cache(cache_file) + self._regenerate_and_save_cache() else: - self._regenerate_and_save_cache(cache_file) + self._regenerate_and_save_cache() def get_unimod_database(self): """ @@ -823,7 +881,11 @@ def get_unimod_database(self): Args: exclude_mutations (bool, optional): If True, modifications with the classification 'AA substitution' will be excluded. Defaults to False. """ - unimod_db = unimod.Unimod() + + # Load Unimod database + unimod_db = unimod.Unimod( + "sqlite:///" + str(importlib.resources.files("mumble.package_data") / "unimod.db") + ) position_id_mapper = { 2: "anywhere", 3: "N-term", @@ -950,21 +1012,23 @@ def generate_combinations(items, length): else: return [], [] - def _regenerate_and_save_cache(self, cache_file: str) -> None: + def _regenerate_and_save_cache(self) -> None: """Regenerate data and save it to the cache.""" - logger.info("Generating cache data") + logger.info("Generating new cache data") self.get_unimod_database() self.monoisotopic_masses, self.modifications_names = ( self._generate_modifications_combinations_lists(self.combination_length) ) - - with open(cache_file, "wb") as f: + logger.debug( + f"New cache metadata: \ncombination length {self.combination_length}, \nexclude_mutations {self.exclude_mutations},\nmodification file hash {self.modification_file_hash}", + ) + with open(self.cache_file, "wb") as f: pickle.dump( { "metadata": ( self.combination_length, self.exclude_mutations, - self.modification_file, + self.modification_file_hash, ), "modification_df": self.modification_df, "monoisotopic_masses": self.monoisotopic_masses, @@ -1004,6 +1068,10 @@ def _read_unimod_file(self, modification_file=None): return None, None +def remove_modification_cache(): + _ModificationCache._remove_cache() + + class 
JSONConfigLoader: """Loads a single-level configuration from a JSON file.""" diff --git a/mumble/package_data/default_ptm_list.tsv b/mumble/package_data/default_ptm_list.tsv new file mode 100755 index 0000000..607692e --- /dev/null +++ b/mumble/package_data/default_ptm_list.tsv @@ -0,0 +1,581 @@ +unimod_id name residue +1009 Thiazolidine C +1009 Thiazolidine F +1009 Thiazolidine H +1009 Thiazolidine K +1009 Thiazolidine N-term +1009 Thiazolidine W +1009 Thiazolidine Y +1014 glycidamide N-term +1017 DMPO C +1032 2-nitrobenzyl Y +1035 Thiadiazole C +1037 Biotin:Thermo-88317 Y +1041 Deoxyhypusine Q +1043 Acetylhypusine K +107 FormylMet N-term +108 Nethylmaleimide C +10 Met->Hse M +114 OxProBiotinRed P +115 OxProBiotin P +116 OxArgBiotin R +118 EDT-iodoacetyl-PEO-biotin S +11 Met->Hsl M +121 GG N-term +121 GG K +121 GG S +121 GG T +121 GG C +122 Formyl K +122 Formyl N-term +122 Formyl S +122 Formyl T +1249 AHA-SS M +1253 2-monomethylsuccinyl C +1254 Saligenin H +1255 Cresylphosphate H +1255 Cresylphosphate S +1255 Cresylphosphate Y +1256 CresylSaligeninPhosphate S +1256 CresylSaligeninPhosphate T +1257 Ub-Br2 C +1258 Ub-VME C +1260 Ub-amide C +1262 2-dimethylsuccinyl C +1270 HCysThiolactone K +1278 Furan Y +127 Fluoro A +1281 BMP-piperidinol C +1282 UgiJoullieProGly D +1282 UgiJoullieProGly E +1283 UgiJoullieProGlyProGly D +1283 UgiJoullieProGlyProGly E +1290 Dicarbamidomethyl H +1290 Dicarbamidomethyl K +1290 Dicarbamidomethyl N-term +1290 Dicarbamidomethyl R +129 Iodo Y +1303 NeuAc T +1304 NeuGc S +1304 NeuGc T +1305 Propyl D +1305 Propyl E +130 Diiodo Y +1310 Propiophenone H +1310 Propiophenone K +1310 Propiophenone W +1312 Delta:H(6)C(3)O(1) C +1312 Delta:H(6)C(3)O(1) H +1312 Delta:H(6)C(3)O(1) K +1312 Delta:H(6)C(3)O(1) N-term +1313 Delta:H(8)C(6)O(1) N-term +1314 biotinAcrolein298 N-term +1315 MM-diphenylpentanone C +1317 EHD-diphenylpentanone C +1326 NEMsulfur C +1327 SulfurDioxide C +1330 bisANS-sulfonates S +1331 DNCB_hapten Y +1344 Phosphogluconoylation 
N-term +1345 PS_Hapten C +1345 PS_Hapten H +134 Myristoleyl G +1350 CarboxymethylDMAP N-term +1355 azole S +1356 phosphoRibosyl D +1356 phosphoRibosyl E +135 Myristoyl+Delta:H(-4) G +1364 O-Et-N-diMePhospho S +1365 N-dimethylphosphate S +1367 dHex(1)Hex(1) S +1375 dHex(1)Hex(2) T +1380 methylsulfonylethyl C +1380 methylsulfonylethyl H +1380 methylsulfonylethyl K +1381 ethylsulfonylethyl C +1381 ethylsulfonylethyl H +1382 phenylsulfonylethyl C +1384 Homocysteic_acid M +1385 Hydroxamic_acid D +1385 Hydroxamic_acid E +1388 HN2_mustard C +1388 HN2_mustard H +1389 HN3_mustard C +1389 HN3_mustard H +1390 Oxidation+NEM C +1397 Iodoacetanilide K +1397 Iodoacetanilide N-term +1398 Iodoacetanilide:13C(6) C +1398 Iodoacetanilide:13C(6) N-term +139 Dansyl K +139 Dansyl N-term +1400 MurNAc A +1412 s-GlcNAc S +1413 PhosphoHex(2) S +1413 PhosphoHex(2) T +1419 15N-oxobutanoic C +1419 15N-oxobutanoic S +1419 15N-oxobutanoic T +141 Amidine K +141 Amidine N-term +1420 spermine Q +1421 spermidine Q +1423 Biotin:Thermo-21330 N-term +1425 Pentose S +1425 Pentose T +1426 Hex(1)Pent(1) S +1426 Hex(1)Pent(1) T +142 HexNAc(1)dHex(1) N +142 HexNAc(1)dHex(1) T +1430 Hex(1)HexNAc(1)Sulf(1) S +1431 Hex(1)NeuAc(1) S +1434 HexNAc(1)NeuAc(1) T +143 HexNAc(2) N +145 HexNAc(1)dHex(2) N +178 DAET S +178 DAET T +17 NIPCAM C +1824 GEE Q +1826 Glu->pyro-Glu+Methyl E +1827 Glu->pyro-Glu+Methyl:2H(2)13C(1) E +1830 Biotin-tyramide Y +1831 Tris N +1839 betaFNA C +1841 Biotin:Thermo-21328 N-term +1843 PhosphoCytidine Y +1845 AzidoF F +1846 Dimethylaminoethyl C +1848 Gluratylation K +1849 hydroxyisobutyryl K +1870 Cation:Fe[III] D +1870 Cation:Fe[III] E +1873 MesitylOxide H +1873 MesitylOxide K +1873 MesitylOxide N-term +1875 methylol K +1875 methylol Y +1910 Cation:Al[III] D +1910 Cation:Al[III] E +1914 Met->AspSA M +1915 Decarboxylation D +1915 Decarboxylation E +1916 Aspartylurea H +1917 Formylasparagine H +1918 Carbonyl A +1918 Carbonyl E +1918 Carbonyl I +1918 Carbonyl Q +1918 Carbonyl S +1918 Carbonyl V 
+1922 Pro->HAVA P +1924 Delta:H(-4)O(3) W +1925 Delta:O(4) W +1926 Delta:H(3)C(3)O(2) K +1927 Delta:H(4)C(5)O(1) R +1930 Pent(2) S +1931 Pent(1)HexNAc(1) S +1932 Hex(2)Sulf(1) T +194 AccQTag N-term +197 EQAT C +1986 Diethylphosphothione C +1986 Diethylphosphothione S +1986 Diethylphosphothione T +1987 Dimethylphosphothione S +1987 Dimethylphosphothione Y +1989 monomethylphosphothione C +1989 monomethylphosphothione H +1989 monomethylphosphothione K +1989 monomethylphosphothione S +1989 monomethylphosphothione T +1989 monomethylphosphothione Y +1992 serotonylation Q +1 Acetyl H +1 Acetyl K +1 Acetyl N-term +1 Acetyl S +1 Acetyl T +1 Acetyl Y +2006 Haloxon Y +2007 Methamidophos-S S +2007 Methamidophos-S Y +2008 Methamidophos-O H +2008 Methamidophos-O S +2008 Methamidophos-O T +2008 Methamidophos-O Y +200 Ethanedithiol S +200 Ethanedithiol T +2014 Nitrene Y +21 Phospho C +21 Phospho D +21 Phospho E +21 Phospho S +21 Phospho T +21 Phospho Y +23 Dehydrated D +23 Dehydrated N +23 Dehydrated Q +23 Dehydrated S +23 Dehydrated T +23 Dehydrated Y +24 Propionamide C +24 Propionamide K +24 Propionamide N-term +25 Pyridylacetyl N-term +261 SPITC N-term +264 PET S +264 PET T +270 Cytopiloyne C +270 Cytopiloyne N-term +271 Cytopiloyne+water N-term +272 CAF N-term +275 Nitrosyl C +275 Nitrosyl Y +276 AEBS K +276 AEBS N-term +276 AEBS S +276 AEBS Y +278 Ethanolyl C +278 Ethanolyl K +278 Ethanolyl R +27 Glu->pyro-Glu E +280 Ethyl D +280 Ethyl E +280 Ethyl K +280 Ethyl N-term +286 SulfanilicAcid:13C(6) D +288 Trp->Oxolactone W +289 Biotin-PEO-Amine D +289 Biotin-PEO-Amine E +28 Gln->pyro-Glu Q +290 Biotin-HPDP C +293 CAMthiopropanoyl N-term +294 IED-Biotin C +295 dHex S +295 dHex T +299 Carboxy D +299 Carboxy E +299 Carboxy M +299 Carboxy W +29 SMA N-term +301 Bromobimane C +302 Menadione C +303 DeStreak C +30 Cation:Na D +30 Cation:Na E +312 Cysteinyl C +314 Nmethylmaleimide C +314 Nmethylmaleimide K +31 Pyridylethyl C +320 Nethylmaleimide+water C +327 Delta:H(4)C(2)O(-1)S(1) S +333 
Can-FP-biotin S +335 HNE+Delta:H(2) C +337 Methylamine S +337 Methylamine T +340 Bromo F +340 Bromo Y +342 Amino Y +344 Arg->GluSA R +345 Trioxidation C +345 Trioxidation F +345 Trioxidation W +345 Trioxidation Y +34 Methyl D +34 Methyl E +34 Methyl H +34 Methyl I +34 Methyl K +34 Methyl N-term +34 Methyl N +34 Methyl Q +34 Methyl R +34 Methyl S +34 Methyl T +350 Trp->Hydroxykynurenin W +351 Trp->Kynurenin W +352 Lys->Allysine K +354 Nitro F +354 Nitro W +354 Nitro Y +357 probiotinhydrazide P +359 Pro->pyro-Glu P +35 Oxidation D +35 Oxidation E +35 Oxidation F +35 Oxidation G +35 Oxidation H +35 Oxidation I +35 Oxidation K +35 Oxidation N +35 Oxidation P +35 Oxidation Q +35 Oxidation R +35 Oxidation S +35 Oxidation T +35 Oxidation V +35 Oxidation W +35 Oxidation Y +360 Pro->Pyrrolidinone P +361 Thrbiotinhydrazide T +362 Diisopropylphosphate K +362 Diisopropylphosphate N-term +362 Diisopropylphosphate S +362 Diisopropylphosphate T +362 Diisopropylphosphate Y +363 Isopropylphospho Y +368 Cys->Dha C +369 Pro->Pyrrolidone P +36 Dimethyl K +36 Dimethyl N +36 Dimethyl P +36 Dimethyl R +371 HMVK C +374 Dehydro C +375 Diphthamide H +376 Hydroxyfarnesyl C +378 Carboxyethyl H +37 Trimethyl A +37 Trimethyl K +37 Trimethyl R +381 Lys->AminoadipicAcid K +382 Cys->PyruvicAcid C +385 Ammonia-loss C +385 Ammonia-loss N +385 Ammonia-loss S +385 Ammonia-loss T +392 Quinone W +392 Quinone Y +396 GlycerylPE E +39 Methylthio C +39 Methylthio D +39 Methylthio K +39 Methylthio N-term +39 Methylthio N +3 Biotin K +3 Biotin N-term +400 Tyr->Dha Y +401 Didehydro K +401 Didehydro S +401 Didehydro T +401 Didehydro Y +402 Cys->Oxoalanine C +403 Ser->LacticAcid S +407 Hydroxycinnamyl C +408 Glycosyl P +40 Sulfo C +40 Sulfo S +40 Sulfo T +40 Sulfo Y +411 Phenylisocyanate N-term +412 Phenylisocyanate:2H(5) N-term +413 Phosphoguanosine H +414 Hydroxymethyl N +416 Dipyrrolylmethanemethyl C +417 PhosphoUridine Y +41 Hex N-term +41 Hex N +420 Carboxy->Thiocarboxy G +421 Sulfide C +421 Sulfide D +421 
Sulfide W +422 PyruvicAcidIminyl C +422 PyruvicAcidIminyl V +425 Dioxidation C +425 Dioxidation E +425 Dioxidation F +425 Dioxidation I +425 Dioxidation P +425 Dioxidation V +425 Dioxidation W +425 Dioxidation Y +426 Octanoyl C +429 PhosphoHex S +42 Lipoyl K +431 Palmitoleyl C +435 Methylpyrroline K +438 Cyano C +43 HexNAc S +43 HexNAc T +440 Amidino C +445 Hydroxytrimethyl K +447 Deoxy D +447 Deoxy S +447 Deoxy T +449 Decanoyl S +449 Decanoyl T +450 Glu E +451 GluGlu E +452 GluGluGlu E +454 HexN N +454 HexN S +454 HexN T +454 HexN W +457 NDA N-term +45 Myristoyl C +45 Myristoyl G +464 SPITC:13C(6) N-term +472 AEC-MAEC S +472 AEC-MAEC T +47 Palmitoyl N-term +488 DHP C +490 Hep R +490 Hep S +490 Hep T +49 Phosphopantetheine S +4 Carbamidomethyl D +4 Carbamidomethyl E +4 Carbamidomethyl H +4 Carbamidomethyl K +4 Carbamidomethyl M +4 Carbamidomethyl N-term +4 Carbamidomethyl S +4 Carbamidomethyl T +4 Carbamidomethyl Y +500 Nmethylmaleimide+water C +501 PyMIC N-term +503 LG-lactam-K N-term +504 LG-Hlactam-K N-term +518 Diethyl N-term +520 Piperidine N-term +523 Sulfo-NHS-LC-LC-Biotin N-term +526 Dethiomethyl M +528 Methyl+Deamidated N +528 Methyl+Deamidated Q +529 Delta:H(5)C(2) P +52 Guanidinyl K +52 Guanidinyl N-term +530 Cation:K D +530 Cation:K E +531 Cation:Cu[I] D +531 Cation:Cu[I] E +53 HNE A +53 HNE C +53 HNE H +53 HNE K +54 Glucuronyl N-term +54 Glucuronyl T +55 Glutathione C +58 Propionyl N-term +58 Propionyl S +58 Propionyl K +5 Carbamyl C +5 Carbamyl K +5 Carbamyl M +5 Carbamyl N-term +5 Carbamyl S +5 Carbamyl T +5 Carbamyl Y +64 Succinyl N-term +64 Succinyl K +684 BDMAPP H +6 Carboxymethyl C +6 Carboxymethyl K +6 Carboxymethyl N-term +6 Carboxymethyl W +720 HNE-Delta:H(2)O C +720 HNE-Delta:H(2)O H +721 4-ONE C +721 4-ONE H +723 O-Dimethylphosphate S +724 O-Methylphosphate S +724 O-Methylphosphate T +724 O-Methylphosphate Y +725 Diethylphosphate H +725 Diethylphosphate S +725 Diethylphosphate T +725 Diethylphosphate Y +726 Ethylphosphate S +726 
Ethylphosphate T +726 Ethylphosphate Y +727 O-pinacolylmethylphosphonate H +727 O-pinacolylmethylphosphonate S +727 O-pinacolylmethylphosphonate Y +728 Methylphosphonate S +728 Methylphosphonate T +728 Methylphosphonate Y +729 O-Isopropylmethylphosphonate S +729 O-Isopropylmethylphosphonate Y +734 Ethanolamine C +734 Ethanolamine D +734 Ethanolamine E +743 4-ONE+Delta:H(-2)O(-1) H +744 NO_SMX_SEMD C +745 NO_SMX_SMCT C +747 Malonyl C +747 Malonyl S +747 Malonyl K +748 3sulfo N-term +767 Menadione-HQ C +771 lapachenole C +773 maleimide C +775 Carboxymethyl:13C(2) C +776 NEM:2H(5) C +793 Hex(1)HexNAc(1) S +7 Deamidated F +7 Deamidated N +7 Deamidated Q +7 Deamidated R +800 Biotin:Thermo-21345 Q +801 Pentylamine Q +822 Gly-loss+Amide G +830 Dihydroxyimidazolidine R +837 Arg->Npo R +851 cGMP+RMP-loss C +851 cGMP+RMP-loss S +887 MDCC C +893 CarbamidomethylDTT C +894 CarboxymethylDTT C +898 pyrophospho S +89 Iminobiotin N-term +902 DimethylArsino C +903 Lys->CamCys K +904 Phe->CamCys F +906 Lys->MetOx K +907 Galactosyl N-term +910 Bacillosamine N +911 MTSL C +914 Methylmalonylation S +915 Ethoxyformyl H +926 ethylamino S +926 ethylamino T +928 MercaptoEthanol S +928 MercaptoEthanol T +92 NHS-LC-Biotin N-term +931 Ethyl+Deamidated N +931 Ethyl+Deamidated Q +936 Chlorination W +936 Chlorination Y +937 dichlorination Y +939 Cys->methylaminoAla C +940 Cys->ethylaminoAla C +941 DNPS C +947 LG-pyrrole N-term +948 LG-anhyropyrrole N-term +950 Cation:Li E +951 Cation:Ca[II] D +951 Cation:Ca[II] E +952 Cation:Fe[II] D +952 Cation:Fe[II] E +953 Cation:Ni[II] D +953 Cation:Ni[II] E +954 Cation:Zn[II] D +954 Cation:Zn[II] H +955 Cation:Ag E +956 Cation:Mg[II] D +956 Cation:Mg[II] E +957 2-succinyl C +958 Propargylamine E +959 Phosphopropargyl T +959 Phosphopropargyl Y +977 Carbofuran S +978 BITC N-term +979 PEITC K +979 PEITC N-term +989 Ammonium D +989 Ammonium E +991 ISD_z+2_ion N-term +997 sulfo+amino Y +9999 Hex-N-acetyl-D-glucosamine N +1289 Butyryl K +1363 Crotonyl K +2133 
Pro->hydroxyproline P \ No newline at end of file diff --git a/mumble/package_data/unimod.db b/mumble/package_data/unimod.db new file mode 100644 index 0000000..b0ca30a Binary files /dev/null and b/mumble/package_data/unimod.db differ diff --git a/pyproject.toml b/pyproject.toml index 3478a43..8b2fc44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0.0"] +requires = ["setuptools>=61.0.0", "wheel"] build-backend = "setuptools.build_meta" [project] @@ -61,4 +61,13 @@ push = false "mumble/__init__.py" = ['__version__ = "{version}"'] [tool.setuptools.packages.find] -include = ["mumble*"] \ No newline at end of file +include = ["mumble*"] + +[tool.setuptools.package-data] +"mumble.package_data" = [ + "*.tsv", + "*.db", +] + +[tool.setuptools] +include-package-data = true \ No newline at end of file diff --git a/tests/test_data/unmapped_mass_shift_psms.tsv b/tests/test_data/unmapped_mass_shift_psms.tsv new file mode 100644 index 0000000..0f7654d --- /dev/null +++ b/tests/test_data/unmapped_mass_shift_psms.tsv @@ -0,0 +1,16 @@ +psm_id peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages semi_enzymatic isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model ion_mobility predicted_mobility delta_mobility matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms2_intensity +199919 HSALDMTRYW sp|Q96PM5|ZN363_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=58358 1 1 1279.5836 1278.5815 2 10 0 0 0.0 783.4328 5.0078335 30.532207635648223 5.878146880422211 0.0 71.6985 0.5524135 0.46870884 0.08370468 0.9579565 0.94857436 0.009382129 12 3 6 0.6 9.362421 10851549 -9.373043530379118 0.60522705 -2.886328 0.00043252597 0.0010298972 0.0007910994 4931.0 +187214 AAADSAVRLW sp|Q6IA86|ELP2_HUMAN 1 
TP14632RJB_Slot2-4_1_15045.mzml index=54307 1 1 1059.5529 1058.5508 2 10 0 0 0.0 946.2004 2.185873 28.32508699713557 5.123075332671423 0.0 68.82776 0.53029877 0.5403004 0.010001659 0.84979576 0.87333834 0.023542583 10 2 6 0.6 58.70408 10676408 -7.051204957495997 0.5703348 -2.815339 0.0006829824 0.00064476236 0.0007910994 18935.0 +219534 SVTEIQEKW sp|Q9NR30|DDX21_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=38403 1 1 1119.5616 1118.5608 2 9 0 0 0.0 894.3697 5.311113 24.395969385978187 4.710274430737893 0.0 55.26193 0.42579475 0.40589654 0.019898206 0.8886946 0.88813794 0.0005566478 10 3 5 0.5555556 22.787048 14424512 -6.995568016505072 0.546295 -2.815339 0.00079221866 0.00064476236 0.0007910994 2062.0 +164506 VTVAMVERW sp|Q9Y3T9|NOC2L_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=60045 1 1 1091.5695 1089.5642 2 9 0 0 0.0 1838.7217 2.5305421 27.05069782560415 3.848331962079176 0.0 72.99217 0.56237924 0.51100254 0.0513767 0.8591933 0.8812905 0.02209717 10 3 5 0.5555556 16.531023 12148347 -7.036492940220911 0.50694627 -2.5875413 0.0011906677 0.00076485716 0.0007910994 9120.0 +7768 QYSNNIRQL sp|P22314|UBA1_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=22779 1 1 1135.5818 1134.5782 2 9 0 0 0.0 884.1139 1.8831239 23.440637579926737 0.7914321997743663 0.0 39.204037 0.3020932 0.32888395 0.026790738 0.9082265 0.89833635 0.009890139 8 1 5 0.5555556 19.517168 10595356 -5.242721157452186 0.5045328 -2.5777586 0.0012614715 0.0011441789 0.0008331231 6508.0 +14041 ATYMKPEMW sp|B5ME19|EIFCL_HUMAN;sp|Q99613|EIF3C_HUMAN 2 TP14632RJB_Slot2-4_1_15045.mzml index=52143 1 1 1156.5135 1155.5093 2 9 0 0 0.0 868.73914 4.9459743 26.477541289435923 3.1331527902343552 0.0 67.534355 0.5203351 0.47933102 0.04100406 0.8865586 0.9000385 0.013479888 9 3 4 0.44444445 40.85939 11661667 -6.139319724208844 0.48178595 -2.4375985 0.0014339826 0.00064476236 0.0007910994 13303.0 +41503 KSLPAEINRM sp|Q9H9A6|LRC40_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=22417 1 1 1158.6241 1157.6227 3 10 0 0 0.0 
864.7307 2.4828513 23.867568404785203 2.980331959799255 0.0 38.857033 0.29942006 0.34816507 0.048745006 0.68858534 0.7415607 0.052975357 8 2 5 0.5 41.6222 12180240 -5.170785777230154 0.46959156 -2.3500934 0.001701373 0.00064476236 0.0007910994 7020.0 +282639 DAAVDTSSEITTK sp|P06454|PTMA_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=38693 1 1 1465.6859 1336.6359 2 13 0 0 0.0 92102.234 3.0631099 38.90515320283323 3.9681349625450153 0.0 55.5774 0.42822498 0.4064465 0.021778494 1.0624033 0.9901312 0.07227212 14 3 10 0.7692308 47.170033 20810275 -11.026229486120789 0.4663473 -2.3132765 0.001701373 0.0025553054 0.0019074185 60530.0 +124634 AVATGDIGRVW sp|P09001|RM03_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=53304 1 1 1144.5485 1143.6036 2 11 0 0 0.0 825.84045 3.4161668 22.31014669743914 1.868865552675203 0.0 68.22837 0.52568144 0.52021843 0.005463004 0.9002248 0.90414137 0.0039165616 8 1 6 0.54545456 11.186528 15442399 -5.082884427858923 0.45964876 -2.2053041 0.0018072996 0.00064476236 0.0007910994 5208.0 +265252 KSSEVDNWRII sp|Q14703|MBTP1_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=47200 1 1 1346.6993 1345.699 3 11 0 0 0.0 743.1041 4.769527 21.704638238514416 3.457056589918473 0.0 63.275066 0.4875238 0.4520869 0.0354369 0.7358609 0.7838678 0.048006892 8 4 2 0.18181819 13.228376 16972383 -5.238138623322404 0.40334752 -1.6035125 0.0048363954 0.004268829 0.0072094654 2109.0 +333186 DKPDMAEIEKFDKSKLK sp|P62328|TYB4_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=40275 1 1 2150.094 2021.0503 4 17 0 0 0.0 61874.48 1.6855274 32.537247754888 4.45793067677355 0.0 56.965416 0.43891752 0.43971053 0.00079301 0.81057006 0.861354 0.050783932 12 2 7 0.4117647 20.889032 13767769 -9.319593391441426 0.37860322 -1.4082162 0.007289102 0.0171281 0.0007910994 11673.0 +261471 SKIDLHKY sp|Q6PL18|ATAD2_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=16871 1 1 1116.5916 1002.5498 3 8 0 0 0.0 107630.15 5.277048 25.82725878613052 2.0076779582668465 0.0 33.746063 0.26004785 0.2389326 0.021115258 
0.69248414 0.7291602 0.03667605 10 2 5 0.625 15.223612 11706895 -7.072172488123847 0.37513587 -1.3865924 0.007692431 0.028399123 0.0007910994 3724.0 +329976 VFNHPAIKKF sp|Q9ULM3|YETS2_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=29594 1 1 1331.7269 1199.6814 3 10 0 0 0.0 104325.76 2.1558359 32.82403334388452 5.519597165476629 0.0 46.144814 0.35556132 0.36646292 0.0109016 0.7752403 0.77022356 0.005016744 13 3 7 0.7 30.47619 11552423 -10.408735543512638 0.3726905 -1.3708363 0.007948747 0.029461462 0.0009400809 7808.0 +158199 IAFPGKHFQEISW sp|Q6DKI1|RL7L_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=68663 1 1 1559.8033 1558.7932 2 13 0 0 0.0 647.81177 2.9568856 23.116929396838152 0.580005686554518 0.0 80.090416 0.61706036 0.6526362 0.035575807 1.0654885 1.038366 0.027122498 7 3 1 0.07692308 18.279957 17761602 -4.246950783513149 0.36107647 -1.2702332 0.009099153 0.0011546799 0.0007910994 8111.0 +244366 VMEIHSKYW sp|P56524|HDAC4_HUMAN 1 TP14632RJB_Slot2-4_1_15045.mzml index=28000 1 1 1319.6724 1191.5747 3 9 0 0 0.0 102019.16 3.94922 27.266959268607945 1.488696262360481 0.0 44.535973 0.34316763 0.40736136 0.064193726 0.7949624 0.75805515 0.036907256 11 4 4 0.44444445 20.097553 16448319 -8.204367361226717 0.3587304 -1.2438289 0.009471288 0.036396112 0.0007910994 2431.0 diff --git a/tests/test_mumble.py b/tests/test_mumble.py index 0544de7..a5aad72 100644 --- a/tests/test_mumble.py +++ b/tests/test_mumble.py @@ -1,13 +1,11 @@ import os - import pytest from unittest.mock import MagicMock import pandas as pd from collections import namedtuple from psm_utils import PSMList, PSM, Peptidoform -from psm_utils.utils import mz_to_mass +from psm_utils.io import read_file from pyteomics import proforma -from pyteomics.mass import calculate_mass from pyteomics.fasta import IndexedFASTA from mumble.mumble import _ModificationHandler, PSMHandler @@ -114,7 +112,7 @@ def test_get_modified_peptidoforms(self, setup_psmhandler): Localised_mass_shifts=[Localised_mass_shift("N-term", 
"Acetyl")] ) ] - new_psms = psm_handler._get_modified_peptidoforms(psm, keep_original=True) + new_psms = psm_handler._get_modified_peptidoforms(psm, include_original_psm=True) assert isinstance(new_psms, list) assert len(new_psms) == 2 @@ -126,7 +124,7 @@ def test_get_modified_peptidoforms(self, setup_psmhandler): Modification_candidate(Localised_mass_shifts=[Localised_mass_shift(1, "Carbamyl")]), Modification_candidate(Localised_mass_shifts=[Localised_mass_shift(4, "Carbamyl")]), ] - new_psms = psm_handler._get_modified_peptidoforms(psm, keep_original=False) + new_psms = psm_handler._get_modified_peptidoforms(psm, include_original_psm=False) assert isinstance(new_psms, list) assert len(new_psms) == 2 @@ -142,7 +140,7 @@ def test_get_modified_peptidoforms(self, setup_psmhandler): ] ) ] - new_psms = psm_handler._get_modified_peptidoforms(psm, keep_original=False) + new_psms = psm_handler._get_modified_peptidoforms(psm, include_original_psm=False) assert isinstance(new_psms, list) assert len(new_psms) == 1 # 1 combined psm expected @@ -157,20 +155,19 @@ def test_add_modified_psms(self, setup_psmhandler): mod_handler.localize_mass_shift.return_value = [ Modification_candidate(Localised_mass_shifts=[Localised_mass_shift("N-term", "mod1")]) ] - new_psm_list = psm_handler.add_modified_psms(psm_list, keep_original=True) + new_psm_list = psm_handler.add_modified_psms(psm_list, include_original_psm=True) assert isinstance(new_psm_list, PSMList) assert len(new_psm_list) > 1 def test_tool_combination_length_1(self, setup_psm): - psm_handler = PSMHandler(combination_length=1, exclude_mutations=False) + psm_handler = PSMHandler( + combination_length=1, exclude_mutations=False, all_unimod_modifications=True + ) # retrigger get_unimod_database - cache_file = psm_handler.modification_handler.cache._get_cache_file_path() - psm_handler.modification_handler.cache._load_or_generate_data( - cache_file, force_reload=True - ) + 
psm_handler.modification_handler.cache.load_cache(force_reload=True) psm = setup_psm result_psm_list = psm_handler.get_modified_peptidoforms_list(psm) @@ -188,13 +185,10 @@ def test_tool_combination_length_1(self, setup_psm): def test_tool_combination_length_2(self, setup_psm): - psm_handler = PSMHandler(combination_length=2) + psm_handler = PSMHandler(combination_length=2, all_unimod_modifications=True) # retrigger get_unimod_database - cache_file = psm_handler.modification_handler.cache._get_cache_file_path() - psm_handler.modification_handler.cache._load_or_generate_data( - cache_file, force_reload=True - ) + psm_handler.modification_handler.cache._load_or_generate_data(force_reload=True) psm = setup_psm result_psm_list = psm_handler.get_modified_peptidoforms_list(psm) @@ -228,13 +222,15 @@ def test_tool_combination_length_2(self, setup_psm): peptidoform in result_peptidoforms for peptidoform in expected_double_mod_Peptidoforms ) - def test_tool_keep_original(self, setup_psm): + def test_tool_include_original_psm(self, setup_psm): # psm_handler = setup_psmhandler[0] - psm_handler = PSMHandler(combination_length=1) + psm_handler = PSMHandler(combination_length=1, all_unimod_modifications=True) psm = setup_psm - result_psm_list = psm_handler.get_modified_peptidoforms_list(psm, keep_original=True) + result_psm_list = psm_handler.get_modified_peptidoforms_list( + psm, include_original_psm=True + ) assert psm in result_psm_list assert len(result_psm_list) == 3 @@ -309,7 +305,7 @@ def test_get_localisation(self, setup_modhandler): restrictions = ["anywhere", "N-term", "C-term", "N-term", "anywhere"] # Mock the check_protein_level method - mod_handler.check_protein_level = MagicMock(return_value=[("pepeptide", "mod1")]) + mod_handler.check_protein_level = MagicMock(return_value=[("prepeptide", "mod1")]) # Expected output expected_output = { @@ -318,7 +314,7 @@ def test_get_localisation(self, setup_modhandler): Localised_mass_shift("N-term", "mod1"), # N-term 
modification Localised_mass_shift("C-term", "mod1"), # C-term modification Localised_mass_shift("N-term", "mod1"), # Q in the sequence - Localised_mass_shift("pepeptide", "mod1"), # protein level modification + Localised_mass_shift("prepeptide", "mod1"), # protein level modification } # Call the method @@ -769,42 +765,71 @@ def test_generate_modifications_combinations_lists_empty(self, setup_modhandler_ assert masses == [] assert combinations == [] - def test_double_combined_modifcations(self): + def test_single_combined_modifcations(self): - mod_handler = _ModificationHandler(combination_length=2) + current_dir = os.path.dirname(os.path.abspath(__file__)) + data_file_path = os.path.join(current_dir, "test_data", "unmapped_mass_shift_psms.tsv") - psm = PSM( - peptidoform="VTFTETPENGSKW/2", - spectrum_id="some_spectrum", - is_decoy=False, - protein_list=["some_protein"], - precursor_mz="748.8581250320699", + psm_list_unmapped_psms = read_file(data_file_path, filetype="sage") + + psm_handler = PSMHandler( + aa_combinations=0, + fasta_file=None, + mass_error=0.02, + exclude_mutations=True, + combination_length=1, + all_unimod_modifications=True, ) - localized_modifications = mod_handler.localize_mass_shift(psm) - name_to_mass_dict = mod_handler.name_to_mass_residue_dict + mapped_psms = psm_handler.add_modified_psms( + psm_list_unmapped_psms, include_original_psm=False, include_decoy_psm=False + ) - expmass = mz_to_mass(psm.precursor_mz, psm.get_precursor_charge()) - calcmass = calculate_mass(psm.peptidoform.composition) - mass_shift = expmass - calcmass + expected_peptidoforms = [ + "HSALDMTR[Deamidated]YW", + "AAADSAVR[Deamidated]LW", + "SVTEIQ[Deamidated]EKW", + "Q[Deamidated]YSNNIRQL", + "QYSNNIR[Deamidated]QL", + "QYSNNIRQ[Deamidated]L", + "QYSN[Deamidated]NIRQL", + "QYSNN[Deamidated]IRQL", + "KSLPAEIN[Deamidated]RM", + "KSLPAEINR[Deamidated]M", + "KSSEVDN[Deamidated]WRII", + "KSSEVDNWR[Deamidated]II", + "SK[Dicarbamidomethyl]IDLHKY", + 
"SKIDLHK[Dicarbamidomethyl]Y", + "SKIDLH[Dicarbamidomethyl]KY", + "[Dicarbamidomethyl]-SKIDLHKY", + "[Lys]-VMEIHSKYW", + ] + assert len(mapped_psms) == 17 + assert set(expected_peptidoforms) == set( + [psm.peptidoform.proforma.split("/")[0] for psm in mapped_psms] + ) - for candidate in localized_modifications: + def test_double_combined_modifcations(self): - mass_shift1 = candidate.Localised_mass_shifts[0] + current_dir = os.path.dirname(os.path.abspath(__file__)) + data_file_path = os.path.join(current_dir, "test_data", "unmapped_mass_shift_psms.tsv") - # no need to check single mod 'combinations' - try: - mass_shift2 = candidate.Localised_mass_shifts[1] - except: # noqa: E722 - continue + psm_list_unmapped_psms = read_file(data_file_path, filetype="sage") - sum = ( - name_to_mass_dict[mass_shift1.modification].mass - + name_to_mass_dict[mass_shift2.modification].mass - ) + psm_handler = PSMHandler( + aa_combinations=0, + fasta_file=None, + mass_error=0.02, + exclude_mutations=True, + combination_length=2, + all_unimod_modifications=True, + ) + + mapped_psms = psm_handler.add_modified_psms( + psm_list_unmapped_psms, include_original_psm=False, include_decoy_psm=False + ) - assert mass_shift1.loc != mass_shift2.loc - assert sum >= (mass_shift - 0.02) and sum <= (mass_shift + 0.02) + assert len(mapped_psms) == 426 if __name__ == "__main__":