diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index e10fe13c..66c9399a 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -273,8 +273,9 @@ jobs:
         run: pip install .
       - name: Test FragPipe file
         run: |
-          wget -nv -P ./fragpipe https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/psm.tsv
-          wget -nv -P ./fragpipe https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/ion.tsv
+          wget -nv https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/PXD062399.zip
+          mkdir -p ./fragpipe
+          unzip -d ./fragpipe PXD062399.zip
           multiqc --fragpipe-plugin ./fragpipe -o ./results_fragpipe
       - uses: actions/upload-artifact@v4
         if: always()
diff --git a/docs/README.md b/docs/README.md
index 29d9f801..dded321a 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -243,7 +243,7 @@ You can find example reports on the [docs page](https://bigbio.github.io/pmultiq
 | ProteoBench | ProteoBench results | [ProteoBench Example](https://pmultiqc.quantms.org/ProteoBench/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/ProteoBench_disable_hoverinfo/multiqc_report.html)) | [ProteoBench data](https://proteobench.cubimed.rub.de/datasets/d01e87b997b84c985868204b1ed26749902fd7f9/d01e87b997b84c985868204b1ed26749902fd7f9_data.zip) |
 | mzIdentML with mzML | mzIdentML with mzML files | [mzIdentML with mzML Example](https://pmultiqc.quantms.org/PXD053068/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/PXD053068_disable_hoverinfo/multiqc_report.html)) | [PXD053068 folder](https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/05/PXD053068/) |
 | mzIdentML with MGF | mzIdentML with MGF files | [mzIdentML with MGF Example](https://pmultiqc.quantms.org/PXD054720/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/PXD054720_disable_hoverinfo/multiqc_report.html)) | [PXD054720 folder](https://ftp.pride.ebi.ac.uk/pride/data/archive/2024/08/PXD054720/) |
-| FragPipe | FragPipe results | [FragPipe Example](https://pmultiqc.quantms.org/PXD066146/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/PXD066146_disable_hoverinfo/multiqc_report.html)) | [psm.tsv](https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/psm.tsv) |
+| FragPipe | FragPipe results | [FragPipe Example](https://pmultiqc.quantms.org/PXD062399/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/PXD062399_disable_hoverinfo/multiqc_report.html)) | [PXD062399.zip](https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/PXD062399.zip) |
 
 ## 👥 Contributing
diff --git a/docs/config.json b/docs/config.json
index e1a53b90..7ec6c82c 100644
--- a/docs/config.json
+++ b/docs/config.json
@@ -165,21 +165,19 @@
             "file_type": ["mzid", "disable_hoverinfo"]
         },
         {
-            "accession": "PXD066146",
+            "accession": "PXD062399",
             "urls": [
-                "https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/psm.tsv",
-                "https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/ion.tsv"
+                "https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/PXD062399.zip"
             ],
-            "path": "docs/PXD066146",
+            "path": "docs/PXD062399",
             "file_type": ["fragpipe", ""]
         },
         {
-            "accession": "PXD066146_disable_hoverinfo",
+            "accession": "PXD062399_disable_hoverinfo",
             "urls": [
-                "https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/psm.tsv",
-                "https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/08/PXD066146/ion.tsv"
+                "https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/PXD062399.zip"
             ],
-            "path": "docs/PXD066146_disable_hoverinfo",
+            "path": "docs/PXD062399_disable_hoverinfo",
             "file_type": ["fragpipe", "disable_hoverinfo"]
         }
     ]
diff --git a/pmultiqc/main.py b/pmultiqc/main.py
index b42113a4..04fe00c1 100644
--- a/pmultiqc/main.py
+++ b/pmultiqc/main.py
@@ -152,4 +152,25 @@ def pmultiqc_plugin_execution_start():
             {"pmultiqc/tsv": {"fn": "*.tsv", "num_lines": 0}},
         )
 
+    # FragPipe workflow file (parameters)
+    if "pmultiqc/workflow" not in config.sp:
+        config.update_dict(
+            config.sp,
+            {"pmultiqc/workflow": {"fn": "*.workflow", "num_lines": 0}},
+        )
+
+    # FragPipe manifest file (experiment design)
+    if "pmultiqc/fp-manifest" not in config.sp:
+        config.update_dict(
+            config.sp,
+            {"pmultiqc/fp-manifest": {"fn": "*.fp-manifest", "num_lines": 0}},
+        )
+
+    # MSFragger params file (search engine parameters)
+    if "pmultiqc/fragger_params" not in config.sp:
+        config.update_dict(
+            config.sp,
+            {"pmultiqc/fragger_params": {"fn": "*.params", "num_lines": 0}},
+        )
+
     config.update({"log_filesize_limit": 200 * pow(1024, 3), "thousandsSep_format": ""})
+ """, + ) if quantms_missed_cleavages: @@ -912,8 +914,9 @@ def draw_modifications(sub_section, modified_data): ) -def draw_oversampling(sub_section, oversampling, oversampling_plot, is_maxquant): - if is_maxquant: +def draw_oversampling(sub_section, oversampling, oversampling_plot, data_type: str): + + if data_type == "maxquant" or data_type == "fragpipe": draw_config = { "id": "oversampling_distribution", "cpswitch": False, @@ -948,9 +951,15 @@ def draw_oversampling(sub_section, oversampling, oversampling_plot, is_maxquant) Oversampling occurs in low-complexity samples or long LC gradients, as well as undersized dynamic exclusion windows for data independent acquisitions. """ - if is_maxquant: + if data_type == "maxquant": helptext += "

If DIA-Data: this metric is skipped.

" + if data_type == "fragpipe": + helptext += """ +

[FragPipe: combined_ion.tsv] This plot shows the distribution of + MS/MS spectral counts per ion/peak for each sample.

+ """ + add_sub_section( sub_section=sub_section, plot=bar_html, diff --git a/pmultiqc/modules/fragpipe/fragpipe.py b/pmultiqc/modules/fragpipe/fragpipe.py index bb3939ba..4113e5d1 100644 --- a/pmultiqc/modules/fragpipe/fragpipe.py +++ b/pmultiqc/modules/fragpipe/fragpipe.py @@ -8,6 +8,18 @@ psm_reader, ion_reader, get_ion_intensity_data, + workflow_reader, + fragger_params_reader, + get_workflow_parameters_table, + manifest_reader, + get_experiment_design_table, + combined_protein_reader, + get_protein_intensity_distribution, + combined_peptide_reader, + get_mbr_stats, + combined_ion_reader, + get_msms_counts_per_peak, + cal_peptide_id_gain ) from pmultiqc.modules.common.stats import ( cal_delta_mass_dict, @@ -27,7 +39,8 @@ rebuild_dict_structure, draw_top_n_contaminants, draw_potential_contaminants, - draw_modifications + draw_modifications, + draw_oversampling ) from pmultiqc.modules.core.section_groups import ( add_group_modules, @@ -48,12 +61,13 @@ draw_search_engine_scores, draw_heatmap ) +from pmultiqc.modules.maxquant.maxquant_plots import draw_evidence_peptide_id_count from collections import OrderedDict from pmultiqc.modules.common.histogram import Histogram from multiqc import config -from multiqc.plots import bargraph, box +from multiqc.plots import bargraph, box, table from pmultiqc.modules.common.logging import get_logger @@ -86,6 +100,28 @@ def __init__(self, find_log_files_func, sub_sections, heatmap_colors): self.ion_intensity_data = None self.ion_sample_cols = [] + # New data containers for additional FragPipe files + # Parameters from workflow file and fragger.params + self.parameters = None + self.fragger_params = None + self.parameters_table = None + + # Experiment design from manifest + self.manifest_df = None + self.experiment_design = None + + # Protein intensity data from combined_protein.tsv + self.protein_df = None + self.protein_intensity_cols = [] + self.protein_intensity_distribution = None + self.protein_contam_distribution = None + + self.peptide_id_gain = {} + self.peptide_id_count_no_gain = True + + # MS/MS counts from combined_ion.tsv + self.msms_counts = {} + def get_data(self): @@ -122,12 +158,107 @@ def get_data(self): fragpipe_files=self.fragpipe_files ) + # Parse workflow file for parameters (optional) + if self.fragpipe_files.get("workflow"): + try: + workflow_path = self.fragpipe_files["workflow"][0] + self.parameters = workflow_reader(workflow_path) + if self.parameters: + log.info("Workflow parameters loaded successfully.") + except Exception as e: + log.warning(f"Error parsing workflow file: {e}") + + # Parse fragger.params for MSFragger search parameters (optional) + if self.fragpipe_files.get("fragger_params"): + try: + fragger_path = self.fragpipe_files["fragger_params"][0] + self.fragger_params = fragger_params_reader(fragger_path) + if self.fragger_params: + log.info("Fragger.params loaded successfully.") + except Exception as e: + log.warning(f"Error parsing fragger.params file: {e}") + + # Generate parameters table from workflow and/or fragger.params + if self.parameters or self.fragger_params: + self.parameters_table = get_workflow_parameters_table( + self.parameters, self.fragger_params + ) + + # Parse manifest file for experiment design (optional) + if self.fragpipe_files.get("manifest"): + try: + manifest_path = self.fragpipe_files["manifest"][0] + self.manifest_df = manifest_reader(manifest_path) + if self.manifest_df is not None: + self.experiment_design = get_experiment_design_table(self.manifest_df) + log.info("Manifest/experiment 
diff --git a/pmultiqc/modules/fragpipe/fragpipe.py b/pmultiqc/modules/fragpipe/fragpipe.py
index bb3939ba..4113e5d1 100644
--- a/pmultiqc/modules/fragpipe/fragpipe.py
+++ b/pmultiqc/modules/fragpipe/fragpipe.py
@@ -8,6 +8,18 @@
     psm_reader,
     ion_reader,
     get_ion_intensity_data,
+    workflow_reader,
+    fragger_params_reader,
+    get_workflow_parameters_table,
+    manifest_reader,
+    get_experiment_design_table,
+    combined_protein_reader,
+    get_protein_intensity_distribution,
+    combined_peptide_reader,
+    get_mbr_stats,
+    combined_ion_reader,
+    get_msms_counts_per_peak,
+    cal_peptide_id_gain
 )
 from pmultiqc.modules.common.stats import (
     cal_delta_mass_dict,
@@ -27,7 +39,8 @@
     rebuild_dict_structure,
     draw_top_n_contaminants,
     draw_potential_contaminants,
-    draw_modifications
+    draw_modifications,
+    draw_oversampling
 )
 from pmultiqc.modules.core.section_groups import (
     add_group_modules,
@@ -48,12 +61,13 @@
     draw_search_engine_scores,
     draw_heatmap
 )
+from pmultiqc.modules.maxquant.maxquant_plots import draw_evidence_peptide_id_count
 
 from collections import OrderedDict
 from pmultiqc.modules.common.histogram import Histogram
 from multiqc import config
-from multiqc.plots import bargraph, box
+from multiqc.plots import bargraph, box, table
 
 from pmultiqc.modules.common.logging import get_logger
@@ -86,6 +100,28 @@ def __init__(self, find_log_files_func, sub_sections, heatmap_colors):
         self.ion_intensity_data = None
         self.ion_sample_cols = []
 
+        # New data containers for additional FragPipe files
+        # Parameters from workflow file and fragger.params
+        self.parameters = None
+        self.fragger_params = None
+        self.parameters_table = None
+
+        # Experiment design from manifest
+        self.manifest_df = None
+        self.experiment_design = None
+
+        # Protein intensity data from combined_protein.tsv
+        self.protein_df = None
+        self.protein_intensity_cols = []
+        self.protein_intensity_distribution = None
+        self.protein_contam_distribution = None
+
+        # Peptide ID gain (MBR) from combined_peptide.tsv
+        self.peptide_id_gain = {}
+        self.peptide_id_count_no_gain = True
+
+        # MS/MS counts from combined_ion.tsv
+        self.msms_counts = {}
+
     def get_data(self):
@@ -122,12 +158,107 @@ def get_data(self):
             fragpipe_files=self.fragpipe_files
         )
 
+        # Parse workflow file for parameters (optional)
+        if self.fragpipe_files.get("workflow"):
+            try:
+                workflow_path = self.fragpipe_files["workflow"][0]
+                self.parameters = workflow_reader(workflow_path)
+                if self.parameters:
+                    log.info("Workflow parameters loaded successfully.")
+            except Exception as e:
+                log.warning(f"Error parsing workflow file: {e}")
+
+        # Parse fragger.params for MSFragger search parameters (optional)
+        if self.fragpipe_files.get("fragger_params"):
+            try:
+                fragger_path = self.fragpipe_files["fragger_params"][0]
+                self.fragger_params = fragger_params_reader(fragger_path)
+                if self.fragger_params:
+                    log.info("fragger.params loaded successfully.")
+            except Exception as e:
+                log.warning(f"Error parsing fragger.params file: {e}")
+
+        # Generate the parameters table from workflow and/or fragger.params
+        if self.parameters or self.fragger_params:
+            self.parameters_table = get_workflow_parameters_table(
+                self.parameters, self.fragger_params
+            )
+
+        # Parse manifest file for experiment design (optional)
+        if self.fragpipe_files.get("manifest"):
+            try:
+                manifest_path = self.fragpipe_files["manifest"][0]
+                self.manifest_df = manifest_reader(manifest_path)
+                if self.manifest_df is not None:
+                    self.experiment_design = get_experiment_design_table(self.manifest_df)
+                    log.info("Manifest/experiment design loaded successfully.")
+            except Exception as e:
+                log.warning(f"Error parsing manifest file: {e}")
+
+        # Parse combined_protein.tsv for the protein intensity distribution
+        if self.fragpipe_files.get("combined_protein"):
+            try:
+                protein_path = self.fragpipe_files["combined_protein"][0]
+                self.protein_df, self.protein_intensity_cols = combined_protein_reader(protein_path)
+                if self.protein_df is not None and self.protein_intensity_cols:
+                    contam_affix = config.kwargs["contaminant_affix"]
+                    (
+                        self.protein_intensity_distribution,
+                        self.protein_contam_distribution,
+                    ) = get_protein_intensity_distribution(
+                        self.protein_df, self.protein_intensity_cols, contam_affix
+                    )
+                    log.info("Protein intensity distribution loaded successfully.")
+            except Exception as e:
+                log.warning(f"Error parsing combined_protein.tsv: {e}")
+
+        # Parse combined_peptide.tsv
+        if self.fragpipe_files.get("combined_peptide"):
+            try:
+                peptide_path = self.fragpipe_files["combined_peptide"][0]
+                peptide_df, combined_peptide_column_valid = combined_peptide_reader(peptide_path)
+                log.info("Combined peptide data loaded successfully.")
+
+                if peptide_df is not None and combined_peptide_column_valid:
+                    self.peptide_id_gain = cal_peptide_id_gain(peptide_df)
+                    log.info("Peptide statistics loaded successfully.")
+
+            except Exception as e:
+                log.warning(f"Error parsing combined_peptide.tsv: {e}")
+
+        # Parse combined_ion.tsv for MS/MS counts
+        if self.fragpipe_files.get("combined_ion"):
+            try:
+                combined_ion_path = self.fragpipe_files["combined_ion"][0]
+                combined_ion_df, combined_ion_column_valid = combined_ion_reader(combined_ion_path)
+                log.info("Combined ion data loaded successfully.")
+
+                if combined_ion_df is not None and combined_ion_column_valid:
+                    self.msms_counts = get_msms_counts_per_peak(combined_ion_df)
+                    log.info("MS/MS counts per peak loaded successfully.")
+            except Exception as e:
+                log.warning(f"Error parsing combined_ion.tsv: {e}")
+
         return True
 
     def draw_plots(self):
 
         log.info("Starting to process plotting data...")
 
+        # Parameters table (from workflow file)
+        if self.parameters_table:
+            self.draw_parameters(
+                sub_section=self.sub_sections["experiment"],
+                parameter_table=self.parameters_table
+            )
+
+        # Experiment design table (from manifest file)
+        if self.experiment_design:
+            self.draw_experiment_design(
+                sub_section=self.sub_sections["experiment"],
+                exp_design=self.experiment_design
+            )
+
         # Delta Mass
         if self.delta_masses:
             self.draw_delta_mass(
@@ -142,6 +273,15 @@
             charge_states=self.charge_states
         )
 
+        if self.peptide_id_gain:
+
+            self.peptide_id_count_no_gain = False
+            draw_evidence_peptide_id_count(
+                self.sub_sections["identification"],
+                self.peptide_id_gain,
+                "fragpipe"
+            )
+
         if self.pipeline_stats:
 
             # Statistics
@@ -176,6 +316,7 @@
             draw_identification(
                 self.sub_sections["identification"],
                 cal_num_table_data=statistics_result,
+                draw_peptide_id_count=self.peptide_id_count_no_gain
             )
 
         # Peptide Intensity Distribution
@@ -238,7 +379,32 @@
             intensity_data=self.ion_intensity_data
         )
 
+        # Protein intensity distribution from combined_protein.tsv
+        if self.protein_intensity_distribution:
+            self.draw_protein_intensity_distribution(
+                sub_section=self.sub_sections["quantification"],
+                sample_distribution=self.protein_intensity_distribution,
+                contam_distribution=self.protein_contam_distribution
+            )
+
+        # MBR (Match-Between-Runs) visualization
+        # if self.mbr_stats:
+        #     self.draw_mbr_contribution(
+        #         sub_section=self.sub_sections["identification"],
+        #         mbr_stats=self.mbr_stats
+        #     )
+
+        # MS/MS counts per peak from combined_ion.tsv
+        if self.msms_counts:
+            draw_oversampling(
+                self.sub_sections["ms2"],
+                self.msms_counts,
+                "",
+                "fragpipe",
+            )
+
         section_group_dict = {
+            "experiment_sub_section": self.sub_sections["experiment"],
             "summary_sub_section": self.sub_sections["summary"],
             "identification_sub_section": self.sub_sections["identification"],
             "search_engine_sub_section": self.sub_sections["search_engine"],
@@ -248,6 +414,8 @@
             "mass_error_sub_section": self.sub_sections["mass_error"],
             "rt_qc_sub_section": self.sub_sections["rt_qc"],
         }
+        # Filter out None values from section_group_dict
+        section_group_dict = {k: v for k, v in section_group_dict.items() if v is not None}
 
         add_group_modules(section_group_dict, "")
 
@@ -854,6 +1022,302 @@ def draw_ion_intensity_distribution(sub_section, intensity_data: dict):
 
         log.info("Ion intensity distribution plot generated.")
 
+    @staticmethod
+    def draw_parameters(sub_section, parameter_table: dict):
+        """
+        Draw FragPipe parameters table.
+
+        Parameters
+        ----------
+        sub_section : dict
+            Section to add the table to.
+        parameter_table : dict
+            Dictionary containing parameter name/value pairs.
+        """
+        if not parameter_table:
+            log.info("No parameters table data available.")
+            return
+
+        log.info(f"Drawing parameters table with {len(parameter_table)} parameters")
+
+        draw_config = {
+            "namespace": "",
+            "id": "fragpipe_parameters",
+            "title": "FragPipe Parameters",
+            "save_file": False,
+            "sort_rows": False,
+            "only_defined_headers": True,
+            "col1_header": "No.",
+            "no_violin": True,
+            "save_data_file": False,
+        }
+
+        headers = {
+            "parameter": {
+                "title": "Parameter",
+                "scale": False
+            },
+            "value": {
+                "title": "Value",
+                "scale": False
+            }
+        }
+
+        table_html = table.plot(data=parameter_table, headers=headers, pconfig=draw_config)
+
+        add_sub_section(
+            sub_section=sub_section,
+            plot=table_html,
+            order=1,
+            description="This table presents the parameters used in the FragPipe analysis.",
+            helptext="""
+            The FragPipe parameters table, extracted from fragpipe.workflow, summarizes the settings
+            used for the FragPipe analysis. Key parameters include the FragPipe version, search
+            engine settings (enzyme, mass tolerances), modifications, the database used, and
+            IonQuant settings such as Match-Between-Runs (MBR) and normalization options.
+            """,
+        )
+
+        log.info("FragPipe parameters table generated.")
+
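Reviewer note: `table.plot` here receives row-number-keyed dicts whose inner keys match the `headers` definition. A minimal, hypothetical example of the structure `get_workflow_parameters_table()` emits and `draw_parameters()` consumes (values invented for illustration):

```python
parameter_table = {
    1: {"parameter": "FragPipe Version", "value": "22.0"},
    2: {"parameter": "Enzyme", "value": "stricttrypsin"},
    3: {"parameter": "Max Missed Cleavages", "value": "2"},
    4: {"parameter": "Match Between Runs (MBR)", "value": "true"},
}
```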
+ """ + if not exp_design: + log.info("No experiment design data available.") + return + + log.info(f"Drawing experiment design table with {len(exp_design)} entries") + + draw_config = { + "namespace": "", + "id": "fragpipe_experiment_design", + "title": "Experimental Design", + "save_file": False, + "sort_rows": False, + "only_defined_headers": True, + "col1_header": "No.", + "no_violin": True, + "save_data_file": False, + } + + headers = { + "file_name": { + "title": "File Name", + "description": "Raw data file name", + "scale": False + }, + "experiment": { + "title": "Experiment", + "description": "Experiment/sample name", + "scale": False + }, + "bioreplicate": { + "title": "BioReplicate", + "description": "Biological replicate ID", + "scale": False + }, + "data_type": { + "title": "Data Type", + "description": "Data type (DDA/DIA)", + "scale": False + }, + } + + table_html = table.plot(data=exp_design, headers=headers, pconfig=draw_config) + + add_sub_section( + sub_section=sub_section, + plot=table_html, + order=2, + description="This table shows the experimental design extracted from the FragPipe manifest file.", + helptext=""" + The experiment design table shows which raw files belong to which experiment + and biological replicate. This information is extracted from the FragPipe + manifest file (fp-manifest). + """, + ) + + log.info("Experiment design table generated.") + + @staticmethod + def draw_protein_intensity_distribution(sub_section, sample_distribution: dict, contam_distribution: dict = None): + """ + Draw protein intensity distribution box plot from combined_protein.tsv. + + Parameters + ---------- + sub_section : dict + Section to add the plot to. + sample_distribution : dict + Dictionary mapping sample names to log2-transformed intensity values. + contam_distribution : dict, optional + Dictionary mapping sample names to contaminant intensity values. + """ + if not sample_distribution: + log.info("No protein intensity distribution data available.") + return + + log.info(f"Drawing protein intensity distribution for {len(sample_distribution)} samples") + + # Prepare data for box plot - can include sample and contaminant distributions + if contam_distribution and len(contam_distribution) > 0: + distribution_box = [sample_distribution, contam_distribution] + boxplot_labels = ["Sample", "Contaminants"] + else: + distribution_box = [sample_distribution] + boxplot_labels = ["Sample"] + + draw_config = { + "id": "protein_intensity_distribution_box", + "cpswitch": False, + "cpswitch_c_active": False, + "title": "Protein Intensity Distribution", + "tt_decimals": 2, + "data_labels": boxplot_labels, + "xlab": "log2(Intensity)", + "save_data_file": False, + } + + box_html = box.plot(list_of_data_by_sample=distribution_box, pconfig=draw_config) + + box_html = plot_data_check( + plot_data=distribution_box, + plot_html=box_html, + log_text="pmultiqc.modules.fragpipe.fragpipe", + function_name="draw_protein_intensity_distribution" + ) + box_html = plot_html_check(box_html) + + add_sub_section( + sub_section=sub_section, + plot=box_html, + order=2, + description="Protein intensity distribution from combined_protein.tsv.", + helptext=""" + [FragPipe: combined_protein.tsv] This plot shows the log2-transformed protein + intensity distribution for each sample. The combined_protein.tsv file contains + protein-level quantification data from IonQuant. + + For label-free experiments, each box represents the MaxLFQ intensity distribution. 
+ For TMT experiments, intensity values from TMT channels are shown. + + A higher median intensity and narrower distribution typically indicate better + quantification quality. Large differences between samples may indicate + normalization issues or batch effects. + + Contaminant proteins (when available) are shown separately to help assess + the level of contamination in each sample. + """, + ) + + log.info("Protein intensity distribution plot generated.") + + @staticmethod + def draw_mbr_contribution(sub_section, mbr_stats: dict): + """ + Draw Match-Between-Runs (MBR) contribution visualization. + + Parameters + ---------- + sub_section : dict + Section to add the plot to. + mbr_stats : dict + Dictionary containing MBR statistics per sample. + """ + if not mbr_stats: + log.info("No MBR statistics available.") + return + + # Check if we have any meaningful MBR data + has_data = False + for sample, stats in mbr_stats.items(): + proteins = stats.get('proteins', {}) + if proteins.get('mbr_only', 0) > 0 or proteins.get('both', 0) > 0: + has_data = True + break + + if not has_data: + log.info("No meaningful MBR data found (all MBR counts are 0).") + return + + log.info(f"Drawing MBR contribution plot for {len(mbr_stats)} samples") + + # Prepare data for stacked bar chart + # Categories: MS/MS only (identified by MS/MS), MBR only (transferred), Both (MS/MS + MBR) + plot_data = {} + for sample, stats in mbr_stats.items(): + proteins = stats.get('proteins', {}) + plot_data[sample] = { + "MS/MS Only": proteins.get('msms_only', 0), + "MS/MS + MBR": proteins.get('both', 0), + "MBR Only": proteins.get('mbr_only', 0), + } + + cats = [ + { + "name": "MS/MS Only", + "color": "#1f77b4", + "description": "Proteins identified only by MS/MS" + }, + { + "name": "MS/MS + MBR", + "color": "#2ca02c", + "description": "Proteins identified by both MS/MS and MBR" + }, + { + "name": "MBR Only", + "color": "#ff7f0e", + "description": "Proteins identified only by Match-Between-Runs" + } + ] + + draw_config = { + "id": "mbr_protein_contribution", + "cpswitch": True, + "title": "MBR Protein Contribution", + "tt_decimals": 0, + "ylab": "Protein Count", + "stacking": "normal", + "save_data_file": False, + } + + bar_html = bargraph.plot(data=plot_data, cats=cats, pconfig=draw_config) + + bar_html = plot_html_check(bar_html) + + add_sub_section( + sub_section=sub_section, + plot=bar_html, + order=7, + description="Match-Between-Runs (MBR) contribution to protein identification.", + helptext=""" + [FragPipe: combined_protein.tsv] This plot shows the contribution of + Match-Between-Runs (MBR) to protein identification counts. + + **MS/MS Only**: Proteins identified only through direct MS/MS identification. + **MS/MS + MBR**: Proteins with both MS/MS identification and MBR transfer. + **MBR Only**: Proteins identified only through MBR transfer from other runs. + + A high 'MBR Only' count indicates significant gain from MBR, but also + potential false positives. Ideally, most proteins should be in the + 'MS/MS Only' or 'MS/MS + MBR' categories. + + If MBR is not enabled in IonQuant, this plot will not be shown. 
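Reviewer note: when contaminants are present, `box.plot` is called with two parallel datasets that become switchable panels via `data_labels`. A minimal sketch of the expected input (log2 intensities; numbers are hypothetical):

```python
sample_distribution = {"sample_A": [24.1, 25.3, 26.0], "sample_B": [23.8, 25.0, 25.9]}
contam_distribution = {"sample_A": [28.4], "sample_B": [27.9]}

# Two entries in list_of_data_by_sample -> two panels labelled
# "Sample" and "Contaminants" by the data_labels config above.
distribution_box = [sample_distribution, contam_distribution]
```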
+ """, + ) + + log.info("MBR contribution plot generated.") + def _calculate_statistics(pipeline_stats: list): diff --git a/pmultiqc/modules/fragpipe/fragpipe_io.py b/pmultiqc/modules/fragpipe/fragpipe_io.py index 09560203..63f9fd70 100644 --- a/pmultiqc/modules/fragpipe/fragpipe_io.py +++ b/pmultiqc/modules/fragpipe/fragpipe_io.py @@ -18,9 +18,33 @@ ], "ion": [ "Peptide Sequence", "Modified Sequence", "Charge", "Protein", "Intensity" + ], + "combined_protein": [ + "Protein", "Protein ID", "Entry Name", "Gene", "Protein Length", + "Combined Total Peptides", "Combined Spectral Count", "Combined Unique Spectral Count", + "Combined Total Spectral Count" + ], + "combined_peptide": [ + "Peptide", "Peptide Length", "Charges", "Protein", "Protein Start", "Protein End", + "Combined Spectral Count" + ], + "combined_ion": [ + "Peptide Sequence", "Modified Sequence", "Charge", "Protein", "Gene", "Assigned Modifications" ] } +REQUIRED_KEYWORDS = { + "combined_ion": { + "Spectral Count": False, + "Match Type": False, + "Intensity": False + }, + "combined_peptide": { + "Match Type": False, + "Intensity": False + }, +} + # FragPipe File Paths def get_fragpipe_files(find_log_files): @@ -35,31 +59,60 @@ def get_fragpipe_files(find_log_files): # combined_modified_peptide.tsv (from IonQuant) # combined_peptide.tsv (from Philosopher, overwritten by IonQuant) # combined_protein.tsv (from Philosopher, overwritten by IonQuant) + # fragpipe.workflow (FragPipe configuration parameters) + # fragpipe-files.fp-manifest (experiment design/manifest) # diann-output files (see DIA-NN documentation) - required_files = ["psm", "ion"] - req_set = set(required_files) - - fragpipe_files = {req: [] for req in required_files} + # Define all file types to look for + file_types = [ + "psm", "ion", "combined_protein", "combined_peptide", "combined_ion", + "workflow", "manifest", "fragger_params" + ] + fragpipe_files = {ft: [] for ft in file_types} # FragPipe *tsv Data for file_info in find_log_files("pmultiqc/tsv", filecontents=False): filename = file_info["fn"] full_path = os.path.join(file_info["root"], filename) - for req in req_set: - # Match exact file names to avoid conflicts - # e.g., "ion.tsv" should not match "combined_ion.tsv" - if req == "ion" and filename == "ion.tsv": - fragpipe_files[req].append(full_path) - elif req == "psm" and "psm" in filename: - fragpipe_files[req].append(full_path) + # Match exact file names to avoid conflicts + # e.g., "ion.tsv" should not match "combined_ion.tsv" + if filename == "ion.tsv": + fragpipe_files["ion"].append(full_path) + elif filename == "combined_ion.tsv": + fragpipe_files["combined_ion"].append(full_path) + elif filename == "combined_protein.tsv": + fragpipe_files["combined_protein"].append(full_path) + elif filename == "combined_peptide.tsv": + fragpipe_files["combined_peptide"].append(full_path) + elif "psm" in filename: + fragpipe_files["psm"].append(full_path) + + # FragPipe workflow file (configuration parameters) + for file_info in find_log_files("pmultiqc/workflow", filecontents=False): + filename = file_info["fn"] + full_path = os.path.join(file_info["root"], filename) + if filename == "fragpipe.workflow": + fragpipe_files["workflow"].append(full_path) - if any(fragpipe_files.values()): + # FragPipe manifest file (experiment design) + for file_info in find_log_files("pmultiqc/fp-manifest", filecontents=False): + filename = file_info["fn"] + full_path = os.path.join(file_info["root"], filename) + fragpipe_files["manifest"].append(full_path) + + # MSFragger params file 
(search engine parameters) + for file_info in find_log_files("pmultiqc/fragger_params", filecontents=False): + filename = file_info["fn"] + full_path = os.path.join(file_info["root"], filename) + if filename == "fragger.params": + fragpipe_files["fragger_params"].append(full_path) + if any(fragpipe_files.values()): for k, v in fragpipe_files.items(): - log.info(f"FragPipe data loaded: {k} ({len(v)} files).") - log.debug(f"FragPipe data loaded: {k}: {v}") + if v: + log.info(f"FragPipe data loaded: {k} ({len(v)} files).") + log.debug(f"FragPipe data loaded: {k}: {v}") return fragpipe_files @@ -237,3 +290,684 @@ def extract_sample_groups(sample_cols): return sample_groups + +def workflow_reader(file_path: str): + """ + Read fragpipe.workflow file containing FragPipe configuration parameters. + + The workflow file is a key=value format file containing all parameters + used in the FragPipe analysis. + + Parameters + ---------- + file_path : str + Path to the fragpipe.workflow file. + + Returns + ------- + dict + Dictionary containing parameter name -> value pairs. + """ + parameters = {} + + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + if '=' in line: + key, value = line.split('=', 1) + parameters[key.strip()] = value.strip() + except Exception as e: + log.warning(f"Error reading workflow file {file_path}: {e}") + return None + + log.info(f"Loaded {len(parameters)} parameters from workflow file") + return parameters + + +def fragger_params_reader(file_path: str): + """ + Read fragger.params file containing MSFragger search engine parameters. + + The fragger.params file is a key=value format file with MSFragger-specific + search parameters including mass tolerances, enzyme settings, and modifications. + + Parameters + ---------- + file_path : str + Path to the fragger.params file. + + Returns + ------- + dict + Dictionary containing parameter name -> value pairs. + """ + parameters = {} + + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + # Skip empty lines and comments + if not line or line.startswith('#'): + continue + # fragger.params uses "key = value" format with spaces + if '=' in line: + key, value = line.split('=', 1) + parameters[key.strip()] = value.strip() + except Exception as e: + log.warning(f"Error reading fragger.params file {file_path}: {e}") + return None + + log.info(f"Loaded {len(parameters)} parameters from fragger.params file") + return parameters + + +def get_workflow_parameters_table(parameters: dict, fragger_params: dict = None): + """ + Convert workflow and fragger parameters to table format for display. + + Extracts key parameters relevant for QC reporting similar to MaxQuant. + Parameters from fragger.params can supplement or provide fallback for + workflow parameters. + + Parameters + ---------- + parameters : dict + Dictionary of workflow parameters (from fragpipe.workflow). + fragger_params : dict, optional + Dictionary of MSFragger parameters (from fragger.params). + + Returns + ------- + dict + Dictionary formatted for table display with parameter/value structure. 
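Reviewer note: a quick usage sketch of the readers on a synthetic file. The parameter names below come from the `key_params` list in the next function; everything else (temp file, values) is illustrative only:

```python
import tempfile

content = """
# FragPipe workflow (synthetic example)
fragpipe.version=22.0
msfragger.search_enzyme_name_1=stricttrypsin
ionquant.mbr=true
"""

with tempfile.NamedTemporaryFile("w", suffix=".workflow", delete=False) as f:
    f.write(content)
    path = f.name

params = workflow_reader(path)
# {'fragpipe.version': '22.0',
#  'msfragger.search_enzyme_name_1': 'stricttrypsin',
#  'ionquant.mbr': 'true'}
```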
+ """ + if not parameters and not fragger_params: + return None + + # Merge parameters - workflow takes precedence, fragger provides fallback + merged_params = {} + if fragger_params: + merged_params.update(fragger_params) + if parameters: + merged_params.update(parameters) + + # Key parameters to display (similar to MaxQuant parameters table) + # Format: (key_from_workflow, key_from_fragger, display_name) + key_params = [ + # FragPipe version + ("fragpipe.version", None, "FragPipe Version"), + # Search engine settings - workflow keys and fragger.params equivalents + ("msfragger.search_enzyme_name_1", "search_enzyme_name_1", "Enzyme"), + ("msfragger.search_enzyme_cut_1", "search_enzyme_cut_1", "Enzyme Cut Site"), + ("msfragger.allowed_missed_cleavage_1", "allowed_missed_cleavage_1", "Max Missed Cleavages"), + ("msfragger.precursor_mass_lower", "precursor_mass_lower", "Precursor Mass Tolerance (Lower)"), + ("msfragger.precursor_mass_upper", "precursor_mass_upper", "Precursor Mass Tolerance (Upper)"), + ("msfragger.precursor_mass_units", "precursor_mass_units", "Precursor Mass Units"), + ("msfragger.fragment_mass_tolerance", "fragment_mass_tolerance", "Fragment Mass Tolerance"), + ("msfragger.fragment_mass_units", "fragment_mass_units", "Fragment Mass Units"), + # Modifications - from workflow or fragger.params + ("msfragger.variable_mod_01", "variable_mod_01", "Variable Modification 1"), + ("msfragger.variable_mod_02", "variable_mod_02", "Variable Modification 2"), + ("msfragger.variable_mod_03", "variable_mod_03", "Variable Modification 3"), + # Database + ("database.db-path", "database_name", "Database Path"), + # IonQuant settings (workflow only) + ("ionquant.mbr", None, "Match Between Runs (MBR)"), + ("ionquant.normalization", None, "Normalization"), + ("ionquant.requantify", None, "Requantify"), + # TMT settings (workflow only) + ("tmtintegrator.channel_num", None, "TMT Channels"), + ("tmtintegrator.ref_tag", None, "TMT Reference Tag"), + # FDR + ("philosopher.filter--prot", None, "Protein FDR"), + ("philosopher.filter--pep", None, "Peptide FDR"), + ("philosopher.filter--psm", None, "PSM FDR"), + # Additional fragger.params specific settings + (None, "num_threads", "Number of Threads"), + (None, "decoy_prefix", "Decoy Prefix"), + (None, "isotope_error", "Isotope Error"), + (None, "mass_offsets", "Mass Offsets"), + (None, "precursor_true_tolerance", "Precursor True Tolerance"), + (None, "precursor_true_units", "Precursor True Units"), + (None, "calibrate_mass", "Calibrate Mass"), + (None, "clip_nTerm_M", "Clip N-term Met"), + (None, "digest_min_length", "Min Peptide Length"), + (None, "digest_max_length", "Max Peptide Length"), + ] + + table_data = {} + row_num = 1 + + for workflow_key, fragger_key, display_name in key_params: + value = None + + # Try workflow key first + if workflow_key and workflow_key in merged_params: + value = merged_params[workflow_key] + # Fallback to fragger key + elif fragger_key and fragger_key in merged_params: + value = merged_params[fragger_key] + + # Clean up value for display + if value and value != "null" and str(value).strip(): + value = str(value).strip() + # Extract filename from paths + if display_name == "Database Path" and ("/" in value or "\\" in value): + value = os.path.basename(value.replace("\\", "/")) + # Skip empty modification slots + if "Modification" in display_name and (not value or value == "0.0000 X 0"): + continue + table_data[row_num] = { + "parameter": display_name, + "value": value + } + row_num += 1 + + if not table_data: + return 
+def manifest_reader(file_path: str):
+    """
+    Read FragPipe manifest file containing experiment design information.
+
+    The manifest file (fp-manifest) contains file paths and experimental
+    design information like sample names, groups, and data types.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the manifest file.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing experiment design information.
+    """
+    try:
+        # Manifest file is tab-separated with columns:
+        # file_path, experiment, bioreplicate, data_type (optional)
+        manifest_df = pd.read_csv(file_path, sep='\t', header=None)
+
+        # Assign column names based on number of columns
+        if len(manifest_df.columns) >= 4:
+            manifest_df.columns = ['file_path', 'experiment', 'bioreplicate', 'data_type'] + \
+                [f'col_{i}' for i in range(4, len(manifest_df.columns))]
+        elif len(manifest_df.columns) == 3:
+            manifest_df.columns = ['file_path', 'experiment', 'bioreplicate']
+        elif len(manifest_df.columns) == 2:
+            manifest_df.columns = ['file_path', 'experiment']
+        else:
+            manifest_df.columns = ['file_path']
+
+        # Extract filename from path for display
+        if 'file_path' in manifest_df.columns:
+            manifest_df['file_name'] = manifest_df['file_path'].apply(
+                lambda x: os.path.basename(str(x).replace("\\", "/"))
+            )
+
+        log.info(f"Loaded manifest with {len(manifest_df)} entries")
+        return manifest_df
+
+    except Exception as e:
+        log.warning(f"Error reading manifest file {file_path}: {e}")
+        return None
+
+
+def get_experiment_design_table(manifest_df: pd.DataFrame):
+    """
+    Convert manifest DataFrame to experiment design table format.
+
+    Parameters
+    ----------
+    manifest_df : pd.DataFrame
+        DataFrame from manifest_reader.
+
+    Returns
+    -------
+    dict
+        Dictionary formatted for experiment design table display.
+    """
+    if manifest_df is None or manifest_df.empty:
+        return None
+
+    table_data = {}
+
+    for idx, row in manifest_df.iterrows():
+        file_name = row.get('file_name', row.get('file_path', f'File_{idx}'))
+        entry = {
+            "file_name": file_name,
+        }
+        if 'experiment' in row:
+            entry["experiment"] = row['experiment']
+
+        if 'bioreplicate' in row and pd.notna(row['bioreplicate']) and row['bioreplicate'] != '':
+            entry["bioreplicate"] = row['bioreplicate']
+        else:
+            entry["bioreplicate"] = "-"
+
+        if 'data_type' in row:
+            entry["data_type"] = row['data_type']
+
+        table_data[idx + 1] = entry
+
+    return table_data
+
+
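Reviewer note: an end-to-end sketch of the manifest path using a synthetic three-column, headerless manifest (Windows-style raw paths, invented names):

```python
import tempfile

manifest = "D:\\raw\\sample_A.raw\tcontrol\t1\nD:\\raw\\sample_B.raw\ttreated\t1\n"
with tempfile.NamedTemporaryFile("w", suffix=".fp-manifest", delete=False) as f:
    f.write(manifest)
    path = f.name

df = manifest_reader(path)   # columns: file_path, experiment, bioreplicate, file_name
design = get_experiment_design_table(df)
# {1: {'file_name': 'sample_A.raw', 'experiment': 'control', 'bioreplicate': 1},
#  2: {'file_name': 'sample_B.raw', 'experiment': 'treated', 'bioreplicate': 1}}
```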
+def combined_protein_reader(file_path: str):
+    """
+    Read combined_protein.tsv file from FragPipe output.
+
+    The combined_protein.tsv file contains protein-level quantification
+    data including MBR (Match Between Runs) information and per-sample
+    intensity columns.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the combined_protein.tsv file.
+
+    Returns
+    -------
+    tuple
+        (protein_df, sample_intensity_cols) where:
+        - protein_df: DataFrame with protein-level data
+        - sample_intensity_cols: List of sample intensity column names
+    """
+    try:
+        protein_df = pd.read_csv(file_path, sep="\t", low_memory=False)
+    except Exception as e:
+        log.warning(f"Error reading combined_protein.tsv: {e}")
+        return None, []
+
+    if protein_df.empty:
+        log.warning("combined_protein.tsv is empty")
+        return None, []
+
+    log.info(f"Loaded combined_protein.tsv with {len(protein_df)} proteins and {len(protein_df.columns)} columns")
+
+    # Identify sample intensity columns
+    # These typically follow patterns like "Sample MaxLFQ Intensity" or just sample names
+    sample_intensity_cols = []
+
+    # Look for MaxLFQ or Intensity columns per sample
+    for col in protein_df.columns:
+        col_lower = col.lower()
+        if 'maxlfq' in col_lower or (
+            'intensity' in col_lower and 'combined' not in col_lower
+        ):
+            # Skip metadata columns
+            if not any(skip in col_lower for skip in ['total', 'spectral', 'razor']):
+                sample_intensity_cols.append(col)
+
+    # Look for MBR-related columns
+    # FragPipe uses columns like "Sample Spectral Count" with MBR info
+    # spectral_cols = [col for col in protein_df.columns if 'spectral count' in col.lower()]
+
+    # Identify unique vs total spectral counts which can indicate MBR contribution
+    # for col in spectral_cols:
+    #     sample_base = col.replace(' Spectral Count', '').replace(' Total Spectral Count', '').strip()
+    #     if sample_base not in mbr_cols:
+    #         mbr_cols[sample_base] = {}
+    #     if 'unique' in col.lower():
+    #         mbr_cols[sample_base]['unique'] = col
+    #     elif 'total' in col.lower():
+    #         mbr_cols[sample_base]['total'] = col
+    #     else:
+    #         mbr_cols[sample_base]['spectral'] = col
+
+    log.info(f"Found {len(sample_intensity_cols)} sample intensity columns")
+
+    return protein_df, sample_intensity_cols
+
+
+def get_protein_intensity_distribution(protein_df, sample_cols, contam_affix="CONT"):
+    """
+    Extract protein intensity distribution data for QC plots.
+
+    Parameters
+    ----------
+    protein_df : pd.DataFrame
+        DataFrame from combined_protein_reader.
+    sample_cols : list
+        List of sample intensity column names.
+    contam_affix : str
+        Contaminant identifier prefix.
+
+    Returns
+    -------
+    tuple
+        (sample_distribution, contaminant_distribution) - log2-transformed intensity distributions.
+    """
+    if protein_df is None or not sample_cols:
+        return None, None
+
+    sample_distribution = {}
+    contaminant_distribution = {}
+
+    # Identify protein column
+    protein_col = None
+    for col in ['Protein', 'Protein ID', 'Protein Group']:
+        if col in protein_df.columns:
+            protein_col = col
+            break
+
+    # Separate contaminants
+    if protein_col:
+        is_contaminant = protein_df[protein_col].str.contains(contam_affix, na=False, case=False)
+        sample_df = protein_df[~is_contaminant]
+        contam_df = protein_df[is_contaminant]
+    else:
+        sample_df = protein_df
+        contam_df = pd.DataFrame()
+
+    for col in sample_cols:
+        if col not in sample_df.columns:
+            continue
+
+        # Coerce to numeric so isolated missing or non-numeric cells do not
+        # disqualify the whole column; invalid entries become NaN and are dropped.
+        intensities = pd.to_numeric(sample_df[col], errors='coerce').dropna()
+        valid_intensities = intensities[intensities > 0]
+        if len(valid_intensities) > 0:
+            sample_distribution[col] = np.log2(valid_intensities).tolist()
+
+        # Contaminant intensities
+        if not contam_df.empty and col in contam_df.columns:
+            cont_intensities = pd.to_numeric(contam_df[col], errors='coerce').dropna()
+            valid_cont = cont_intensities[cont_intensities > 0]
+            if len(valid_cont) > 0:
+                contaminant_distribution[col] = np.log2(valid_cont).tolist()
+
+    return sample_distribution, contaminant_distribution
+
+
+def combined_peptide_reader(file_path: str):
+    """
+    Read combined_peptide.tsv file from FragPipe output.
+
+    The combined_peptide.tsv file contains peptide-level quantification
+    data including MBR information.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the combined_peptide.tsv file.
+
+    Returns
+    -------
+    tuple
+        (peptide_df, validate_columns) where:
+        - peptide_df: DataFrame with peptide-level data
+        - validate_columns: True if the required column keywords are present
+    """
+    try:
+        peptide_df = pd.read_csv(file_path, sep="\t", low_memory=False)
+    except Exception as e:
+        log.warning(f"Error reading combined_peptide.tsv: {e}")
+        return None, False
+
+    if peptide_df.empty:
+        log.warning("combined_peptide.tsv is empty")
+        return None, False
+
+    log.info(f"Loaded combined_peptide.tsv with {len(peptide_df)} peptides")
+
+    validate_columns = validate_columns_existence(
+        df_columns=peptide_df.columns,
+        data_name="combined_peptide"
+    )
+
+    return peptide_df, validate_columns
+
+
+def get_mbr_stats(protein_df, peptide_df, sample_cols):
+    """
+    Calculate Match-Between-Runs (MBR) statistics.
+
+    Analyzes identification types to determine MBR contribution
+    to protein and peptide identification counts.
+
+    Parameters
+    ----------
+    protein_df : pd.DataFrame
+        DataFrame from combined_protein_reader.
+    peptide_df : pd.DataFrame
+        DataFrame from combined_peptide_reader.
+    sample_cols : list
+        List of sample column names.
+
+    Returns
+    -------
+    dict
+        Dictionary containing MBR statistics per sample with:
+        - 'proteins': dict with msms_only, mbr_only, both counts
+        - 'peptides': dict with msms_only, mbr_only, both counts
+    """
+    mbr_stats = {}
+
+    if protein_df is None and peptide_df is None:
+        return mbr_stats
+
+    # For FragPipe, we look at spectral counts to infer MBR contribution:
+    # proteins/peptides with spectral count > 0 are MS/MS identified;
+    # proteins/peptides that are quantified but have 0 spectral count may be MBR.
+
+    for sample in sample_cols:
+        mbr_stats[sample] = {
+            'proteins': {'msms_only': 0, 'mbr_only': 0, 'both': 0},
+            'peptides': {'msms_only': 0, 'mbr_only': 0, 'both': 0}
+        }
+
+        # Protein-level MBR stats
+        if protein_df is not None:
+            spectral_col = None
+            intensity_col = sample
+
+            # Find corresponding spectral count column
+            for col in protein_df.columns:
+                if sample.replace(' MaxLFQ Intensity', '').replace(' Intensity', '') in col:
+                    if 'spectral count' in col.lower():
+                        spectral_col = col
+                        break
+
+            if spectral_col and intensity_col in protein_df.columns:
+                has_spectral = protein_df[spectral_col] > 0
+                has_intensity = protein_df[intensity_col] > 0
+
+                msms_only = ((has_spectral) & (~has_intensity)).sum()
+                mbr_only = ((~has_spectral) & (has_intensity)).sum()
+                both = ((has_spectral) & (has_intensity)).sum()
+
+                mbr_stats[sample]['proteins'] = {
+                    'msms_only': int(msms_only),
+                    'mbr_only': int(mbr_only),
+                    'both': int(both)
+                }
+
+    return mbr_stats
+
+
+def combined_ion_reader(file_path: str):
+    """
+    Read combined_ion.tsv file from FragPipe output.
+
+    The combined_ion.tsv file contains ion-level quantification data
+    across all samples, suitable for MS/MS counts per peak analysis.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the combined_ion.tsv file.
+
+    Returns
+    -------
+    tuple
+        (ion_df, validate_columns) where:
+        - ion_df: DataFrame with ion-level data
+        - validate_columns: True if the required column keywords are present
+    """
+    try:
+        ion_df = pd.read_csv(file_path, sep="\t", low_memory=False)
+    except Exception as e:
+        log.warning(f"Error reading combined_ion.tsv: {e}")
+        return None, False
+
+    if ion_df.empty:
+        log.warning("combined_ion.tsv is empty")
+        return None, False
+
+    log.info(f"Loaded combined_ion.tsv with {len(ion_df)} ions and {len(ion_df.columns)} columns")
+
+    validate_columns = validate_columns_existence(
+        df_columns=ion_df.columns,
+        data_name="combined_ion"
+    )
+
+    return ion_df, validate_columns
+
+
+def validate_columns_existence(df_columns, data_name: str):
+
+    col_list = list(df_columns)
+
+    # Work on a copy so repeated calls do not mutate the shared
+    # module-level REQUIRED_KEYWORDS dictionary.
+    required_keywords = {key: False for key in REQUIRED_KEYWORDS[data_name]}
+
+    for col in col_list:
+        for keyword in required_keywords:
+            if keyword == "Intensity":
+                # MaxLFQ columns do not count as raw intensity columns
+                if "Intensity" in col and "MaxLFQ" not in col:
+                    required_keywords["Intensity"] = True
+            elif keyword in col:
+                required_keywords[keyword] = True
+
+    all_passed = all(required_keywords.values())
+
+    log.info(f"Check whether the data {data_name} meets the extraction requirements.")
+
+    for key, found in required_keywords.items():
+        status = "exists" if found else "is missing"
+        log.info(f"{key}: {status}")
+
+    return all_passed
+
+
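Reviewer note: a quick check of the validator on combined_ion-style columns (synthetic names); MaxLFQ columns deliberately do not satisfy the Intensity requirement:

```python
cols = [
    "Peptide Sequence",
    "s1 Spectral Count",
    "s1 Match Type",
    "s1 Intensity",
    "s1 MaxLFQ Intensity",  # ignored for the Intensity keyword
]
validate_columns_existence(cols, "combined_ion")  # -> True

# Only a MaxLFQ column present: no Match Type, no raw Intensity.
validate_columns_existence(["s1 MaxLFQ Intensity"], "combined_peptide")  # -> False
```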
+ """ + if ion_df is None: + return {} + + df = ion_df.copy() + + samples = [col.replace(' Match Type', '') for col in df.columns if ' Match Type' in col] + + plot_data = [] + + for s in samples: + spec_col = f"{s} Spectral Count" + match_col = f"{s} Match Type" + int_col = f"{s} Intensity" + + sample_df = df[df[int_col] > 0].copy() + + msms_dist = sample_df[sample_df[match_col] == 'MS/MS'][spec_col].value_counts().to_dict() + + for count, freq in msms_dist.items(): + plot_data.append({'run': s, 'ms/ms_count': int(count), 'peptide_count': freq}) + + res_df = pd.DataFrame(plot_data) + res_df["ms/ms_count"] = res_df["ms/ms_count"].apply( + lambda x: ">=3" if x >= 3 else x + ) + res_df = res_df.groupby(['run', 'ms/ms_count'])['peptide_count'].sum().reset_index() + + res_df["ms/ms_count"] = res_df["ms/ms_count"].astype(str) + + plot_dict = {} + for raw_file, group in res_df.groupby("run"): + group["freq"] = group["peptide_count"] / group["peptide_count"].sum() * 100 + plot_dict[raw_file] = dict(zip(group["ms/ms_count"], group["freq"])) + + oversampling = { + "plot_data": plot_dict, + "cats": list(res_df["ms/ms_count"].unique()) + } + + return oversampling + + +def cal_peptide_id_gain(df): + df = df.copy() + + samples = [col.replace(' Match Type', '') for col in df.columns if ' Match Type' in col] + + peptide_counts = [] + + for s in samples: + match_col = f"{s} Match Type" + int_col = f"{s} Intensity" + + sample_df = df[df[int_col] > 0].copy() + + match_type_count = sample_df[match_col].value_counts().to_dict() + + ms_count = 0 + mbr_count = 0 + for match_type, count in match_type_count.items(): + if match_type == "MS/MS": + ms_count = count + elif match_type == "MBR": + mbr_count = count + peptide_counts.append({'run': s, "ms/ms_count": int(ms_count), 'mbr': int(mbr_count)}) + + count_df = pd.DataFrame(peptide_counts) + + denom = count_df["ms/ms_count"].replace(0, np.nan) + count_df["MBRgain"] = (count_df["mbr"] / denom) * 100 + count_df["MBRgain"] = count_df["MBRgain"].fillna(0) + + temp_df = count_df[['run', 'ms/ms_count', 'mbr']].rename( + columns={'ms/ms_count': 'MS/MS', 'mbr': 'MBR'} + ) + plot_data = temp_df.set_index('run').to_dict(orient='index') + + mbr_gain = round(count_df["MBRgain"].mean(), 2) + title_value = f"MBR gain: +{mbr_gain}%" if mbr_gain is not None else "" + + return { + "plot_data": plot_data, + "cats": ["MS/MS", "MBR"], + "title_value": title_value + } + diff --git a/pmultiqc/modules/maxquant/maxquant.py b/pmultiqc/modules/maxquant/maxquant.py index 9695473b..0cc18ca9 100644 --- a/pmultiqc/modules/maxquant/maxquant.py +++ b/pmultiqc/modules/maxquant/maxquant.py @@ -459,6 +459,7 @@ def _draw_identification_plots(self): maxquant_plots.draw_evidence_peptide_id_count, self.sub_sections["identification"], self.mq_results["get_evidence_dicts"].get("peptide_id_count"), + "maxquant", error_name="draw_evidence_peptide_id_count" ) @@ -475,7 +476,7 @@ def _draw_identification_plots(self): self.sub_sections["ms2"], self.mq_results["get_evidence_dicts"].get("oversampling"), "", - True, + "maxquant", error_name="draw_oversampling" ) diff --git a/pmultiqc/modules/maxquant/maxquant_plots.py b/pmultiqc/modules/maxquant/maxquant_plots.py index 523d84de..a60e247a 100644 --- a/pmultiqc/modules/maxquant/maxquant_plots.py +++ b/pmultiqc/modules/maxquant/maxquant_plots.py @@ -419,7 +419,7 @@ def draw_pg_pca(sub_section, pca_data, fig_type): # Peptide ID Count -def draw_evidence_peptide_id_count(sub_section, peptide_id_count_data): +def draw_evidence_peptide_id_count(sub_section, 
diff --git a/pmultiqc/modules/maxquant/maxquant.py b/pmultiqc/modules/maxquant/maxquant.py
index 9695473b..0cc18ca9 100644
--- a/pmultiqc/modules/maxquant/maxquant.py
+++ b/pmultiqc/modules/maxquant/maxquant.py
@@ -459,6 +459,7 @@ def _draw_identification_plots(self):
             maxquant_plots.draw_evidence_peptide_id_count,
             self.sub_sections["identification"],
             self.mq_results["get_evidence_dicts"].get("peptide_id_count"),
+            "maxquant",
             error_name="draw_evidence_peptide_id_count"
         )
 
@@ -475,7 +476,7 @@ def _draw_identification_plots(self):
             self.sub_sections["ms2"],
             self.mq_results["get_evidence_dicts"].get("oversampling"),
             "",
-            True,
+            "maxquant",
             error_name="draw_oversampling"
         )
diff --git a/pmultiqc/modules/maxquant/maxquant_plots.py b/pmultiqc/modules/maxquant/maxquant_plots.py
index 523d84de..a60e247a 100644
--- a/pmultiqc/modules/maxquant/maxquant_plots.py
+++ b/pmultiqc/modules/maxquant/maxquant_plots.py
@@ -419,7 +419,7 @@ def draw_pg_pca(sub_section, pca_data, fig_type):
 
 
 # Peptide ID Count
-def draw_evidence_peptide_id_count(sub_section, peptide_id_count_data):
+def draw_evidence_peptide_id_count(sub_section, peptide_id_count_data, data_type: str):
 
     if peptide_id_count_data["title_value"]:
         fig_title = "Peptide ID Count" + " [" + peptide_id_count_data["title_value"] + "]"
@@ -443,14 +443,11 @@ def draw_evidence_peptide_id_count(sub_section, peptide_id_count_data):
 
     bar_html = plot_html_check(bar_html)
 
-    add_sub_section(
-        sub_section=sub_section,
-        plot=bar_html,
-        order=4,
-        description="""
+    if data_type == "maxquant":
+        description_text = """
         [Excludes Contaminants] Number of unique (i.e. not counted twice) peptide
         sequences including modifications (after FDR) per Raw file.
-        """,
-        helptext="""
+        """
+        help_text = """
         If MBR was enabled, three categories ('Genuine (Exclusive)', 'Genuine + Transferred', 'Transferred (Exclusive)'
         are shown, so the user can judge the gain that MBR provides.
@@ -463,7 +460,22 @@
         If MBR would be switched off, you can expect to see the number of peptides corresponding to 'Genuine (Exclusive)' +
         'Genuine + Transferred'. In general, if the MBR gain is low and the MBR scores are bad (see the two MBR-related metrics),
         MBR should be switched off for the Raw files which are affected (could be a few or all).
-        """,
+        """
+
+    elif data_type == "fragpipe":
+        description_text = """
+        [FragPipe: combined_peptide.tsv] Number of peptide sequences identified per run,
+        split by identification type: direct MS/MS identification vs. MBR transfer.
+        """
+        help_text = """
+        [FragPipe: combined_peptide.tsv] Counts are derived from the per-run 'Match Type'
+        and 'Intensity' columns; the average MBR gain across runs is shown in the plot title.
+        """
+
+    add_sub_section(
+        sub_section=sub_section,
+        plot=bar_html,
+        order=4,
+        description=description_text,
+        helptext=help_text,
     )
diff --git a/pmultiqc/modules/mzidentml/mzidentml.py b/pmultiqc/modules/mzidentml/mzidentml.py
index c803d56f..c36fa0ca 100644
--- a/pmultiqc/modules/mzidentml/mzidentml.py
+++ b/pmultiqc/modules/mzidentml/mzidentml.py
@@ -240,7 +240,10 @@ def draw_plots(self) -> None:
         )
 
         draw_oversampling(
-            self.sub_sections["ms2"], self.oversampling, self.oversampling_plot.dict["cats"], False
+            self.sub_sections["ms2"],
+            self.oversampling,
+            self.oversampling_plot.dict["cats"],
+            ""
        )
 
         if self.long_trends:
diff --git a/pmultiqc/modules/quantms/quantms.py b/pmultiqc/modules/quantms/quantms.py
index 27783230..3762dda0 100755
--- a/pmultiqc/modules/quantms/quantms.py
+++ b/pmultiqc/modules/quantms/quantms.py
@@ -448,7 +448,7 @@ def draw_plots(self):
             self.sub_sections["ms2"],
             self.oversampling,
             self.oversampling_plot.dict["cats"],
-            False,
+            "",
         )
 
         self.draw_delta_mass()
diff --git a/pmultiqc_service/app.py b/pmultiqc_service/app.py
index f234e19e..1251f73b 100644
--- a/pmultiqc_service/app.py
+++ b/pmultiqc_service/app.py
@@ -765,6 +765,93 @@ def filter_search_files(files: List[Dict]) -> tuple[List[Dict], bool]:
     return all_files, is_complete
 
 
+# FragPipe file suffixes that belong to the same experiment
+# IMPORTANT: Longer suffixes must come first to ensure correct matching
+# (e.g., "combined_ion.tsv" must be checked before "ion.tsv")
+FRAGPIPE_FILE_SUFFIXES = [
+    "combined_modified_peptide.tsv",
+    "combined_peptide.tsv",
+    "combined_protein.tsv",
+    "combined_ion.tsv",
+    "peptide.tsv",
+    "protein.tsv",
+    "psm.tsv",
+    "ion.tsv",
+]
+
+
+def group_fragpipe_files(downloaded_files: List[str]) -> Dict[str, List[str]]:
+    """
+    Group FragPipe files by experiment prefix.
+
+    In FragPipe, files like psm.tsv, ion.tsv, peptide.tsv, protein.tsv, etc.
+    belong to the same experiment and should be processed together.
+    Files can have a prefix like {experiment}_psm.tsv, {experiment}_ion.tsv, etc.
+    Files without a prefix (e.g., just psm.tsv, ion.tsv) belong to the same experiment.
+
+    Args:
+        downloaded_files: List of downloaded file paths
+
+    Returns:
+        Dict mapping experiment name to list of file paths belonging to that experiment.
+        Non-FragPipe files are returned with their original filename as the key.
+        Returns an empty dict if there are no files to process.
+    """
+    groups: Dict[str, List[str]] = {}
+    non_fragpipe_files: List[str] = []
+
+    for file_path in downloaded_files:
+        if os.path.isdir(file_path):
+            # Directories (from zip extraction) are kept as-is
+            dir_name = os.path.basename(file_path)
+            groups[dir_name] = [file_path]
+            continue
+
+        filename = os.path.basename(file_path).lower()
+        matched_suffix = None
+
+        # Check if this is a FragPipe file
+        for suffix in FRAGPIPE_FILE_SUFFIXES:
+            if filename == suffix or filename.endswith(f"_{suffix}"):
+                matched_suffix = suffix
+                break
+
+        if matched_suffix:
+            # Extract experiment prefix
+            if filename == matched_suffix:
+                # No prefix, use "fragpipe_experiment" as the group name
+                experiment = "fragpipe_experiment"
+            else:
+                # Has prefix: {experiment}_{suffix}
+                # Remove the suffix to get the experiment name
+                experiment = filename[: -(len(matched_suffix) + 1)]  # +1 for underscore
+
+            if experiment not in groups:
+                groups[experiment] = []
+            groups[experiment].append(file_path)
+            logger.info(f"Grouped FragPipe file '{os.path.basename(file_path)}' into experiment '{experiment}'")
+        else:
+            # Non-FragPipe file
+            non_fragpipe_files.append(file_path)
+
+    # Add non-FragPipe files as individual groups
+    for file_path in non_fragpipe_files:
+        file_name = os.path.splitext(os.path.basename(file_path))[0]
+        groups[file_name] = [file_path]
+
+    # Log grouping summary (endswith also covers exact-name matches)
+    fragpipe_groups = {k: v for k, v in groups.items() if len(v) > 1 or any(
+        os.path.basename(f).lower().endswith(suffix)
+        for f in v for suffix in FRAGPIPE_FILE_SUFFIXES
+    )}
+    if fragpipe_groups:
+        logger.info(f"FragPipe file grouping: {len(fragpipe_groups)} experiment(s) detected")
+        for exp, files in fragpipe_groups.items():
+            logger.info(f"  Experiment '{exp}': {[os.path.basename(f) for f in files]}")
+
+    return groups
+
+
 def download_pride_file(file_info: Dict, download_dir: str, job_id: str = None) -> str:
     """
     Download a file from PRIDE with detailed progress tracking and handle compression.
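Reviewer note: the longest-suffix-first ordering matters because a plain `endswith` check against `ion.tsv` would also match the combined files and yield a bogus experiment prefix. A quick illustration (hypothetical paths):

```python
# "exp1_combined_ion.tsv" ends with "_ion.tsv" too, so checking "ion.tsv"
# first would strip the wrong suffix and leave the prefix "exp1_combined".
"exp1_combined_ion.tsv".endswith("_ion.tsv")           # True  -> why order matters
"exp1_combined_ion.tsv".endswith("_combined_ion.tsv")  # True  -> matched first, prefix "exp1"

group_fragpipe_files(["/data/exp1_psm.tsv", "/data/exp1_ion.tsv", "/data/report.mzid"])
# {'exp1': ['/data/exp1_psm.tsv', '/data/exp1_ion.tsv'],
#  'report': ['/data/report.mzid']}
```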
@@ -1261,90 +1348,103 @@ def process_pride_job_async(job_id: str, accession: str, output_dir: str):
                 else:
                     logger.warning(f"pmultiqc failed for COMPLETE submission: {result.get('message')}")
             else:
-                # For regular submissions, process each file separately
-                total_files_to_process = len(downloaded_files)
+                # For regular submissions, group FragPipe files by experiment before processing
+                # This ensures that related files (e.g., psm.tsv, ion.tsv) are processed together
+                file_groups = group_fragpipe_files(downloaded_files)
+                total_groups_to_process = len(file_groups)
+
+                logger.info(f"Grouped {len(downloaded_files)} files into {total_groups_to_process} experiment group(s)")
 
-                for i, downloaded_file in enumerate(downloaded_files):
+                for i, (group_name, group_files) in enumerate(file_groups.items()):
                     try:
                         logger.info(
-                            f"Processing downloaded file {i+1}/{len(downloaded_files)}: {downloaded_file}"
-                        )
-                        logger.info(f"File exists: {os.path.exists(downloaded_file)}")
-                        logger.info(
-                            f"Is directory: {os.path.isdir(downloaded_file) if os.path.exists(downloaded_file) else 'N/A'}"
+                            f"Processing experiment group {i+1}/{total_groups_to_process}: {group_name}"
                         )
+                        logger.info(f"Files in group: {[os.path.basename(f) for f in group_files]}")
 
-                        # Determine file name and type
-                        if os.path.isdir(downloaded_file):
-                            # If it's a directory (from zip extraction), use directory name
-                            file_name = os.path.basename(downloaded_file)
-                            file_extract_dir = downloaded_file
-                            logger.info(f"Using directory as-is: {file_name} -> {file_extract_dir}")
+                        # Determine extraction directory for the group
+                        if len(group_files) == 1 and os.path.isdir(group_files[0]):
+                            # Single directory (from zip extraction), use it directly
+                            file_extract_dir = group_files[0]
+                            logger.info(f"Using directory as-is: {group_name} -> {file_extract_dir}")
                         else:
-                            # If it's a file, extract the name without extension
-                            file_name = os.path.splitext(os.path.basename(downloaded_file))[0]
-                            # Create extraction directory and copy file
-                            file_extract_dir = os.path.join(download_dir, f"extracted_{file_name}")
+                            # Create a shared extraction directory for all files in the group
+                            file_extract_dir = os.path.join(download_dir, f"extracted_{group_name}")
                             os.makedirs(file_extract_dir, exist_ok=True)
-                            shutil.copy2(
-                                downloaded_file,
-                                os.path.join(file_extract_dir, os.path.basename(downloaded_file)),
-                            )
-                            logger.info(f"Extracted file: {file_name} -> {file_extract_dir}")
 
-                        logger.info(f"Processing file {i+1}/{total_files_to_process}: {file_name}")
+                            # Copy all files in the group to the shared directory
+                            for file_path in group_files:
+                                if os.path.isfile(file_path):
+                                    shutil.copy2(
+                                        file_path,
+                                        os.path.join(file_extract_dir, os.path.basename(file_path)),
+                                    )
+                                    logger.info(f"Copied {os.path.basename(file_path)} to {file_extract_dir}")
+                                elif os.path.isdir(file_path):
+                                    # If it's a directory, copy its contents
+                                    for item in os.listdir(file_path):
+                                        src = os.path.join(file_path, item)
+                                        dst = os.path.join(file_extract_dir, item)
+                                        if os.path.isfile(src):
+                                            shutil.copy2(src, dst)
+                                        elif os.path.isdir(src):
+                                            shutil.copytree(src, dst, dirs_exist_ok=True)
+
+                            logger.info(f"Prepared extraction directory for group '{group_name}': {file_extract_dir}")
+
+                        logger.info(f"Processing group {i+1}/{total_groups_to_process}: {group_name}")
 
                         # Update progress for processing (70-90%)
-                        progress = 70 + int((i + 1) / total_files_to_process * 20)
+                        progress = 70 + int((i + 1) / total_groups_to_process * 20)
                         update_job_progress(
                             job_id,
                             "processing",
                             progress,
                             files_processed=total_processed,
-                            total_files=total_files_to_process,
-                            processing_stage=f"Processing {file_name} ({i+1}/{total_files_to_process})...",
+                            total_files=total_groups_to_process,
+                            processing_stage=f"Processing {group_name} ({i+1}/{total_groups_to_process})...",
                         )
 
-                        # Create output directory for this file
-                        file_output_dir = os.path.join(output_dir, f"report_{file_name}")
+                        # Create output directory for this group
+                        file_output_dir = os.path.join(output_dir, f"report_{group_name}")
                         os.makedirs(file_output_dir, exist_ok=True)
                         logger.info(f"Created output directory: {file_output_dir}")
 
-                        # Detect input type for this file
+                        # Detect input type for this group
                         input_type, quantms_config = detect_input_type(file_extract_dir)
-                        logger.info(f"Detected input type for {file_name}: {input_type}")
+                        logger.info(f"Detected input type for {group_name}: {input_type}")
 
                         # Log files found in the directory for debugging
                         try:
                             files_in_dir = os.listdir(file_extract_dir)
-                            logger.info(f"Files in {file_name} directory: {files_in_dir}")
+                            logger.info(f"Files in {group_name} directory: {files_in_dir}")
                         except Exception as e:
                             logger.warning(f"Could not list files in {file_extract_dir}: {e}")
 
                         if input_type == "unknown":
-                            logger.warning(f"Could not detect input type for {file_name}")
+                            logger.warning(f"Could not detect input type for {group_name}")
                             continue
 
-                        # Run pmultiqc on this file
+                        # Run pmultiqc on this group
                         logger.info(
-                            f"Starting run_pmultiqc_with_progress for job {job_id}, file {file_name}"
+                            f"Starting run_pmultiqc_with_progress for job {job_id}, group {group_name}"
                         )
                         result = run_pmultiqc_with_progress(
                             file_extract_dir, file_output_dir, input_type, quantms_config, job_id
                         )
                         logger.info(
-                            f"run_pmultiqc_with_progress completed for job {job_id}, file {file_name}: success={result.get('success')}"
+                            f"run_pmultiqc_with_progress completed for job {job_id}, group {group_name}: success={result.get('success')}"
                        )
 
                         if result["success"]:
-                            # Create zip report for this file
+                            # Create zip report for this group
                             zip_report_path = os.path.join(
                                 file_output_dir, f"pmultiqc_report_{job_id}.zip"
                             )
                             if create_zip_report(file_output_dir, zip_report_path):
                                 all_results.append(
                                     {
-                                        "file_name": file_name,
+                                        "file_name": group_name,
                                         "input_type": input_type,
                                         "report_path": zip_report_path,
                                         "output": result.get("output", []),
@@ -1353,13 +1453,13 @@ def process_pride_job_async(job_id: str, accession: str, output_dir: str):
                                 )
                                 total_processed += 1
                                 logger.info(
-                                    f"Successfully processed file {file_name}, total_processed now: {total_processed}"
+                                    f"Successfully processed group {group_name}, total_processed now: {total_processed}"
                                 )
                             else:
-                                logger.warning(f"pmultiqc failed for {file_name}: {result.get('message')}")
+                                logger.warning(f"pmultiqc failed for {group_name}: {result.get('message')}")
 
                     except Exception as e:
-                        logger.error(f"Error processing {downloaded_file}: {e}")
+                        logger.error(f"Error processing group {group_name}: {e}")
                         logger.error(f"Traceback: {traceback.format_exc()}")
                         continue