4 changes: 2 additions & 2 deletions environment.yml
@@ -9,7 +9,7 @@ dependencies:
- pandas>=1.5
- pyteomics
- pyopenms<=3.4.0
- sdrf-pipelines>=0.0.32
- sdrf-pipelines==0.0.33
- lxml
- numpy>=1.23
- pyarrow
@@ -19,4 +19,4 @@ dependencies:
- requests
- redis
- statsmodels
- urllib3>=2.6.1
- urllib3>=2.6.1
144 changes: 121 additions & 23 deletions pmultiqc/modules/common/dia_utils.py
@@ -60,10 +60,14 @@ def parse_diann_report(
# Process statistics and modifications
total_protein_quantified, total_peptide_count, pep_plot = _process_diann_statistics(report_data)
peptide_search_score = _process_peptide_search_scores(report_data)
_process_modifications(report_data)
modifications_ok = _process_modifications(report_data)

# Process run-specific data
cal_num_table_data = _process_run_data(report_data, ms_with_psm, quantms_modified, file_df)
# Process run-specific data (requires Modifications column from _process_modifications)
if modifications_ok:
cal_num_table_data = _process_run_data(report_data, ms_with_psm, quantms_modified, file_df)
else:
log.warning("Skipping run data processing due to missing modifications data")
cal_num_table_data = {"sdrf_samples": {}, "ms_runs": {}}

# Handle files without PSM
ms_without_psm = _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data)
@@ -155,6 +159,10 @@ def _draw_heatmap(sub_section, report_data, heatmap_color_list):

def _process_diann_statistics(report_data):
"""Process DIA-NN statistics and create peptide plot."""
required_cols = ["Protein.Group", "Modified.Sequence"]
if not all(col in report_data.columns for col in required_cols):
log.warning(f"Missing required columns for statistics: {[c for c in required_cols if c not in report_data.columns]}")
return 0, 0, None
Comment on lines 160 to +165

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for parse_diann_report usage and context
rg -n --type=py -C5 'parse_diann_report' -g '!*test*'

Repository: bigbio/pmultiqc

Length of output: 3955


🏁 Script executed:

#!/bin/bash
# Get the full parse_diann_report function implementation
sed -n '32,150p' pmultiqc/modules/common/dia_utils.py

Repository: bigbio/pmultiqc

Length of output: 4386


🏁 Script executed:

#!/bin/bash
# Get the _process_diann_statistics function
sed -n '160,200p' pmultiqc/modules/common/dia_utils.py

Repository: bigbio/pmultiqc

Length of output: 1759


🏁 Script executed:

#!/bin/bash
# Check how pep_plot is used in quantms.py after assignment
rg -n --type=py 'self.pep_plot' pmultiqc/modules/quantms/quantms.py -A3 -B1

Repository: bigbio/pmultiqc

Length of output: 1837


🏁 Script executed:

#!/bin/bash
# Check how pep_plot is used in diann.py after assignment
rg -n --type=py 'self.pep_plot' pmultiqc/modules/diann/diann.py -A3 -B1

Repository: bigbio/pmultiqc

Length of output: 420


🏁 Script executed:

#!/bin/bash
# Find the function that receives pep_plot
rg -n --type=py 'def.*draw_num_pep_per_protein' -A5

Repository: bigbio/pmultiqc

Length of output: 476


🏁 Script executed:

#!/bin/bash
# Check if there's any None-checking for pep_plot before calling draw_num_pep_per_protein
rg -n --type=py 'draw_num_pep_per_protein' -B5 -A1 pmultiqc/modules/diann/diann.py

Repository: bigbio/pmultiqc

Length of output: 623


🏁 Script executed:

#!/bin/bash
# Check the full draw_num_pep_per_protein function to see how pep_plot is used
sed -n '1001,1030p' pmultiqc/modules/common/plots/id.py

Repository: bigbio/pmultiqc

Length of output: 901


🏁 Script executed:

#!/bin/bash
# Check the same in quantms.py for draw_num_pep_per_protein usage
rg -n --type=py 'draw_num_pep_per_protein' -B10 -A5 pmultiqc/modules/quantms/quantms.py

Repository: bigbio/pmultiqc

Length of output: 1802


🏁 Script executed:

#!/bin/bash
# Check if there's any None-check around the draw_num_pep_per_protein calls in quantms.py
sed -n '350,365p' pmultiqc/modules/quantms/quantms.py

Repository: bigbio/pmultiqc

Length of output: 648


🏁 Script executed:

#!/bin/bash
# Check the broader context around draw_num_pep_per_protein in quantms.py
sed -n '340,365p' pmultiqc/modules/quantms/quantms.py

Repository: bigbio/pmultiqc

Length of output: 1039


🏁 Script executed:

#!/bin/bash
# Check broader context in quantms.py around draw_num_pep_per_protein calls
sed -n '320,365p' pmultiqc/modules/quantms/quantms.py

Repository: bigbio/pmultiqc

Length of output: 1888


🏁 Script executed:

#!/bin/bash
# Double-check if there's any conditional logic around draw_num_pep_per_protein calls
sed -n '355,365p' pmultiqc/modules/quantms/quantms.py && echo "---" && sed -n '185,195p' pmultiqc/modules/diann/diann.py

Repository: bigbio/pmultiqc

Length of output: 688


Callers do not handle pep_plot=None gracefully—this will crash at runtime.

When required columns are missing, _process_diann_statistics returns None for pep_plot. Both quantms.py (line 358) and diann.py (line 187) pass this directly to draw_num_pep_per_protein, which immediately accesses pep_plot.dict["data"] (id.py:1006) without a None check. This causes an AttributeError if column validation fails.

Add a guard in draw_num_pep_per_protein or skip the call when pep_plot is None.

🤖 Prompt for AI Agents
In `@pmultiqc/modules/common/dia_utils.py` around lines 160-165, the callers
pass pep_plot from _process_diann_statistics into draw_num_pep_per_protein and
will crash if pep_plot is None; add a guard at the start of
draw_num_pep_per_protein that checks whether pep_plot is None, logs a warning
about the missing peptide plot data, and returns a safe value (e.g., None or an
empty figure) instead of accessing pep_plot.dict["data"]. Update
draw_num_pep_per_protein (and any closely related plotting helpers) to
early-return when pep_plot is None so quantms.py and diann.py do not need to
change their call sites.


total_protein_quantified = len(set(report_data["Protein.Group"]))
total_peptide_count = len(set(report_data["Modified.Sequence"]))
@@ -180,6 +188,11 @@ def _process_diann_statistics(report_data):

def _process_peptide_search_scores(report_data):
"""Process peptide search scores."""
required_cols = ["Modified.Sequence", "Q.Value"]
if not all(col in report_data.columns for col in required_cols):
log.warning(f"Missing required columns for peptide search scores: {[c for c in required_cols if c not in report_data.columns]}")
return {}

log.info("Processing DIA peptide_search_score.")
peptide_search_score = dict()
pattern = re.compile(r"\((.*?)\)")
@@ -200,6 +213,10 @@ def _process_peptide_search_scores(report_data):

def _process_modifications(report_data):
"""Process modifications in the report data."""
if "Modified.Sequence" not in report_data.columns:
log.warning("Missing Modified.Sequence column for modifications processing")
return False

log.info("Processing DIA Modifications.")
mod_pattern = re.compile(r"\((.*?)\)")
unimod_data = UnimodDatabase()
@@ -217,16 +234,21 @@ def find_diann_modified(peptide):
return None

report_data["Modifications"] = report_data["Modified.Sequence"].apply(find_diann_modified)
return True


def _process_run_data(df, ms_with_psm, quantms_modified, sdrf_file_df):
"""
Process run-specific data including modifications and statistics.
"""
required_cols = ["Run", "Modified.Sequence", "Modifications", "Protein.Group"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
log.warning(f"Missing required columns for run data processing: {missing_cols}")
return {"sdrf_samples": {}, "ms_runs": {}}

log.info("Processing DIA mod_plot_dict.")

required_cols = ["Run", "Modified.Sequence", "Modifications", "Protein.Group"]
report_data = df[required_cols].copy()
if "Proteotypic" in df.columns:
report_data["Proteotypic"] = df["Proteotypic"]
@@ -425,6 +447,11 @@ def draw_dia_rt_qc(sub_section, report_df):

# DIA-NN: IDs over RT
def draw_dia_ids_rt(sub_section, report_df):
required_cols = ["Run", "RT"]
if not all(col in report_df.columns for col in required_cols):
log.warning(f"Missing required columns for IDs over RT plot: {[c for c in required_cols if c not in report_df.columns]}")
return

rt_df = report_df[["Run", "RT"]].copy()
rt_df.rename(columns={"Run": "raw file", "RT": "retention time"}, inplace=True)
ids_over_rt = evidence_rt_count(rt_df)
@@ -437,13 +464,19 @@ def draw_diann_quant_table(sub_section, diann_report, sample_df, file_df):
peptides_table, peptides_headers = create_peptides_table(
diann_report, sample_df, file_df
)
draw_peptides_table(sub_section, peptides_table, peptides_headers, "DIA-NN")
if peptides_table is not None and peptides_headers is not None:
draw_peptides_table(sub_section, peptides_table, peptides_headers, "DIA-NN")
else:
log.warning("Skipping peptides quantification table due to missing data")

# Protein Quantification Table
protein_table, protein_headers = create_protein_table(
diann_report, sample_df, file_df
)
draw_protein_table(sub_section, protein_table, protein_headers, "DIA-NN")
if protein_table is not None and protein_headers is not None:
draw_protein_table(sub_section, protein_table, protein_headers, "DIA-NN")
else:
log.warning("Skipping protein quantification table due to missing data")


# Draw: Peptides Quantification Table
@@ -727,9 +760,29 @@ def _prepare_quant_table_data(report_df):
Common preprocessing for quantification table creation.

Returns:
pd.DataFrame: Preprocessed report data with positive Precursor.Normalised values.
pd.DataFrame: Preprocessed report data with positive intensity values,
or None if required columns are missing.
"""
report_data = report_df[report_df["Precursor.Normalised"] > 0].copy()
# Check for required columns
required_cols = ["Protein.Names", "Stripped.Sequence"]
missing_cols = [col for col in required_cols if col not in report_df.columns]
if missing_cols:
log.warning(f"Missing required columns for quantification table: {missing_cols}")
return None

# Use Precursor.Normalised if available, otherwise fall back to Precursor.Quantity
if "Precursor.Normalised" in report_df.columns:
intensity_col = "Precursor.Normalised"
elif "Precursor.Quantity" in report_df.columns:
intensity_col = "Precursor.Quantity"
log.info("Using Precursor.Quantity as fallback (Precursor.Normalised not available)")
else:
log.warning("Neither Precursor.Normalised nor Precursor.Quantity found. Skipping quantification table.")
return None

report_data = report_df[report_df[intensity_col] > 0].copy()
# Store which intensity column is being used for downstream functions
report_data.attrs["intensity_col"] = intensity_col
return drop_empty_row(report_data, ["Protein.Names", "Stripped.Sequence"])


@@ -743,6 +796,18 @@ def _merge_condition_data(report_data, sample_df, file_df):
if sample_df.empty or file_df.empty:
return None, []

# Get the intensity column used (stored by _prepare_quant_table_data)
intensity_col = report_data.attrs.get("intensity_col", "Precursor.Normalised")
if intensity_col not in report_data.columns:
# Fallback check
if "Precursor.Normalised" in report_data.columns:
intensity_col = "Precursor.Normalised"
elif "Precursor.Quantity" in report_data.columns:
intensity_col = "Precursor.Quantity"
else:
log.warning("No intensity column found for condition data merge")
return None, []

sample_cond_df = pd.merge(
sample_df[["Sample", "MSstats_Condition"]],
file_df[["Sample", "Spectra_Filepath"]],
@@ -752,10 +817,12 @@
sample_cond_df["Run"] = sample_cond_df["Spectra_Filepath"].str.rsplit(".", n=1).str[0]

cond_report_data = pd.merge(
report_data[["Stripped.Sequence", "Protein.Names", "Precursor.Normalised", "Run"]],
report_data[["Stripped.Sequence", "Protein.Names", intensity_col, "Run"]],
sample_cond_df[["Run", "MSstats_Condition"]].drop_duplicates(),
on="Run",
)
# Store intensity column for downstream use
cond_report_data.attrs["intensity_col"] = intensity_col

unique_conditions = sample_df["MSstats_Condition"].drop_duplicates().tolist()
return cond_report_data, unique_conditions
@@ -773,18 +840,34 @@ def _add_condition_headers(headers, conditions):

# DIA-NN: Peptides Quantification Table
def create_peptides_table(report_df, sample_df, file_df):
"""Create peptides quantification table from DIA-NN report."""
"""Create peptides quantification table from DIA-NN report.

Returns:
tuple: (table_dict, headers) or (None, None) if required columns are missing.
"""
report_data = _prepare_quant_table_data(report_df)
report_data["BestSearchScore"] = 1 - report_data["Q.Value"]
if report_data is None or report_data.empty:
log.warning("Cannot create peptides table: missing required data")
return None, None

# Get the intensity column being used
intensity_col = report_data.attrs.get("intensity_col", "Precursor.Normalised")

# Check for Q.Value column for search score
has_qvalue = "Q.Value" in report_data.columns
if has_qvalue:
report_data["BestSearchScore"] = 1 - report_data["Q.Value"]

table_dict = {}
for sequence_protein, group in report_data.groupby(["Stripped.Sequence", "Protein.Names"]):
table_dict[sequence_protein] = {
entry = {
"ProteinName": sequence_protein[1],
"PeptideSequence": sequence_protein[0],
"BestSearchScore": group["BestSearchScore"].min(),
"Average Intensity": np.log10(group["Precursor.Normalised"].mean()),
"Average Intensity": np.log10(group[intensity_col].mean()),
}
if has_qvalue:
entry["BestSearchScore"] = group["BestSearchScore"].min()
table_dict[sequence_protein] = entry

headers = {
"ProteinName": {
@@ -793,24 +876,27 @@ def create_peptides_table(report_df, sample_df, file_df):
"minrange": "200",
},
"PeptideSequence": {"title": "Peptide Sequence"},
"BestSearchScore": {"title": "Best Search Score", "format": "{:,.4f}"},
"Average Intensity": {
"title": "Average Intensity",
"description": "Average intensity across all conditions",
"format": "{:,.4f}",
},
}
if has_qvalue:
headers["BestSearchScore"] = {"title": "Best Search Score", "format": "{:,.4f}"}

cond_report_data, unique_conditions = _merge_condition_data(report_data, sample_df, file_df)
if cond_report_data is not None:
if cond_report_data is not None and not cond_report_data.empty:
cond_intensity_col = cond_report_data.attrs.get("intensity_col", intensity_col)
for sequence_protein, group in cond_report_data.groupby(
["Stripped.Sequence", "Protein.Names"]
):
condition_data = {
str(cond): np.log10(sub_group["Precursor.Normalised"].mean())
str(cond): np.log10(sub_group[cond_intensity_col].mean())
for cond, sub_group in group.groupby("MSstats_Condition")
}
table_dict[sequence_protein].update(condition_data)
if sequence_protein in table_dict:
table_dict[sequence_protein].update(condition_data)

_add_condition_headers(headers, unique_conditions)

@@ -820,15 +906,25 @@ def create_peptides_table(report_df, sample_df, file_df):

# DIA-NN: Protein Quantification Table
def create_protein_table(report_df, sample_df, file_df):
"""Create protein quantification table from DIA-NN report."""
"""Create protein quantification table from DIA-NN report.

Returns:
tuple: (table_dict, headers) or (None, None) if required columns are missing.
"""
report_data = _prepare_quant_table_data(report_df)
if report_data is None or report_data.empty:
log.warning("Cannot create protein table: missing required data")
return None, None

# Get the intensity column being used
intensity_col = report_data.attrs.get("intensity_col", "Precursor.Normalised")

table_dict = {}
for protein_name, group in report_data.groupby("Protein.Names"):
table_dict[protein_name] = {
"ProteinName": protein_name,
"Peptides_Number": group["Stripped.Sequence"].nunique(),
"Average Intensity": np.log10(group["Precursor.Normalised"].mean()),
"Average Intensity": np.log10(group[intensity_col].mean()),
}

headers = {
@@ -849,13 +945,15 @@ def create_protein_table(report_df, sample_df, file_df):
}

cond_report_data, unique_conditions = _merge_condition_data(report_data, sample_df, file_df)
if cond_report_data is not None:
if cond_report_data is not None and not cond_report_data.empty:
cond_intensity_col = cond_report_data.attrs.get("intensity_col", intensity_col)
for protein_name, group in cond_report_data.groupby("Protein.Names"):
condition_data = {
str(cond): np.log10(sub_group["Precursor.Normalised"].mean())
str(cond): np.log10(sub_group[cond_intensity_col].mean())
for cond, sub_group in group.groupby("MSstats_Condition")
}
table_dict[protein_name].update(condition_data)
if protein_name in table_dict:
table_dict[protein_name].update(condition_data)

_add_condition_headers(headers, unique_conditions)

1 change: 1 addition & 0 deletions pmultiqc/modules/common/plots/dia.py
@@ -194,6 +194,7 @@ def draw_dia_whole_exp_charge(sub_section, df):
draw_config = {
"id": "distribution_of_precursor_charges",
"cpswitch": True,
"cpswitch_c_active": False,
"title": "Distribution of Precursor Charges",
"tt_decimals": 0,
"ylab": "Count",
7 changes: 5 additions & 2 deletions pmultiqc/modules/common/plots/ms.py
@@ -131,7 +131,8 @@ def draw_peak_intensity_distribution(
pconfig = {
"id": "peak_intensity_distribution",
"title": "Peak Intensity Distribution",
"cpswitch": False,
"cpswitch": True,
"cpswitch_c_active": False,
"stacking": "group",
"logswitch": True,
"logswitch_active": True,
@@ -164,6 +165,7 @@ def draw_precursor_charge_distribution(sub_sections, charge_plot=None, ms_info=N
"id": "distribution_of_precursor_charges",
"title": "Distribution of Precursor Charges",
"cpswitch": True,
"cpswitch_c_active": False,
"tt_decimals": 0,
"ylab": "Count",
"save_data_file": False,
@@ -183,7 +185,8 @@ def draw_peaks_per_ms2(sub_sections, peaks_ms2_plot, ms_info):
def draw_peaks_per_ms2(sub_sections, peaks_ms2_plot, ms_info):
pconfig = {
"id": "peaks_per_ms2",
"cpswitch": False,
"cpswitch": True,
"cpswitch_c_active": False,
"title": "Number of Peaks per MS/MS spectrum",
"stacking": "group",
"logswitch": True,
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -45,7 +45,7 @@ multiqc = ">=1.29, <=1.33"
pandas = ">=1.5"
pyteomics = "*"
pyopenms = "<=3.4.0"
sdrf-pipelines = ">=0.0.32"
sdrf-pipelines = "0.0.33"
lxml = "*"
numpy = ">=1.23"
pyarrow = "*"
4 changes: 2 additions & 2 deletions requirements.txt
@@ -2,10 +2,10 @@ multiqc>=1.29, <=1.33
pandas>=1.5
pyteomics
pyopenms<=3.4.0
sdrf-pipelines>=0.0.32
sdrf-pipelines==0.0.33
lxml
numpy>=1.23
pyarrow
scikit-learn>=1.2
statsmodels
urllib3>=2.6.1
urllib3>=2.6.1