Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion pmultiqc/modules/common/dia_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def parse_diann_report(
# Handle files without PSM
ms_without_psm = _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data)

# Peptide Length Distribution
peptide_length = _get_peptide_length(report_data)

return (
total_protein_quantified,
total_peptide_count,
Expand All @@ -77,7 +80,8 @@ def parse_diann_report(
ms_with_psm,
cal_num_table_data,
quantms_modified,
ms_without_psm
ms_without_psm,
peptide_length
)


Expand Down Expand Up @@ -316,6 +320,22 @@ def _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data):
return ms_without_psm


def _get_peptide_length(df):

if not "Stripped.Sequence" in df.columns:
return None

df_sub = df[["Run", "Stripped.Sequence"]].copy()
df_sub["length"] = df_sub["Stripped.Sequence"].apply(lambda x: len(x))

plot_data = {}
for run, group in df_sub.groupby("Run"):
stats_dict = group["length"].value_counts().sort_index().to_dict()
plot_data[run] = stats_dict

return plot_data


## Removed draw_dia_heatmap wrapper; call cal_dia_heatmap and dia_plots.draw_heatmap directly.


Expand Down
2 changes: 2 additions & 0 deletions pmultiqc/modules/common/ms/mztab.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def parse(self, **_kwargs) -> None:

self.total_protein_quantified = len(prot.index)

psm["pep_length"] = psm["sequence"].apply(lambda x: len(x))

self.mztab_data = mztab_data
self.pep_table = pep_table
self.psm = psm
Expand Down
33 changes: 33 additions & 0 deletions pmultiqc/modules/common/plots/id.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,39 @@ def draw_peptide_intensity(sub_section, plot_data):
""",
)


# Peptide Length Distribution
def draw_peptide_length_distribution(sub_section, plot_data):
    """Build the peptide length line graph and register it in a sub-section.

    Args:
        sub_section: Report sub-section forwarded to ``add_sub_section``.
        plot_data: Data forwarded to ``linegraph.plot``; callers in this
            change set pass ``{run: {peptide_length: count}}``.
    """
    pconfig = dict(
        id="peptide_length_distribution",
        cpswitch=False,
        cpswitch_c_active=False,
        title="Peptide Length Distribution",
        tt_decimals=2,
        xlab="Peptide Length",
        save_data_file=False,
        showlegend=True,
    )

    # Help text shown alongside the plot; lists the source column per tool.
    help_html = """
Peptide length distribution.<br>
FragPipe: psm.tsv ('Peptide Length': number of residues in the peptide sequence).<br>
MaxQuant: evidence.txt ('Length': the length of the sequence stored in the column 'Sequence').<br>
DIA-NN: report.tsv (the length of the 'Stripped.Sequence').<br>
quantms: *.mzTab (the length of sequence).
"""

    length_plot = plot_html_check(linegraph.plot(plot_data, pconfig=pconfig))

    add_sub_section(
        sub_section=sub_section,
        plot=length_plot,
        order=8,
        description="Peptide length distribution per Run.",
        helptext=help_html,
    )


def draw_long_trends(sub_sections, long_trends_data):

plot_ac_datetime = long_trends_data["time"]
Expand Down
12 changes: 10 additions & 2 deletions pmultiqc/modules/diann/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
draw_identi_num,
draw_num_pep_per_protein,
draw_identification,
draw_long_trends
draw_long_trends,
draw_peptide_length_distribution
)
from pmultiqc.modules.common.plots.ms import (
draw_peak_intensity_distribution,
Expand Down Expand Up @@ -154,7 +155,8 @@ def draw_plots(self):
self.ms_with_psm,
self.cal_num_table_data,
self.quantms_modified,
self.ms_without_psm
self.ms_without_psm,
self.peptide_length
) = parse_diann_report(
sub_sections=self.sub_sections,
diann_report_path=self.diann_report_path,
Expand Down Expand Up @@ -213,6 +215,12 @@ def draw_plots(self):
long_trends_data=self.long_trends
)

if self.peptide_length:
draw_peptide_length_distribution(
sub_section=self.sub_sections["identification"],
plot_data=self.peptide_length
)

if self.enable_sdrf:
ms_io.del_openms_convert_tsv()

Expand Down
50 changes: 46 additions & 4 deletions pmultiqc/modules/fragpipe/fragpipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
combined_protein_reader,
get_protein_intensity_distribution,
combined_peptide_reader,
get_mbr_stats,
combined_ion_reader,
get_msms_counts_per_peak,
cal_peptide_id_gain
Expand All @@ -40,7 +39,8 @@
draw_top_n_contaminants,
draw_potential_contaminants,
draw_modifications,
draw_oversampling
draw_oversampling,
draw_peptide_length_distribution
)
from pmultiqc.modules.core.section_groups import (
add_group_modules,
Expand Down Expand Up @@ -95,6 +95,7 @@ def __init__(self, find_log_files_func, sub_sections, heatmap_colors):
self.contam_df = []
self.mods = []
self.hm_data = []
self.peptide_length = []

# Ion-level intensity data from ion.tsv
self.ion_intensity_data = None
Expand Down Expand Up @@ -144,7 +145,8 @@ def get_data(self):
self.hyperscores,
self.contam_df,
self.mods,
self.hm_data
self.hm_data,
self.peptide_length
) = self.parse_psm(
fragpipe_files=self.fragpipe_files
)
Expand Down Expand Up @@ -365,6 +367,13 @@ def draw_plots(self):
missed_cleavages=mc_plot_data
)

# Peptide Length Distribution
if self.peptide_length:
self.draw_peptide_length(
sub_section=self.sub_sections["identification"],
peptide_length=self.peptide_length
)

# IDs over RT
if self.retentions:
self.draw_ids_over_rt(
Expand Down Expand Up @@ -435,6 +444,7 @@ def parse_psm(fragpipe_files):
contam_df = []
mods = []
hm_data = []
peptide_length = []

for psm in fragpipe_files.get("psm", []):

Expand Down Expand Up @@ -504,6 +514,10 @@ def parse_psm(fragpipe_files):
):
hm_data.append(psm_cont_df[hm_requires].copy())

# Peptide Length
if "Peptide Length" in psm_df.columns:
peptide_length.append(psm_df[["Run", "Peptide Length"]].copy())

# Contaminants
if _has_valid_contaminant(psm_cont_df):
log.info(f"{psm} contains contaminants.")
Expand Down Expand Up @@ -531,7 +545,8 @@ def parse_psm(fragpipe_files):
hyperscores,
contam_df,
mods,
hm_data
hm_data,
peptide_length
)

# Delta Mass
Expand Down Expand Up @@ -875,6 +890,33 @@ def draw_fragpipe_heatmap(sub_section, hm: list, hm_color: list, missed_cleavage
)


# Peptide Length Distribution
@staticmethod
def draw_peptide_length(sub_section, peptide_length: list):
    """Plot the per-run peptide length distribution from FragPipe PSM tables.

    Args:
        sub_section: Report sub-section the plot is added to.
        peptide_length: List of DataFrames, each holding the "Run" and
            "Peptide Length" columns.

    Returns:
        None. Nothing is drawn when there is no usable data.
    """
    if not peptide_length:
        log.warning("No peptide_length data; skipping peptide_length.")
        return

    df = pd.concat(peptide_length, ignore_index=True)

    # Coerce to numeric so bins sort as 1, 2, 10 rather than "1", "10", "2"
    # when the column was read as text; unparsable values become NaN and
    # are dropped.
    df["Peptide Length"] = pd.to_numeric(df["Peptide Length"], errors="coerce")
    df = df.dropna(subset=["Peptide Length"])

    if df.empty:
        log.warning("Peptide Length DataFrame is empty; skipping Peptide Length.")
        return

    log.info(f"Number of Peptide Length rows in DataFrame: {len(df)}")

    plot_data = {}
    for sample, group in df.groupby("Run"):
        plot_data[sample] = (
            group["Peptide Length"]
            .astype(int)
            .value_counts()
            .sort_index()
            .to_dict()
        )

    draw_peptide_length_distribution(
        sub_section=sub_section,
        plot_data=plot_data
    )
Comment on lines +893 to +917
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's look at the file and context around the flagged lines
cd /tmp && find . -type f -name "fragpipe.py" -path "*/modules/fragpipe/*" 2>/dev/null | head -5

Repository: bigbio/pmultiqc

Length of output: 41


🏁 Script executed:

# Search for the file in the repository
find . -type f -name "fragpipe.py" 2>/dev/null | grep -i fragpipe

Repository: bigbio/pmultiqc

Length of output: 98


🏁 Script executed:

# List files to understand structure
git ls-files | grep -i fragpipe | head -20

Repository: bigbio/pmultiqc

Length of output: 264


🏁 Script executed:

# Read the fragpipe.py file and check the relevant section and surrounding context
head -950 pmultiqc/modules/fragpipe/fragpipe.py | tail -150

Repository: bigbio/pmultiqc

Length of output: 5013


🏁 Script executed:

# Search for where peptide_length is populated
rg "peptide_length" pmultiqc/modules/fragpipe/ -n

Repository: bigbio/pmultiqc

Length of output: 1277


🏁 Script executed:

# Look at fragpipe_io.py to understand data parsing
cat -n pmultiqc/modules/fragpipe/fragpipe_io.py | head -200

Repository: bigbio/pmultiqc

Length of output: 8723


🏁 Script executed:

# Check the test PSM file to see what "Peptide Length" values look like
head -20 tests/resources/fragpipe/psm.tsv | cut -f1-20

Repository: bigbio/pmultiqc

Length of output: 5803


🏁 Script executed:

# Get column headers and check for Peptide Length column
head -1 tests/resources/fragpipe/psm.tsv | tr '\t' '\n' | nl

Repository: bigbio/pmultiqc

Length of output: 101


🏁 Script executed:

# Extract just the Peptide Length column from the test file
cut -f $(head -1 tests/resources/fragpipe/psm.tsv | tr '\t' '\n' | grep -n "Peptide Length" | cut -d: -f1) tests/resources/fragpipe/psm.tsv | head -20

Repository: bigbio/pmultiqc

Length of output: 115


🏁 Script executed:

# Check the exact data types when read by psm_reader
python3 << 'EOF'
import pandas as pd

# Simulate what psm_reader does
psm_df = pd.read_csv("tests/resources/fragpipe/psm.tsv", sep="\t")

# Check the Peptide Length column type
print(f"Peptide Length dtype: {psm_df['Peptide Length'].dtype}")
print(f"Sample values: {psm_df['Peptide Length'].head(10).tolist()}")

# Test the issue - how value_counts().sort_index() behaves
test_dict = psm_df[["Peptide Length"]].groupby(psm_df["Peptide Length"]).size().to_dict()
print(f"\nvalue_counts().sort_index() result (as dict): {test_dict}")

# If it were string-typed, show the difference
psm_df["Peptide Length"] = psm_df["Peptide Length"].astype(str)
test_dict_str = psm_df[["Peptide Length"]].groupby(psm_df["Peptide Length"]).size().to_dict()
print(f"String-typed sort_index() result: {test_dict_str}")
EOF

Repository: bigbio/pmultiqc

Length of output: 145


🏁 Script executed:

# Check what happens in the actual draw_peptide_length function with the current data
python3 << 'EOF'
import pandas as pd

# Simulate what happens in draw_peptide_length
psm_df = pd.read_csv("tests/resources/fragpipe/psm.tsv", sep="\t")

# Extract Run and Peptide Length as done in line 519
peptide_length_data = [psm_df[["Run", "Peptide Length"]].copy()]

# Now simulate the draw_peptide_length function
df = pd.concat(peptide_length_data, ignore_index=True)

print(f"After concat, dtype: {df['Peptide Length'].dtype}")

# Simulate the groupby and value_counts operations
for sample, group in df.groupby("Run"):
    stats_dict = group["Peptide Length"].value_counts().sort_index().to_dict()
    print(f"Sample: {sample}")
    print(f"Stats dict: {stats_dict}")
    print(f"Keys type: {type(list(stats_dict.keys())[0]) if stats_dict else 'N/A'}")
    break  # Just check the first one
EOF

Repository: bigbio/pmultiqc

Length of output: 145


🏁 Script executed:

# Check if there's any explicit dtype conversion in the psm_reader that I might have missed
sed -n '125,138p' pmultiqc/modules/fragpipe/fragpipe_io.py

Repository: bigbio/pmultiqc

Length of output: 480


🏁 Script executed:

# Let me check the actual behavior with a simpler test
python3 << 'EOF'
import pandas as pd
import io

# Create test data exactly like what pandas would read from the TSV
data = """Run	Peptide Length
sample1	10
sample1	8
sample1	7
sample1	10
sample1	11
sample1	2
sample1	1"""

df = pd.read_csv(io.StringIO(data), sep="\t")
print(f"Inferred dtype: {df['Peptide Length'].dtype}")

# Test current behavior
group_dict = df["Peptide Length"].value_counts().sort_index().to_dict()
print(f"Current sort_index() result: {group_dict}")
print(f"Order of keys: {list(group_dict.keys())}")

# Now test if it were string-typed (the concern in the review)
df_str = df.copy()
df_str["Peptide Length"] = df_str["Peptide Length"].astype(str)
print(f"\nString dtype: {df_str['Peptide Length'].dtype}")
group_dict_str = df_str["Peptide Length"].value_counts().sort_index().to_dict()
print(f"String sort_index() result: {group_dict_str}")
print(f"Order of keys (WRONG): {list(group_dict_str.keys())}")
EOF

Repository: bigbio/pmultiqc

Length of output: 177


🏁 Script executed:

# Check if there's any test for the peptide_length function to see how it's actually used
rg "draw_peptide_length" tests/ -A 5 -B 2

Repository: bigbio/pmultiqc

Length of output: 41


🏁 Script executed:

# Check the fragpipe module test file
head -100 tests/test_fragpipe.py

Repository: bigbio/pmultiqc

Length of output: 4112


🏁 Script executed:

# Check how pandas infers the dtype when reading the test PSM file
python3 << 'EOF'
import pandas as pd

# Read the test PSM file as pandas would
df = pd.read_csv("tests/resources/fragpipe/psm.tsv", sep="\t", nrows=10)

# Check the dtype of Peptide Length
print(f"Peptide Length dtype: {df['Peptide Length'].dtype}")
print(f"Sample values: {df['Peptide Length'].tolist()}")

# Check what value_counts().sort_index() produces
vc = df["Peptide Length"].value_counts().sort_index()
print(f"\nvalue_counts().sort_index() keys: {list(vc.index)}")
print(f"Keys are numeric: {all(isinstance(k, (int, float)) for k in vc.index)}")
EOF

Repository: bigbio/pmultiqc

Length of output: 177


🏁 Script executed:

# Check how the data flows through the code - look at where peptide_length is populated
sed -n '447,555p' pmultiqc/modules/fragpipe/fragpipe.py | head -120

Repository: bigbio/pmultiqc

Length of output: 3864


🏁 Script executed:

# Check how value_counts and sort_index interact in different scenarios
# Let me look at the exact code pattern more carefully
sed -n '893,920p' pmultiqc/modules/fragpipe/fragpipe.py

Repository: bigbio/pmultiqc

Length of output: 908


🏁 Script executed:

# Let's also compare the two similar functions side by side to see the difference
# draw_ids_over_rt (which has numeric conversion) vs draw_peptide_length (which doesn't)
sed -n '925,950p' pmultiqc/modules/fragpipe/fragpipe.py

Repository: bigbio/pmultiqc

Length of output: 780


🏁 Script executed:

# Check if there are any other similar patterns in the file where numeric conversion is used
rg "pd.to_numeric" pmultiqc/modules/fragpipe/fragpipe.py -B 2 -A 2

Repository: bigbio/pmultiqc

Length of output: 348


Ensure peptide lengths are numeric to avoid lexicographic bin ordering.

The "Peptide Length" column can be read as string-typed by pandas, in which case sort_index() orders the bins lexicographically (1, 10, 11, 2, ...) instead of numerically (1, 2, 10, 11, ...). Other functions in the same file (e.g., draw_delta_mass(), draw_ids_over_rt()) already guard against this by converting their columns to numeric, so apply the same defensive approach here.

Proposed fix
         df = pd.concat(peptide_length, ignore_index=True)
+        df["Peptide Length"] = pd.to_numeric(df["Peptide Length"], errors="coerce")
+        df = df.dropna(subset=["Peptide Length"])
 
         if df.empty:
             log.warning("Peptide Length DataFrame is empty; skipping Peptide Length.")
             return
 
         log.info(f"Number of Peptide Length rows in DataFrame: {len(df)}")
 
         plot_data = {}
         for sample, group in df.groupby("Run"):
-            stats_dict = group["Peptide Length"].value_counts().sort_index().to_dict()
+            stats_dict = (
+                group["Peptide Length"]
+                .astype(int)
+                .value_counts()
+                .sort_index()
+                .to_dict()
+            )
             plot_data[sample] = stats_dict
🤖 Prompt for AI Agents
In `@pmultiqc/modules/fragpipe/fragpipe.py` around lines 893 - 917, the "Peptide
Length" column may be string-typed, which causes lexicographic ordering. Inside
draw_peptide_length, convert df["Peptide Length"] to numeric (use
pandas.to_numeric with errors='coerce'), drop the rows that fail to convert
(dropna), and optionally cast to int. Only then compute value_counts() and
sort_index(), so that the bins are ordered numerically. Update the grouping
logic in draw_peptide_length to use the cleaned numeric column (mirroring the
approach used in draw_delta_mass() and draw_ids_over_rt()) before building
plot_data for draw_peptide_length_distribution.



# IDs over RT
@staticmethod
def draw_ids_over_rt(sub_section, retentions: list):
Expand Down
2 changes: 1 addition & 1 deletion pmultiqc/modules/fragpipe/fragpipe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"psm": [
"Spectrum", "Peptide", "Modified Peptide", "Charge", "Retention", "Intensity",
"Delta Mass", "Number of Missed Cleavages", "Is Unique", "Protein", "Hyperscore",
"Assigned Modifications"
"Assigned Modifications", "Peptide Length"
],
"ion": [
"Peptide Sequence", "Modified Sequence", "Charge", "Protein", "Intensity"
Expand Down
9 changes: 9 additions & 0 deletions pmultiqc/modules/maxquant/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def _process_evidence_file(self):
"maxquant_delta_mass_da": None,
"peptides_quant_table": None,
"protein_quant_table": None,
"peptide_length": None,
}

if "evidence" not in self.maxquant_paths.keys():
Expand Down Expand Up @@ -371,6 +372,14 @@ def _draw_quantification_plots(self):
error_name="draw_peptide_table"
)

# Peptide Length Distribution
self._safe_draw_if_exists(
id_plots.draw_peptide_length_distribution,
self.sub_sections["identification"],
self.mq_results["get_evidence_dicts"].get("peptide_length"),
error_name="draw_peptide_length_distribution"
)

self._safe_draw_if_exists(
maxquant_plots.draw_protein_table,
self.sub_sections["quantification"],
Expand Down
20 changes: 20 additions & 0 deletions pmultiqc/modules/maxquant/maxquant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,9 @@ def get_evidence(file_path):
# Peptides Quantification Table / Protein Quantification Table
peptides_quant_table, protein_quant_table = evidence_peptides_table(evidence_df)

# Peptide Length Distribution
peptide_length = evidence_peptide_length(evidence_df)

result = {
"top_contaminants": top_cont_dict,
"peptide_intensity": peptide_intensity_dict,
Expand All @@ -569,6 +572,7 @@ def get_evidence(file_path):
"maxquant_delta_mass_da": maxquant_delta_mass_da,
"peptides_quant_table": peptides_quant_table,
"protein_quant_table": protein_quant_table,
"peptide_length": peptide_length,
}

logger.info("Completed processing evidence data")
Expand Down Expand Up @@ -1228,6 +1232,22 @@ def evidence_peptides_table(evidence_data):
return peptides_result_dict, protein_result_dict


# evidence_peptide_length
def evidence_peptide_length(df):
    """Count peptide lengths per raw file from MaxQuant evidence data.

    Args:
        df: Evidence DataFrame expected to provide the "raw file", "length"
            and "sequence" columns.

    Returns:
        A dict mapping each raw file to ``{peptide_length: count}`` with the
        lengths in ascending numeric order, or ``None`` when a required
        column is missing.
    """
    # "raw file" is the grouping key, so it must be present as well;
    # "sequence" is not read here but stays in the guard to preserve the
    # original precondition.
    required = ["raw file", "length", "sequence"]
    if any(column not in df.columns for column in required):
        return None

    df_sub = df[["raw file", "length"]].copy()
    # Coerce to numeric so text-typed lengths still sort as 7, 9, 10;
    # unparsable values become NaN and are dropped.
    df_sub["length"] = pd.to_numeric(df_sub["length"], errors="coerce")
    df_sub = df_sub.dropna(subset=["length"])

    plot_data = {}
    for run, group in df_sub.groupby("raw file"):
        plot_data[run] = (
            group["length"].astype(int).value_counts().sort_index().to_dict()
        )

    return plot_data


# 4.msms.txt
def get_msms(file_path: Union[Path, str], evidence_df: pd.DataFrame = None):
"""
Expand Down
24 changes: 20 additions & 4 deletions pmultiqc/modules/quantms/quantms.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@
draw_summary_protein_ident_table,
draw_identi_num,
draw_peptide_intensity,
draw_long_trends
draw_long_trends,
draw_peptide_length_distribution
)
from pmultiqc.modules.common.plots.general import (
draw_heatmap,
Expand Down Expand Up @@ -176,6 +177,7 @@ def __init__(self, find_log_files_func, sub_sections, heatmap_colors):
self.sample_df = pd.DataFrame()
self.file_df = pd.DataFrame()
self.long_trends = {}
self.peptide_length = {}

def get_data(self):

Expand Down Expand Up @@ -322,7 +324,8 @@ def draw_plots(self):
self.ms_with_psm,
self.cal_num_table_data,
self.quantms_modified,
self.ms_without_psm
self.ms_without_psm,
self.peptide_length
) = parse_diann_report(
sub_sections=self.sub_sections,
diann_report_path=self.diann_report_path,
Expand Down Expand Up @@ -476,6 +479,13 @@ def draw_plots(self):
long_trends_data=self.long_trends
)

# Peptide Length Distribution
if self.peptide_length:
draw_peptide_length_distribution(
sub_section=self.sub_sections["identification"],
plot_data=self.peptide_length
)

if self.quantms_pep_intensity:
draw_peptide_intensity(
sub_section=self.sub_sections["quantification"],
Expand Down Expand Up @@ -1384,8 +1394,9 @@ def get_unimod_modification(modifis):
mod_plot_by_run = dict()
modified_cats = list()

data_per_run = dict()
num_table_at_run = dict()
data_per_run = {}
num_table_at_run = {}
peptide_length = {}

if config.kwargs["remove_decoy"]:
psm = psm[psm["opt_global_cv_MS:1002217_decoy_peptide"] == 0].copy()
Expand Down Expand Up @@ -1455,8 +1466,13 @@ def get_unimod_modification(modifis):

ml_spec_ident_final[m] = len(set(self.identified_spectrum[m]))

stats_dict = group["pep_length"].value_counts().sort_index().to_dict()
peptide_length[m] = stats_dict

num_table_at_sample = cal_num_table_at_sample(self.file_df, data_per_run)

self.peptide_length = peptide_length

self.cal_num_table_data = {
"sdrf_samples": num_table_at_sample,
"ms_runs": num_table_at_run
Expand Down