Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 53 additions & 13 deletions bin/combine_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,27 @@ def parse_args(args=None):
metavar="FILE",
help="Bin depths summary file.",
)
parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.")
parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")
parser.add_argument(
"-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file."
)
parser.add_argument(
"-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file."
)
parser.add_argument(
"-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file."
)
parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
parser.add_argument(
"-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"]
"-p",
"--summarisepydamage_summary",
metavar="FILE",
help="summarisepydamage table file.",
)
parser.add_argument(
"-t",
"--binqc_tool",
help="Bin QC tool used",
choices=["busco", "checkm", "checkm2"],
)

parser.add_argument(
Expand Down Expand Up @@ -76,7 +91,9 @@ def parse_cat_table(cat_table):
)
# merge all rank columns into a single column
df["CAT_rank"] = (
df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip()
df.filter(regex="rank_\d+")
.apply(lambda x: ";".join(x.dropna()), axis=1)
.str.lstrip()
)
# remove rank_* columns
df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True)
Expand All @@ -87,11 +104,7 @@ def parse_cat_table(cat_table):
def main(args=None):
args = parse_args(args)

if (
not args.binqc_summary
and not args.quast_summary
and not args.gtdbtk_summary
):
if not args.binqc_summary and not args.quast_summary and not args.gtdbtk_summary:
sys.exit(
"No summary specified! "
"Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary."
Expand All @@ -106,15 +119,19 @@ def main(args=None):

# handle bin depths
results = pd.read_csv(args.depths_summary, sep="\t")
results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns]
results.columns = [
"Depth " + str(col) if col != "bin" else col for col in results.columns
]
bins = results["bin"].sort_values().reset_index(drop=True)

if args.binqc_summary and args.binqc_tool == "busco":
busco_results = pd.read_csv(args.binqc_summary, sep="\t")
busco_bins = set(busco_results["Input_file"])

if set(bins) != busco_bins and len(busco_bins.intersection(set(bins))) > 0:
warnings.warn("Bins in BUSCO summary do not match bins in bin depths summary")
warnings.warn(
"Bins in BUSCO summary do not match bins in bin depths summary"
)
elif len(busco_bins.intersection(set(bins))) == 0:
sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!")
results = pd.merge(
Expand Down Expand Up @@ -171,7 +188,9 @@ def main(args=None):

if args.quast_summary:
quast_results = pd.read_csv(args.quast_summary, sep="\t")
if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)):
if not bins.equals(
quast_results["Assembly"].sort_values().reset_index(drop=True)
):
sys.exit("Bins in QUAST summary do not match bins in bin depths summary!")
results = pd.merge(
results, quast_results, left_on="bin", right_on="Assembly", how="outer"
Expand All @@ -197,6 +216,27 @@ def main(args=None):
how="outer",
)

if args.summarisepydamage_summary:
summarisepydamage_results = pd.read_csv(
args.summarisepydamage_summary, sep="\t"
)
summarisepydamage_results["id"] = (summarisepydamage_results["id"]).replace(
"_pydamagebins", ""
)
if not bins.equals(
summarisepydamage_results["id"].sort_values().reset_index(drop=True)
):
sys.exit(
"Bins in summarisepydamage summary do not match bins in bin depths summary!"
)
results = pd.merge(
results,
summarisepydamage_results,
left_on="bin",
right_on="id",
how="outer",
) # assuming depths for all bins are given

results.to_csv(args.out, sep="\t")


Expand Down
91 changes: 91 additions & 0 deletions bin/summarise_pydamage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3

## summarise_pydamage.py
## Originally written by James Fellows Yates and released under the MIT license.
## See git repository (https://github.com/nf-core/mag) for full license text.

import os
import sys
import argparse
import pandas as pd


def parse_args(args=None):
    """Parse command-line options for summarise_pydamage.

    Returns an argparse.Namespace with ``input``, ``output``, ``name`` and
    ``verbose`` attributes; ``--version`` prints the version and exits.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input", metavar="FILE", required=True,
        help="Input CSV file output of a pyDamage analyze command",
    )
    parser.add_argument(
        "-o", "--output", metavar="FILE", required=False,
        help="Path to output TSV file with all statistic values summarised with as a median value. If not supplied, default output is input with _summarised appended to the file name.",
    )
    parser.add_argument(
        "-n", "--name", metavar="STRING", required=True,
        help="Sample name for appending to summarised result as ID",
    )
    # BooleanOptionalAction (Python >= 3.9) also generates a --no-verbose flag.
    parser.add_argument(
        "-v", "--verbose", required=False, action=argparse.BooleanOptionalAction,
        help="Print to console more logging information",
    )
    parser.add_argument("--version", action="version", version="%(prog)s 0.0.1")

    return parser.parse_args(args)


def main(args=None):
    """Summarise a pyDamage ``analyze`` CSV into per-sample median statistics.

    The per-contig table is labelled with the sample name supplied via
    ``--name``, the per-contig ``reference`` column is dropped, and the
    median of every remaining column is written to a TSV file whose columns
    are prefixed with ``median_``.
    """
    # Fix: dropped a dead, unused `argparse.ArgumentParser(...)` that was
    # constructed here and immediately shadowed by parse_args().
    args = parse_args(args)

    if args.output is not None:
        if os.path.abspath(args.output) == os.path.abspath(args.input):
            sys.exit(
                "[summarise_pydamage.py] ERROR: Input and output files names must be different."
            )

    if args.verbose:
        print("[summarise_pydamage.py] PROCESSING: Loading " + args.input)

    pydamage_raw = pd.read_csv(args.input, sep=",")

    if args.verbose:
        print("[summarise_pydamage.py] PROCESSING: appending sample name")

    # Fix: use the (required) --name argument as the sample ID. Previously
    # the input file stem was used and --name was silently ignored, so the
    # "id" column did not carry the sample/bin name documented in the CLI
    # help and passed by the Nextflow module (-n ${meta.id}).
    pydamage_raw["id"] = args.name

    if args.verbose:
        print(
            "[summarise_pydamage.py] PROCESSING: cleaning up table, and calculating median"
        )

    # NOTE(review): assumes every column other than "reference" is numeric
    # (true for pyDamage analyze output); a non-numeric column would make
    # median() raise on recent pandas — confirm against pyDamage versions.
    pydamage_summarised = pydamage_raw.drop("reference", axis=1).groupby("id").median()
    pydamage_summarised = pydamage_summarised.add_prefix("median_")

    if args.verbose:
        print("[summarise_pydamage.py] FINALISING: saving file")

    # Resolve the output path: force a .tsv extension on an explicit output,
    # or default to "<input stem>_summarised.tsv" next to the input file.
    if args.output is not None:
        if os.path.splitext(args.output)[1] != ".tsv":
            outfile = os.path.abspath(args.output + ".tsv")
        else:
            outfile = os.path.abspath(args.output)
    else:
        outfile = os.path.splitext(args.input)[0] + "_summarised.tsv"

    pydamage_summarised.to_csv(outfile, sep="\t")


if __name__ == "__main__":
    sys.exit(main())
9 changes: 9 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,15 @@ process {
]
}

// Per-bin pyDamage summarisation: name each task and its output prefix after
// the bin, and publish results under the GenomeBinning QC pydamage directory.
// NOTE(review): the module's own `tag` directive reads meta.id, not
// task.ext.tag — confirm ext.tag here actually takes effect.
// NOTE(review): prefix uses meta.bin_id while the module falls back to
// meta.id — confirm meta.bin_id is set on this channel.
withName: SUMMARISEPYDAMAGE {
    ext.tag = { "${meta.bin_id}" }
    ext.prefix = { "${meta.bin_id}" }
    publishDir = [
        path: { "${params.outdir}/GenomeBinning/QC/pydamage/analyze_bins/" },
        mode: params.publish_dir_mode,
    ]
}

withName: SAMTOOLS_FAIDX {
ext.prefix = { "${meta.assembler}-${meta.id}" }
publishDir = [
Expand Down
13 changes: 8 additions & 5 deletions modules/local/bin_summary/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,27 @@ process BIN_SUMMARY {
path quast_sum
path gtdbtk_sum
path cat_sum
val binqc_tool
path summarisepydamage_sum
val binqc_tool

output:
path "bin_summary.tsv", emit: summary
path "versions.yml" , emit: versions
path "versions.yml", emit: versions

script:
def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
def summarisepydamage_summary = summarisepydamage_sum.sort().size() > 0 ? "--summarisepydamage_summary ${summarisepydamage_sum}" : ""
"""
combine_tables.py \
--depths_summary ${bin_depths} \
${binqc_summary} \
${quast_summary} \
${gtdbtk_summary} \
${cat_summary} \
${summarisepydamage_summary} \
--binqc_tool ${binqc_tool} \
--out bin_summary.tsv

Expand Down
7 changes: 7 additions & 0 deletions modules/local/summarisepydamage/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::pydamage=1.0"
49 changes: 49 additions & 0 deletions modules/local/summarisepydamage/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Summarise a per-bin pyDamage analyze CSV into a single-row TSV of median
// damage statistics, via the pipeline's bin/summarise_pydamage.py script.
process SUMMARISEPYDAMAGE {
    tag "${meta.id}"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/pydamage:1.0--pyhdfd78af_0'
        : 'biocontainers/pydamage:1.0--pyhdfd78af_0'}"

    input:
    tuple val(meta), path(csv)

    output:
    tuple val(meta), path("*.tsv"), emit: summary_tsv
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    summarise_pydamage.py \\
        ${args} \\
        -i ${csv} \\
        -o ${prefix}_pydamagebins_summarised.tsv \\
        -n ${meta.id}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2)
    END_VERSIONS
    """

    stub:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    echo ${args}

    # Fix: the stub previously touched ${prefix}_pydamage_summarised.csv,
    # which matched neither the '*.tsv' output glob nor the real script's
    # output name, so stub runs failed output validation.
    touch ${prefix}_pydamagebins_summarised.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2 )
    END_VERSIONS
    """
}
54 changes: 54 additions & 0 deletions modules/local/summarisepydamage/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "summarisepydamage"
description: Summarise per-contig pyDamage analyze results into per-sample median damage statistics
keywords:
  - pydamage
  - ancient DNA
  - damage
  - metagenomics
tools:
- "summarisepydamage":
description: "Processing script for summarising damage parameter estimation for ancient DNA by pyDamage."
homepage: "https://nf-co.re/mag"
documentation: "https://nf-co.re/mag"
tool_dev_url: "https://nf-co.re/mag"
licence: ["GPL v3-or-later"]

input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- csv:
type: file
description: Output CSV table from pyDamage analyze or pydamage filter
pattern: "*.csv"
ontologies:
- edam: "http://edamontology.org/format_3752" # CSV

output:
  summary_tsv:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - "*.tsv":
          type: file
          description: Summarised TSV calculating median values across all contigs
          pattern: "*.tsv"
          ontologies:
            - edam: "http://edamontology.org/format_3475" # TSV

versions:
- "versions.yml":
type: file
description: File containing software versions
pattern: "versions.yml"
ontologies:
- edam: "http://edamontology.org/format_3750" # YAML

authors:
- "@jfy133"
maintainers:
- "@jfy133"
Loading
Loading