diff --git a/bin/combine_tables.py b/bin/combine_tables.py
index 2cff1f854..b60de2f16 100755
--- a/bin/combine_tables.py
+++ b/bin/combine_tables.py
@@ -19,12 +19,27 @@ def parse_args(args=None):
         metavar="FILE",
         help="Bin depths summary file.",
     )
-    parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.")
-    parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
-    parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")
+    parser.add_argument(
+        "-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file."
+    )
+    parser.add_argument(
+        "-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file."
+    )
+    parser.add_argument(
+        "-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file."
+    )
     parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
     parser.add_argument(
-        "-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"]
+        "-p",
+        "--summarisepydamage_summary",
+        metavar="FILE",
+        help="Summarised pyDamage table file.",
+    )
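+    # Note: like the other summary inputs, this flag is optional; when it is
+    # not given, the merged table simply omits the median pyDamage columns.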
+    parser.add_argument(
+        "-t",
+        "--binqc_tool",
+        help="Bin QC tool used",
+        choices=["busco", "checkm", "checkm2"],
     )
     parser.add_argument(
@@ -76,7 +91,9 @@ def parse_cat_table(cat_table):
     )
     # merge all rank columns into a single column
     df["CAT_rank"] = (
-        df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip()
+        df.filter(regex="rank_\d+")
+        .apply(lambda x: ";".join(x.dropna()), axis=1)
+        .str.lstrip()
     )
     # remove rank_* columns
     df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True)
@@ -87,11 +104,7 @@ def parse_cat_table(cat_table):
 def main(args=None):
     args = parse_args(args)
 
-    if (
-        not args.binqc_summary
-        and not args.quast_summary
-        and not args.gtdbtk_summary
-    ):
+    if not args.binqc_summary and not args.quast_summary and not args.gtdbtk_summary:
         sys.exit(
             "No summary specified! "
             "Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary."
@@ -106,7 +119,9 @@ def main(args=None):
 
     # handle bin depths
     results = pd.read_csv(args.depths_summary, sep="\t")
-    results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns]
+    results.columns = [
+        "Depth " + str(col) if col != "bin" else col for col in results.columns
+    ]
     bins = results["bin"].sort_values().reset_index(drop=True)
 
     if args.binqc_summary and args.binqc_tool == "busco":
@@ -114,7 +129,9 @@ def main(args=None):
         busco_bins = set(busco_results["Input_file"])
 
         if set(bins) != busco_bins and len(busco_bins.intersection(set(bins))) > 0:
-            warnings.warn("Bins in BUSCO summary do not match bins in bin depths summary")
+            warnings.warn(
+                "Bins in BUSCO summary do not match bins in bin depths summary"
+            )
         elif len(busco_bins.intersection(set(bins))) == 0:
             sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!")
         results = pd.merge(
@@ -171,7 +188,9 @@ def main(args=None):
 
     if args.quast_summary:
         quast_results = pd.read_csv(args.quast_summary, sep="\t")
-        if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)):
+        if not bins.equals(
+            quast_results["Assembly"].sort_values().reset_index(drop=True)
+        ):
             sys.exit("Bins in QUAST summary do not match bins in bin depths summary!")
         results = pd.merge(
             results, quast_results, left_on="bin", right_on="Assembly", how="outer"
@@ -197,6 +216,27 @@ def main(args=None):
             how="outer",
         )
 
+    if args.summarisepydamage_summary:
+        summarisepydamage_results = pd.read_csv(
+            args.summarisepydamage_summary, sep="\t"
+        )
+        summarisepydamage_results["id"] = summarisepydamage_results["id"].str.replace(
+            "_pydamagebins", "", regex=False
+        )
+        if not bins.equals(
+            summarisepydamage_results["id"].sort_values().reset_index(drop=True)
+        ):
+            sys.exit(
+                "Bins in summarisepydamage summary do not match bins in bin depths summary!"
+            )
+        results = pd.merge(
+            results,
+            summarisepydamage_results,
+            left_on="bin",
+            right_on="id",
+            how="outer",
+        )  # assuming depths for all bins are given
+
     results.to_csv(args.out, sep="\t")
diff --git a/bin/summarise_pydamage.py b/bin/summarise_pydamage.py
new file mode 100755
index 000000000..eeba7ef69
--- /dev/null
+++ b/bin/summarise_pydamage.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+
+## summarise_pydamage.py
+## Originally written by James Fellows Yates and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
+import os
+import sys
+import argparse
+import pandas as pd
+
+
+def parse_args(args=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        metavar="FILE",
+        help="Input CSV file output of a pyDamage analyze command",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=False,
+        metavar="FILE",
+        help="Path to output TSV file with all statistic values summarised as median values. If not supplied, the default output is the input file name with _summarised appended.",
+    )
+    parser.add_argument(
+        "-n",
+        "--name",
+        required=True,
+        metavar="STRING",
+        help="Sample name for appending to summarised result as ID",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        required=False,
+        action=argparse.BooleanOptionalAction,
+        help="Print additional logging information to the console",
+    )
+    parser.add_argument("--version", action="version", version="%(prog)s 0.0.1")
+
+    return parser.parse_args(args)
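+
+
+# The summary keeps one row per bin: all numeric pyDamage statistics are
+# collapsed to their median across the bin's contigs and prefixed with
+# "median_".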
+def main(args=None):
+    args = parse_args(args)
+
+    if args.output is not None:
+        if os.path.abspath(args.output) == os.path.abspath(args.input):
+            sys.exit(
+                "[summarise_pydamage.py] ERROR: Input and output file names must be different."
+            )
+
+    if args.verbose:
+        print("[summarise_pydamage.py] PROCESSING: Loading " + args.input)
+
+    pydamage_raw = pd.read_csv(args.input, sep=",")
+
+    if args.verbose:
+        print("[summarise_pydamage.py] PROCESSING: appending ID from input file name")
+
+    pydamage_raw["id"] = os.path.splitext(os.path.basename(args.input))[0]
+
+    if args.verbose:
+        print(
+            "[summarise_pydamage.py] PROCESSING: cleaning up table, and calculating median"
+        )
+
+    pydamage_summarised = pydamage_raw.drop("reference", axis=1).groupby("id").median()
+    pydamage_summarised = pydamage_summarised.add_prefix("median_")
+
+    if args.verbose:
+        print("[summarise_pydamage.py] FINALISING: saving file")
+
+    if args.output is not None:
+        if os.path.splitext(args.output)[1] != ".tsv":
+            outfile = os.path.abspath(args.output + ".tsv")
+        else:
+            outfile = os.path.abspath(args.output)
+    else:
+        outfile = os.path.splitext(args.input)[0] + "_summarised.tsv"
+
+    pydamage_summarised.to_csv(outfile, sep="\t")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
index 034b56eea..909ec74c8 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -723,6 +723,15 @@ process {
         ]
     }
 
+    withName: SUMMARISEPYDAMAGE {
+        ext.tag = { "${meta.bin_id}" }
+        ext.prefix = { "${meta.bin_id}" }
+        publishDir = [
+            path: { "${params.outdir}/GenomeBinning/QC/pydamage/analyze_bins/" },
+            mode: params.publish_dir_mode,
+        ]
+    }
+
     withName: SAMTOOLS_FAIDX {
         ext.prefix = { "${meta.assembler}-${meta.id}" }
         publishDir = [
diff --git a/modules/local/bin_summary/main.nf b/modules/local/bin_summary/main.nf
index 07784d830..82d4e11f2 100644
--- a/modules/local/bin_summary/main.nf
+++ b/modules/local/bin_summary/main.nf
@@ -11,17 +11,19 @@ process BIN_SUMMARY {
     path quast_sum
     path gtdbtk_sum
     path cat_sum
-    val binqc_tool
+    path summarisepydamage_sum
+    val binqc_tool
 
     output:
     path "bin_summary.tsv", emit: summary
-    path "versions.yml"   , emit: versions
+    path "versions.yml", emit: versions
 
     script:
-    def binqc_summary  = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : ""
-    def quast_summary  = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
+    def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : ""
+    def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
     def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
-    def cat_summary    = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
+    def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
+    def summarisepydamage_summary = summarisepydamage_sum.sort().size() > 0 ? "--summarisepydamage_summary ${summarisepydamage_sum}" : ""
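+    // Each of the optional inputs can arrive as an empty list (via
+    // .ifEmpty([]) in the workflow), in which case no file is staged and the
+    // corresponding command-line flag is dropped from the call below.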
"--summarisepydamage_summary ${summarisepydamage_sum}" : "" """ combine_tables.py \ --depths_summary ${bin_depths} \ @@ -29,6 +31,7 @@ process BIN_SUMMARY { ${quast_summary} \ ${gtdbtk_summary} \ ${cat_summary} \ + ${summarisepydamage_summary} \ --binqc_tool ${binqc_tool} \ --out bin_summary.tsv diff --git a/modules/local/summarisepydamage/environment.yml b/modules/local/summarisepydamage/environment.yml new file mode 100644 index 000000000..56dbdefa8 --- /dev/null +++ b/modules/local/summarisepydamage/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::pydamage=1.0" diff --git a/modules/local/summarisepydamage/main.nf b/modules/local/summarisepydamage/main.nf new file mode 100644 index 000000000..8847802fd --- /dev/null +++ b/modules/local/summarisepydamage/main.nf @@ -0,0 +1,49 @@ +process SUMMARISEPYDAMAGE { + tag "${meta.id}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/pydamage:1.0--pyhdfd78af_0' + : 'biocontainers/pydamage:1.0--pyhdfd78af_0'}" + + input: + tuple val(meta), path(csv) + + output: + tuple val(meta), path("*.tsv"), emit: summary_tsv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + summarise_pydamage.py \\ + ${args} \\ + -i ${csv} \\ + -o ${prefix}_pydamagebins_summarised.tsv \\ + -n ${meta.id} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo ${args} + + touch ${prefix}_pydamage_summarised.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2 ) + END_VERSIONS + """ +} diff --git a/modules/local/summarisepydamage/meta.yml b/modules/local/summarisepydamage/meta.yml new file mode 100644 index 000000000..7af9b9bc0 --- /dev/null +++ b/modules/local/summarisepydamage/meta.yml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "summarisepydamage" +description: write your description here +keywords: + - sort + - example + - genomics +tools: + - "summarisepydamage": + description: "Processing script for summarising damage parameter estimation for ancient DNA by pyDamage." + homepage: "https://nf-co.re/mag" + documentation: "https://nf-co.re/mag" + tool_dev_url: "https://nf-co.re/mag" + licence: ["GPL v3-or-later"] + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - csv: + type: file + description: Output CSV table from pyDamage analyze or pydamage filter + pattern: "*.csv" + ontologies: + - edam: "http://edamontology.org/format_3752" # CSV + +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo ${args}
+
+    touch ${prefix}_pydamagebins_summarised.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/summarisepydamage/meta.yml b/modules/local/summarisepydamage/meta.yml
new file mode 100644
index 000000000..7af9b9bc0
--- /dev/null
+++ b/modules/local/summarisepydamage/meta.yml
@@ -0,0 +1,54 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "summarisepydamage"
+description: Summarises per-contig pyDamage damage statistics as per-bin median values
+keywords:
+  - ancient dna
+  - damage
+  - pydamage
+  - summary
+tools:
+  - "summarisepydamage":
+      description: "Processing script for summarising damage parameter estimates for ancient DNA produced by pyDamage."
+      homepage: "https://nf-co.re/mag"
+      documentation: "https://nf-co.re/mag"
+      tool_dev_url: "https://nf-co.re/mag"
+      licence: ["MIT"]
+
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - csv:
+        type: file
+        description: Output CSV table from pyDamage analyze or pyDamage filter
+        pattern: "*.csv"
+        ontologies:
+          - edam: "http://edamontology.org/format_3752" # CSV
+
+output:
+  summary_tsv:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*.tsv":
+          type: file
+          description: Summarised TSV with median values calculated across all contigs of a bin
+          pattern: "*.tsv"
+          ontologies:
+            - edam: "http://edamontology.org/format_3475" # TSV
+
+  versions:
+    - "versions.yml":
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+        ontologies:
+          - edam: "http://edamontology.org/format_3750" # YAML
+
+authors:
+  - "@jfy133"
+maintainers:
+  - "@jfy133"
diff --git a/modules/local/summarisepydamage/tests/main.nf.test b/modules/local/summarisepydamage/tests/main.nf.test
new file mode 100644
index 000000000..0a3d6e402
--- /dev/null
+++ b/modules/local/summarisepydamage/tests/main.nf.test
@@ -0,0 +1,68 @@
+// nf-core modules test summarisepydamage
+nextflow_process {
+
+    name "Test Process SUMMARISEPYDAMAGE"
+    script "../main.nf"
+    process "SUMMARISEPYDAMAGE"
+
+    tag "modules"
+    tag "modules_"
+    tag "summarisepydamage"
+    tag "pydamage_analyze"
+
+    setup {
+        run('PYDAMAGE_ANALYZE') {
+            script '../../../nf-core/pydamage/analyze/main.nf'
+            process {
+                """
+                input[0] = [
+                    [ id:'test' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true)
+                ]
+                """
+            }
+        }
+    }
+
+    test("homo_sapiens - csv") {
+
+        when {
+            process {
+                """
+                input[0] = PYDAMAGE_ANALYZE.out.csv
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("homo_sapiens - csv - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = PYDAMAGE_ANALYZE.out.csv
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/local/summarisepydamage/tests/main.nf.test.snap b/modules/local/summarisepydamage/tests/main.nf.test.snap
new file mode 100644
index 000000000..fd5210fbb
--- /dev/null
+++ b/modules/local/summarisepydamage/tests/main.nf.test.snap
@@ -0,0 +1,68 @@
+{
+    "homo_sapiens - csv": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test_pydamagebins_summarised.tsv:md5,d497b394b32609778351527fc7fe16d2"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,8bf54a3cd6f2078212d496790c573191"
+                ],
+                "summary_tsv": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test_pydamagebins_summarised.tsv:md5,d497b394b32609778351527fc7fe16d2"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,8bf54a3cd6f2078212d496790c573191"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "25.04.6"
+        },
+        "timestamp": "2025-08-19T13:53:19.753209317"
+    },
+    "homo_sapiens - csv - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test_pydamagebins_summarised.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,8bf54a3cd6f2078212d496790c573191"
+                ],
+                "summary_tsv": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test_pydamagebins_summarised.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,8bf54a3cd6f2078212d496790c573191"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "25.04.6"
+        },
+        "timestamp": "2025-08-19T13:53:32.312662338"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/gunc/run/main.nf b/modules/nf-core/gunc/run/main.nf
index 9ee614e4c..d4562c8e1 100644
--- a/modules/nf-core/gunc/run/main.nf
+++ b/modules/nf-core/gunc/run/main.nf
@@ -30,6 +30,7 @@ process GUNC_RUN {
         --db_file $db \\
         --threads $task.cpus \\
         $args
+
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
diff --git a/subworkflows/local/PYDAMAGE_BINS/main.nf b/subworkflows/local/PYDAMAGE_BINS/main.nf
new file mode 100644
index 000000000..49c10d097
--- /dev/null
+++ b/subworkflows/local/PYDAMAGE_BINS/main.nf
@@ -0,0 +1,93 @@
+/*
+ * PYDAMAGE_BINS: Summarise per-contig pyDamage results at the bin level
+ */
+
+include { SUMMARISEPYDAMAGE } from '../../../modules/local/summarisepydamage/main'
+
+workflow PYDAMAGE_BINS {
+    take:
+    ch_pydamage_results
+    ch_input_for_postbinning
+
+    main:
+    ch_versions = Channel.empty()
+
+    // Split the per-sample bin lists so each bin fasta is emitted separately
+    ch_contig_bin_assignments = ch_input_for_postbinning.transpose()
+
+    // Get meta-only version of input bins for later re-binding
+    ch_input_for_bins_metaonly = ch_contig_bin_assignments.map { meta, binfile -> [[bin_id: binfile.name], meta] }
+
+    // Extract contig ID per bin
+    ch_contig_bin_assignments_meta = ch_contig_bin_assignments
+        .map { meta, binfile -> [meta + [bin_id: binfile.name], binfile] }
+        .splitFasta(record: [header: true])
+        .map { meta, record -> [[id: meta.id, contig_id: record.header], [bin_id: meta.bin_id]] }
+
+    // Make a usable sample ID/contig name meta map for merging
+    ch_pydamage_filtered_results = ch_pydamage_results
+        .splitCsv(header: true)
+        .map { meta, pydamage_stats -> [[id: meta.id, contig_id: pydamage_stats.reference], meta, pydamage_stats] }
+
+    // Merge the sample ID into the pyDamage results so we can group by contig AND node name
+    ch_reordered_pydamage_stats = ch_contig_bin_assignments_meta
+        .join(ch_pydamage_filtered_results)
+        .map { _coremeta, _bin_meta, _full_meta, pydamage_stats -> [_bin_meta, pydamage_stats] }
+        .groupTuple(by: 0)
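+
+    // The join key above is the composite [id: <sample>, contig_id: <contig>]
+    // map, so pyDamage rows are only matched to bin contigs from the same
+    // sample.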
+
+    // Convert the reordered contig rows to a CSV file per bin, and re-attach
+    // meta based on bin_id (i.e. from the bin file name)
+    ch_pydamage_to_bins = ch_reordered_pydamage_stats
+        .map { bin_id, data ->
+            // Build the CSV content: header from the first row's keys
+            def header = data[0].keySet().join(',')
+            def rows = data.collect { it.values().join(',') }.join('\n')
+            def content = header + '\n' + rows
+            [bin_id, content]
+        }
+        .collectFile(
+            [storeDir: "${params.outdir}/GenomeBinning/QC/pydamage/analyze_bins/"]
+        ) { meta, content ->
+            ["${meta.bin_id}_pydamagebins.csv", content]
+        }
+        .map { file ->
+            def meta = [:]
+            meta.bin_id = file.getName() - '_pydamagebins.csv'
+            [meta, file]
+        }
+        .join(ch_input_for_bins_metaonly)
+        .map { bin_id, file, meta ->
+            def meta_new = meta + bin_id
+            [meta_new, file]
+        }
+
+    // Generate the per-bin summary
+    SUMMARISEPYDAMAGE(ch_pydamage_to_bins)
+    ch_versions = ch_versions.mix(SUMMARISEPYDAMAGE.out.versions)
+
+    // Stick the per-bin summaries into a single summary TSV
+    ch_aggregate_summaries = SUMMARISEPYDAMAGE.out.summary_tsv
+        .map { _meta, file -> [file] }
+        .splitCsv(header: true, sep: '\t')
+        .map { content ->
+            content[0].id = content[0].id - '_pydamagebins'
+            [[id: 'all'], content.flatten()]
+        }
+        .groupTuple()
+        .map { _meta, contents ->
+            // Get keys of the first row to act as the header
+            def header = contents[0][0].keySet().join('\t')
+            // Get values of remaining rows for cells
+            def rows = contents.collect { it[0].values().join('\t') }.join('\n')
+            header + '\n' + rows
+        }
+        .collectFile(
+            name: "pydamage_bin_summary.tsv",
+            newLine: true,
+            sort: false,
+            storeDir: "${params.outdir}/GenomeBinning/QC/",
+        )
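+
+    // collectFile concatenates the per-bin median rows into one TSV that is
+    // both published to GenomeBinning/QC/ and emitted for BIN_SUMMARY.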
+
+    emit:
+    tsv      = ch_aggregate_summaries
+    versions = ch_versions
+}
diff --git a/subworkflows/local/ancient_dna/main.nf b/subworkflows/local/ancient_dna/main.nf
index c9996f5e7..314bdaf3e 100644
--- a/subworkflows/local/ancient_dna/main.nf
+++ b/subworkflows/local/ancient_dna/main.nf
@@ -1,10 +1,10 @@
-include { BCFTOOLS_CONSENSUS      } from '../../../modules/nf-core/bcftools/consensus/main'
-include { BCFTOOLS_INDEX          } from '../../../modules/nf-core/bcftools/index/main'
-include { BCFTOOLS_VIEW           } from '../../../modules/nf-core/bcftools/view/main'
-include { FREEBAYES               } from '../../../modules/nf-core/freebayes/main'
-include { PYDAMAGE_ANALYZE        } from '../../../modules/nf-core/pydamage/analyze/main'
-include { PYDAMAGE_FILTER         } from '../../../modules/nf-core/pydamage/filter/main'
-include { SAMTOOLS_FAIDX as FAIDX } from '../../../modules/nf-core/samtools/faidx/main'
+include { BCFTOOLS_CONSENSUS } from '../../../modules/nf-core/bcftools/consensus/main'
+include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main'
+include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main'
+include { FREEBAYES } from '../../../modules/nf-core/freebayes/main'
+include { PYDAMAGE_ANALYZE } from '../../../modules/nf-core/pydamage/analyze/main'
+include { PYDAMAGE_FILTER } from '../../../modules/nf-core/pydamage/filter/main'
+include { SAMTOOLS_FAIDX as FAIDX } from '../../../modules/nf-core/samtools/faidx/main'
 
 workflow ANCIENT_DNA_ASSEMBLY_VALIDATION {
     take:
diff --git a/workflows/mag.nf b/workflows/mag.nf
index f6e3146a5..89e40b542 100644
--- a/workflows/mag.nf
+++ b/workflows/mag.nf
@@ -42,6 +42,7 @@ include { QUAST              } from '../modules/local/quast_run/main'
 include { QUAST_BINS         } from '../modules/local/quast_bins/main'
 include { QUAST_BINS_SUMMARY } from '../modules/local/quast_bins_summary/main'
 include { BIN_SUMMARY        } from '../modules/local/bin_summary/main'
+include { PYDAMAGE_BINS      } from '../subworkflows/local/PYDAMAGE_BINS/main'
 
 workflow MAG {
     take:
@@ -374,6 +375,10 @@ workflow MAG {
                 )
                 .groupTuple(by: 0)
 
+            /*
+             * Generate additional post-binning statistics for Bin QC
+             */
+
             DEPTHS(ch_input_for_postbinning, BINNING.out.metabat2depths, ch_reads_for_depths)
             ch_versions = ch_versions.mix(DEPTHS.out.versions)
 
@@ -450,6 +455,14 @@ workflow MAG {
            ch_gtdbtk_summary = Channel.empty()
        }
 
+       if (params.ancient_dna) {
+           PYDAMAGE_BINS(ANCIENT_DNA_ASSEMBLY_VALIDATION.out.pydamage_results, ch_input_for_postbinning)
+           ch_summarisepydamage = PYDAMAGE_BINS.out.tsv
+       }
+       else {
+           ch_summarisepydamage = Channel.empty()
+       }
+
        if ((!params.skip_binqc) || !params.skip_quast || !params.skip_gtdbtk) {
            BIN_SUMMARY(
                ch_input_for_binsummary,
@@ -457,6 +470,7 @@ workflow MAG {
                ch_quast_bins_summary.ifEmpty([]),
                ch_gtdbtk_summary.ifEmpty([]),
                ch_catpack_summary.ifEmpty([]),
+               ch_summarisepydamage.ifEmpty([]),
                params.binqc_tool,
            )
            ch_versions = ch_versions.mix(BIN_SUMMARY.out.versions)