Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 53 additions & 13 deletions bin/combine_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,27 @@ def parse_args(args=None):
metavar="FILE",
help="Bin depths summary file.",
)
parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.")
parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")
parser.add_argument(
"-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file."
)
parser.add_argument(
"-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file."
)
parser.add_argument(
"-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file."
)
parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
parser.add_argument(
"-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"]
"-p",
"--summarisepydamage_summary",
metavar="FILE",
help="summarisepydamage table file.",
)
parser.add_argument(
"-t",
"--binqc_tool",
help="Bin QC tool used",
choices=["busco", "checkm", "checkm2"],
)

parser.add_argument(
Expand Down Expand Up @@ -76,7 +91,9 @@ def parse_cat_table(cat_table):
)
# merge all rank columns into a single column
df["CAT_rank"] = (
df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip()
df.filter(regex="rank_\d+")
.apply(lambda x: ";".join(x.dropna()), axis=1)
.str.lstrip()
)
# remove rank_* columns
df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True)
Expand All @@ -87,11 +104,7 @@ def parse_cat_table(cat_table):
def main(args=None):
args = parse_args(args)

if (
not args.binqc_summary
and not args.quast_summary
and not args.gtdbtk_summary
):
if not args.binqc_summary and not args.quast_summary and not args.gtdbtk_summary:
sys.exit(
"No summary specified! "
"Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary."
Expand All @@ -106,15 +119,19 @@ def main(args=None):

# handle bin depths
results = pd.read_csv(args.depths_summary, sep="\t")
results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns]
results.columns = [
"Depth " + str(col) if col != "bin" else col for col in results.columns
]
bins = results["bin"].sort_values().reset_index(drop=True)

if args.binqc_summary and args.binqc_tool == "busco":
busco_results = pd.read_csv(args.binqc_summary, sep="\t")
busco_bins = set(busco_results["Input_file"])

if set(bins) != busco_bins and len(busco_bins.intersection(set(bins))) > 0:
warnings.warn("Bins in BUSCO summary do not match bins in bin depths summary")
warnings.warn(
"Bins in BUSCO summary do not match bins in bin depths summary"
)
elif len(busco_bins.intersection(set(bins))) == 0:
sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!")
results = pd.merge(
Expand Down Expand Up @@ -171,7 +188,9 @@ def main(args=None):

if args.quast_summary:
quast_results = pd.read_csv(args.quast_summary, sep="\t")
if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)):
if not bins.equals(
quast_results["Assembly"].sort_values().reset_index(drop=True)
):
sys.exit("Bins in QUAST summary do not match bins in bin depths summary!")
results = pd.merge(
results, quast_results, left_on="bin", right_on="Assembly", how="outer"
Expand All @@ -197,6 +216,27 @@ def main(args=None):
how="outer",
)

if args.summarisepydamage_summary:
summarisepydamage_results = pd.read_csv(
args.summarisepydamage_summary, sep="\t"
)
summarisepydamage_results["id"] = (summarisepydamage_results["id"]).replace(
"_pydamagebins", ""
)
if not bins.equals(
summarisepydamage_results["id"].sort_values().reset_index(drop=True)
):
sys.exit(
"Bins in summarisepydamage summary do not match bins in bin depths summary!"
)
results = pd.merge(
results,
summarisepydamage_results,
left_on="bin",
right_on="id",
how="outer",
) # assuming depths for all bins are given

results.to_csv(args.out, sep="\t")


Expand Down
91 changes: 91 additions & 0 deletions bin/summarise_pydamage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3

## summarise_pydamage.py
## Originally written by James Fellows Yates and released under the MIT license.
## See git repository (https://github.com/nf-core/mag) for full license text.

import os
import sys
import argparse
import pandas as pd


def parse_args(args=None):
    """Parse command-line options for summarise_pydamage.

    Returns an argparse.Namespace with ``input``, ``output``, ``name`` and
    ``verbose`` attributes; ``--version`` prints the version and exits.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input", metavar="FILE", required=True,
        help="Input CSV file output of a pyDamage analyze command",
    )
    parser.add_argument(
        "-o", "--output", metavar="FILE", required=False,
        help="Path to output TSV file with all statistic values summarised with as a median value. If not supplied, default output is input with _summarised appended to the file name.",
    )
    parser.add_argument(
        "-n", "--name", metavar="STRING", required=True,
        help="Sample name for appending to summarised result as ID",
    )
    # BooleanOptionalAction (Python >= 3.9) also generates a --no-verbose flag.
    parser.add_argument(
        "-v", "--verbose", required=False, action=argparse.BooleanOptionalAction,
        help="Print to console more logging information",
    )
    parser.add_argument("--version", action="version", version="%(prog)s 0.0.1")

    return parser.parse_args(args)


def main(args=None):
    """Summarise a pyDamage ``analyze`` CSV into per-sample median statistics.

    The per-contig table is labelled with the sample name supplied via
    ``--name``, the per-contig ``reference`` column is dropped, and the
    median of every remaining column is written to a TSV file whose columns
    are prefixed with ``median_``.
    """
    # Fix: dropped a dead, unused `argparse.ArgumentParser(...)` that was
    # constructed here and immediately shadowed by parse_args().
    args = parse_args(args)

    if args.output is not None:
        if os.path.abspath(args.output) == os.path.abspath(args.input):
            sys.exit(
                "[summarise_pydamage.py] ERROR: Input and output files names must be different."
            )

    if args.verbose:
        print("[summarise_pydamage.py] PROCESSING: Loading " + args.input)

    pydamage_raw = pd.read_csv(args.input, sep=",")

    if args.verbose:
        print("[summarise_pydamage.py] PROCESSING: appending sample name")

    # Fix: use the (required) --name argument as the sample ID. Previously
    # the input file stem was used and --name was silently ignored, so the
    # "id" column did not carry the sample/bin name documented in the CLI
    # help and passed by the Nextflow module (-n ${meta.id}).
    pydamage_raw["id"] = args.name

    if args.verbose:
        print(
            "[summarise_pydamage.py] PROCESSING: cleaning up table, and calculating median"
        )

    # NOTE(review): assumes every column other than "reference" is numeric
    # (true for pyDamage analyze output); a non-numeric column would make
    # median() raise on recent pandas — confirm against pyDamage versions.
    pydamage_summarised = pydamage_raw.drop("reference", axis=1).groupby("id").median()
    pydamage_summarised = pydamage_summarised.add_prefix("median_")

    if args.verbose:
        print("[summarise_pydamage.py] FINALISING: saving file")

    # Resolve the output path: force a .tsv extension on an explicit output,
    # or default to "<input stem>_summarised.tsv" next to the input file.
    if args.output is not None:
        if os.path.splitext(args.output)[1] != ".tsv":
            outfile = os.path.abspath(args.output + ".tsv")
        else:
            outfile = os.path.abspath(args.output)
    else:
        outfile = os.path.splitext(args.input)[0] + "_summarised.tsv"

    pydamage_summarised.to_csv(outfile, sep="\t")


if __name__ == "__main__":
    sys.exit(main())
9 changes: 9 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,15 @@ process {
]
}

// Per-bin pyDamage summarisation: name each task and its output prefix after
// the bin, and publish results under the GenomeBinning QC pydamage directory.
// NOTE(review): the module's own `tag` directive reads meta.id, not
// task.ext.tag — confirm ext.tag here actually takes effect.
// NOTE(review): prefix uses meta.bin_id while the module falls back to
// meta.id — confirm meta.bin_id is set on this channel.
withName: SUMMARISEPYDAMAGE {
    ext.tag = { "${meta.bin_id}" }
    ext.prefix = { "${meta.bin_id}" }
    publishDir = [
        path: { "${params.outdir}/GenomeBinning/QC/pydamage/analyze_bins/" },
        mode: params.publish_dir_mode,
    ]
}

withName: SAMTOOLS_FAIDX {
ext.prefix = { "${meta.assembler}-${meta.id}" }
publishDir = [
Expand Down
13 changes: 8 additions & 5 deletions modules/local/bin_summary/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,27 @@ process BIN_SUMMARY {
path quast_sum
path gtdbtk_sum
path cat_sum
val binqc_tool
path summarisepydamage_sum
val binqc_tool

output:
path "bin_summary.tsv", emit: summary
path "versions.yml" , emit: versions
path "versions.yml", emit: versions

script:
def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
def summarisepydamage_summary = summarisepydamage_sum.sort().size() > 0 ? "--summarisepydamage_summary ${summarisepydamage_sum}" : ""
"""
combine_tables.py \
--depths_summary ${bin_depths} \
${binqc_summary} \
${quast_summary} \
${gtdbtk_summary} \
${cat_summary} \
${summarisepydamage_summary} \
--binqc_tool ${binqc_tool} \
--out bin_summary.tsv

Expand Down
7 changes: 7 additions & 0 deletions modules/local/summarisepydamage/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::pydamage=1.0"
49 changes: 49 additions & 0 deletions modules/local/summarisepydamage/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Summarise a per-bin pyDamage analyze CSV into a single-row TSV of median
// damage statistics, via the pipeline's bin/summarise_pydamage.py script.
process SUMMARISEPYDAMAGE {
    tag "${meta.id}"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/pydamage:1.0--pyhdfd78af_0'
        : 'biocontainers/pydamage:1.0--pyhdfd78af_0'}"

    input:
    tuple val(meta), path(csv)

    output:
    tuple val(meta), path("*.tsv"), emit: summary_tsv
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    summarise_pydamage.py \\
        ${args} \\
        -i ${csv} \\
        -o ${prefix}_pydamagebins_summarised.tsv \\
        -n ${meta.id}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2)
    END_VERSIONS
    """

    stub:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    echo ${args}

    # Fix: the stub previously touched ${prefix}_pydamage_summarised.csv,
    # which matched neither the '*.tsv' output glob nor the real script's
    # output name, so stub runs failed output validation.
    touch ${prefix}_pydamagebins_summarised.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        summarisepydamage: \$(summarise_pydamage.py --version | cut -d ' ' -f 2 )
    END_VERSIONS
    """
}
54 changes: 54 additions & 0 deletions modules/local/summarisepydamage/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "summarisepydamage"
description: Summarise per-contig pyDamage analyze results into per-sample median damage statistics
keywords:
  - pydamage
  - ancient DNA
  - damage
  - metagenomics
tools:
- "summarisepydamage":
description: "Processing script for summarising damage parameter estimation for ancient DNA by pyDamage."
homepage: "https://nf-co.re/mag"
documentation: "https://nf-co.re/mag"
tool_dev_url: "https://nf-co.re/mag"
licence: ["GPL v3-or-later"]

input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- csv:
type: file
description: Output CSV table from pyDamage analyze or pydamage filter
pattern: "*.csv"
ontologies:
- edam: "http://edamontology.org/format_3752" # CSV

output:
  summary_tsv:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - "*.tsv":
          type: file
          description: Summarised TSV calculating median values across all contigs
          pattern: "*.tsv"
          ontologies:
            - edam: "http://edamontology.org/format_3475" # TSV

versions:
- "versions.yml":
type: file
description: File containing software versions
pattern: "versions.yml"
ontologies:
- edam: "http://edamontology.org/format_3750" # YAML

authors:
- "@jfy133"
maintainers:
- "@jfy133"
Loading
Loading