Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#861](https://github.com/nf-core/mag/pull/861) - Added `--generate_bigmag_file` to execute the bigmag workflow that generates the file to be used as input for [BIgMAG](https://github.com/jeffe107/BIgMAG) (added by @jeffe107)
- [#718](https://github.com/nf-core/mag/pull/718) - Add support for independent long-read metagenomic assembly (requested by @ljmesi and many others, added by @muabnezor)
- [#718](https://github.com/nf-core/mag/pull/718) - Added metaMDBG and (meta)Flye as long read assemblers (added by @muabnezor)
- [#718](https://github.com/nf-core/mag/pull/718) - Added host removal for long reads using minimap2 as aligner (added by @muabnezor)
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@

> Orakov, A., Fullam, A., Coelho, A. P., Khedkar, S., Szklarczyk, D., Mende, D. R., Schmidt, T. S. B., and Bork, P.. 2021. “GUNC: Detection of Chimerism and Contamination in Prokaryotic Genomes.” Genome Biology 22 (1): 178. doi: 10.1186/s13059-021-02393-0.

- [MAGFlow/BIgMAG](https://doi.org/10.12688/f1000research.152290.2)

> Yepes-García, J., Falquet, L. (2024). Metagenome quality metrics and taxonomical annotation visualization through the integration of MAGFlow and BIgMAG. F1000Research 13:640. doi.org/10.12688/f1000research.152290.2

- [MaxBin2](https://doi.org/10.1093/bioinformatics/btv638)

> Yu-Wei, W., Simmons, B. A. & Singer, S. W. (2015) MaxBin 2.0: An Automated Binning Algorithm to Recover Genomes from Multiple Metagenomic Datasets. Bioinformatics 32 (4): 605–7. doi: 10.1093/bioinformatics/btv638.
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ Other code contributors include:
- [Nikolaos Vergoulidis](https://github.com/IceGreb)
- [Greg Fedewa](https://github.com/harper357)
- [Vini Salazar](https://github.com/vinisalazar)
- [Jeferyd Yepes](https://github.com/jeffe107)

Long read processing was inspired by [caspargross/HybridAssembly](https://github.com/caspargross/HybridAssembly) written by Caspar Gross [@caspargross](https://github.com/caspargross)

Expand Down
98 changes: 98 additions & 0 deletions bin/bigmag_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python

## Originally written by Jeferyd Yepes and released under the MIT license.
## See git repository (https://github.com/nf-core/mag) for full license text.

import pandas as pd
import re
import argparse
import sys
import warnings

def parse_args(args=None):
    """Parse command-line options for the BIgMAG summary script.

    All three summary files plus the bin QC tool name are expected; the
    output path is mandatory and is opened for writing by argparse itself.
    """
    arg_parser = argparse.ArgumentParser()

    # Input summary files produced by upstream pipeline steps.
    arg_parser.add_argument("-s", "--summary", metavar="FILE", help="Pipeline summary file.")
    arg_parser.add_argument("-g", "--gunc_summary", metavar="FILE", help="GUNC summary file.")
    arg_parser.add_argument("-a", "--alt_summary", metavar="FILE", help="BUSCO or CheckM2 summary file.")

    # Which bin QC tool produced the alternative summary (restricts parsing logic downstream).
    arg_parser.add_argument(
        "-t",
        "--binqc_tool",
        help="Bin QC tool used",
        choices=["busco", "checkm", "checkm2"],
    )

    # Destination for the merged table; argparse opens it in write mode.
    arg_parser.add_argument(
        "-o",
        "--out",
        required=True,
        metavar="FILE",
        type=argparse.FileType("w"),
        help="Output file containing final bigmag summary.",
    )

    return arg_parser.parse_args(args)


def main(args=None):
    """Build the BIgMAG input table by merging three per-bin summaries.

    Reads the pipeline bin summary, the GUNC summary and the BUSCO or
    CheckM(2) summary, joins them on the (extension-stripped) bin name,
    drops redundant identifier columns, derives a per-bin sample name and
    writes the merged table as TSV to ``--out``.
    """
    args = parse_args(args)

    # All three summaries are required for the merge below, so abort early
    # if any one of them is missing. (Bug fix: the original test used `and`,
    # which only aborted when *every* summary was missing and otherwise
    # crashed later in pd.read_csv(None).)
    if not args.summary or not args.gunc_summary or not args.alt_summary:
        sys.exit(
            "No summary specified! "
            "Please specify the pipeline summary, the GUNC summary and BUSCO or CheckM2 summary."
        )

    # Strips FASTA extensions and any trailing suffix (e.g. '.fa.gz').
    fasta_ext = r"\.(fa|fasta)(\..*)?$"

    # Pipeline bin summary: normalise bin names so they match the other tools.
    df_summary = pd.read_csv(args.summary, sep="\t", index_col=0)
    df_summary["bin"] = df_summary["bin"].astype(str).str.replace(fasta_ext, "", regex=True)
    df_summary = df_summary.sort_values(by="bin")

    # GUNC summary: keyed on the 'genome' column.
    df_gunc = pd.read_csv(args.gunc_summary, sep="\t")
    df_gunc["genome"] = df_gunc["genome"].astype(str)
    df_gunc = df_gunc.sort_values(by="genome")

    # BUSCO keys its rows on 'Name'; CheckM/CheckM2 on 'Input_file', which
    # still carries the FASTA extension and must be normalised as well.
    df_alt = pd.read_csv(args.alt_summary, sep="\t")
    if args.binqc_tool == "busco":
        alt_key = "Name"
        df_alt[alt_key] = df_alt[alt_key].astype(str)
    else:  # 'checkm' or 'checkm2' (argparse restricts the choices)
        alt_key = "Input_file"
        df_alt[alt_key] = df_alt[alt_key].astype(str).str.replace(fasta_ext, "", regex=True)
    df_alt = df_alt.sort_values(by=alt_key)

    # Left-join the tool tables onto the pipeline summary, one at a time.
    for df_extra, key in ((df_gunc, "genome"), (df_alt, alt_key)):
        df_summary = pd.merge(df_summary, df_extra, left_on="bin", right_on=key, how="left")

    df_summary.rename(columns={"bin": "Bin"}, inplace=True)

    # Drop join-key/identifier columns that merely duplicate 'Bin'.
    redundant = [c for c in ("Name", "genome", "Input_file", "Assembly", "Bin Id") if c in df_summary.columns]
    df_summary = df_summary.drop(columns=redundant)

    # Derive the sample name from the bin name, which appears to follow
    # '<assembler>-<binner>-<sample>[.suffix]' -- TODO confirm for all binners.
    df_summary["sample"] = None
    for idx, bin_name in df_summary["Bin"].items():
        match = re.search(r"^.*?-.*?-(.*)$", bin_name)
        if match:
            sample = match.group(1)
            # Remove MaxBin-style '.unbinned'/'.noclass' markers and numeric
            # bin suffixes so all bins of one sample collapse to one name.
            sample = re.sub(r"\.(unbinned|noclass)(\..*)?$", "", sample)
            sample = re.sub(r"\.\d+(\.[^.]+)?$", "", sample)
            df_summary.at[idx, "sample"] = sample

    df_summary.to_csv(args.out, sep="\t", index=True)

# Script entry point: exit with main()'s return value (None -> status 0).
if __name__ == "__main__":
    sys.exit(main())
16 changes: 16 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -895,4 +895,20 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
]
}
withName: BIGMAG_SUMMARY {
publishDir = [
[
path: { "${params.outdir}/GenomeBinning/BIgMAG/" },
mode: params.publish_dir_mode,
pattern: '*.tsv',
]
]
}
withName: CONCAT_BIGMAG {
publishDir = [
path: { "${params.outdir}/GenomeBinning/QC" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
]
}
}
13 changes: 13 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,19 @@ In cases where eukaryotic genomes are recovered in binning, [MetaEuk](https://gi

</details>

## Summary file to be used as input for BIgMAG

<details markdown="1">
<summary>Output files</summary>

- `GenomeBinning/BIgMAG/bigmag_summary.tsv`: Summary of bin sequencing depths together with GUNC, QUAST, GTDB-Tk, BUSCO and CheckM or CheckM2 results.

</details>

The output file in this directory is suitable to be used as input for the dashboard [BIgMAG](https://github.com/jeffe107/BIgMAG).

It is generated by a dedicated subworkflow that takes the file `bin_summary.tsv` as input. The subworkflow additionally executes CheckM2 if BUSCO is the selected quality-control tool, or BUSCO if CheckM or CheckM2 was specified by the user as the main tool. By default, the subworkflow will also execute GUNC.

## Ancient DNA

Optional, only running when parameter `-profile ancient_dna` is specified.
Expand Down
4 changes: 4 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -531,3 +531,7 @@ Up until version 4.0.0, this pipeline offered raw read taxonomic profiling using
This feature was removed in version 5.0.0 to strengthen the pipeline's focus on metagenome assembly and binning.

If you require taxonomic profiling of raw reads, we recommend using [nf-core/taxprofiler](https://nf-co.re/taxprofiler/), which is specifically designed for taxonomic profiling of raw reads and supports a wide range of tools for this purpose.

## BIgMAG compatibility

With the parameter `--generate_bigmag_file`, a subworkflow will be triggered to generate a single file that contains the output from all of the bin-quality tools. The file `bigmag_summary.tsv`, located at `GenomeBinning/BIgMAG` in the output directory, can be used directly as input for the application [BIgMAG](https://github.com/jeffe107/BIgMAG) once it has been installed in your local environment. This is the only file needed to run the BIgMAG dashboard.
42 changes: 42 additions & 0 deletions modules/local/bigmag_summary/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Merge the pipeline bin summary, GUNC summary and BUSCO/CheckM2 summary
// into a single TSV suitable as input for the BIgMAG dashboard.
process BIGMAG_SUMMARY {

    conda "conda-forge::pandas=1.4.3"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3'
        : 'biocontainers/pandas:1.4.3'}"

    input:
    path summary       // pipeline bin summary (bin_summary.tsv)
    path gunc_sum      // GUNC summary file
    path alt_sum       // BUSCO or CheckM2 summary file
    val binqc_tool     // one of: busco / checkm / checkm2

    output:
    path "bigmag_summary.tsv", emit: bigmag_summary
    path "versions.yml"      , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Bug fix: this declaration was duplicated, which is a Groovy
    // "variable already declared" compile error.
    def args = task.ext.args ?: ''
    // Only pass each flag when the corresponding input channel was non-empty.
    def summary      = summary.sort().size() > 0 ? "--summary ${summary}" : ""
    def gunc_summary = gunc_sum.sort().size() > 0 ? "--gunc_summary ${gunc_sum}" : ""
    def alt_summary  = alt_sum.sort().size() > 0 ? "--alt_summary ${alt_sum}" : ""
    """
    bigmag_summary.py \
        ${args} \
        ${summary} \
        ${gunc_summary} \
        ${alt_summary} \
        --binqc_tool ${binqc_tool} \
        --out bigmag_summary.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version 2>&1 | sed 's/Python //g')
        pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)")
    END_VERSIONS
    """
}
55 changes: 55 additions & 0 deletions modules/local/concat_bigmag/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Concatenate per-bin QC tables with csvtk. Local alias of csvtk/concat so
// the BIgMAG subworkflow's output does not clash with the main BIN_QC output.
process CONCAT_BIGMAG {
    tag "$meta.id"
    label 'process_low'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/csvtk:0.31.0--h9ee0642_0' :
        'biocontainers/csvtk:0.31.0--h9ee0642_0' }"

    input:
    tuple val(meta), path(csv, name: 'inputs/csv*/*')
    val in_format    // input delimiter format, e.g. 'tsv' or 'csv'
    val out_format   // output delimiter format, e.g. 'tsv' or 'csv'

    output:
    tuple val(meta), path("${prefix}.${out_extension}"), emit: csv
    path "versions.yml"                                , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // This process summarises the *alternative* QC tool (the one not already
    // run by the main bin QC step), hence the swapped prefix.
    prefix = params.binqc_tool == 'busco' ? 'checkm2_summary' : 'busco_summary'
    // Map the format names onto literal delimiter characters; anything other
    // than tsv/csv is passed through to csvtk unchanged.
    def delimiter     = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format)
    def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format)
    out_extension = out_format == "tsv" ? 'tsv' : 'csv'
    """
    csvtk \\
        concat \\
        $args \\
        --num-cpus $task.cpus \\
        --delimiter "${delimiter}" \\
        --out-delimiter "${out_delimiter}" \\
        --out-file ${prefix}.${out_extension} \\
        $csv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" ))
    END_VERSIONS
    """

    stub:
    prefix = params.binqc_tool == 'busco' ? 'checkm2_summary' : 'busco_summary'
    out_extension = out_format == "tsv" ? 'tsv' : 'csv'
    """
    touch ${prefix}.${out_extension}
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" ))
    END_VERSIONS
    """
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ params {
gunc_database_type = 'progenomes'
gunc_db = null
gunc_save_db = false
generate_bigmag_file = false

// Reproducibility options
megahit_fix_cpu_1 = false
Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -966,6 +966,11 @@
"type": "boolean",
"description": "Save the used GUNC reference files downloaded when not using --gunc_db parameter.",
"help_text": "If specified, the corresponding DIAMOND file downloaded from the GUNC server will be stored in your output directory alongside your GUNC results."
},
"generate_bigmag_file": {
"type": "boolean",
"description": "Make the file bin_summary.tsv suitable for the application BIgMAG.",
"help_text": "If specified, BUSCO, CheckM2 and GUNC will be executed simultaneously."
}
}
},
Expand Down
Loading
Loading