-
Notifications
You must be signed in to change notification settings - Fork 141
BIgMAG compatibility #861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
BIgMAG compatibility #861
Changes from all commits
105a51b
5e430e3
56a200d
0168b46
5abd21b
926d01c
f5b4502
83ae6a0
7ef95af
99e5bf8
38e48b2
22517cd
9ca66ae
568abc5
d34d2ad
ed196db
0f3476d
bb864fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#!/usr/bin/env python | ||
|
||
## Originally written by Jeferyd Yepes and released under the MIT license. | ||
## See git repository (https://github.com/nf-core/mag) for full license text. | ||
|
||
import pandas as pd | ||
import re | ||
import argparse | ||
import sys | ||
import warnings | ||
|
||
def parse_args(args=None):
    """Parse the command-line options of the BIgMAG summary script.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            reads the options from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``summary``,
        ``gunc_summary``, ``alt_summary``, ``binqc_tool`` and ``out``.
        ``out`` is an already opened, writable file object.
    """
    cli = argparse.ArgumentParser()

    # The three input tables are declared the same way, so describe them
    # once as data and register them in a loop (order defines --help order).
    input_tables = (
        ("-s", "--summary", "Pipeline summary file."),
        ("-g", "--gunc_summary", "GUNC summary file."),
        ("-a", "--alt_summary", "BUSCO or CheckM2 summary file."),
    )
    for short_flag, long_flag, description in input_tables:
        cli.add_argument(short_flag, long_flag, metavar="FILE", help=description)

    cli.add_argument(
        "-t",
        "--binqc_tool",
        choices=["busco", "checkm", "checkm2"],
        help="Bin QC tool used",
    )

    cli.add_argument(
        "-o",
        "--out",
        metavar="FILE",
        type=argparse.FileType("w"),
        required=True,
        help="Output file containing final bigmag summary.",
    )
    return cli.parse_args(args)
|
||
|
||
# Patterns used to normalise bin names; compiled once instead of per row.
_FASTA_SUFFIX_RE = re.compile(r'\.(fa|fasta)(\..*)?$')
_SAMPLE_RE = re.compile(r'^.*?-.*?-(.*)$')
_UNBINNED_SUFFIX_RE = re.compile(r'\.(unbinned|noclass)(\..*)?$')
_BIN_NUMBER_SUFFIX_RE = re.compile(r'\.\d+(\.[^.]+)?$')


def _strip_fasta_suffix(name):
    """Return *name* without a trailing ``.fa``/``.fasta`` and anything after it."""
    return _FASTA_SUFFIX_RE.sub('', str(name))


def _sample_from_bin(bin_name):
    """Return the sample part of a bin name, or ``None`` if the name has no two dashes."""
    match = _SAMPLE_RE.search(bin_name)
    if match is None:
        return None
    # Everything after the second dash, minus the unbinned/noclass marker
    # and the trailing numeric bin identifier.
    sample = _UNBINNED_SUFFIX_RE.sub('', match.group(1))
    return _BIN_NUMBER_SUFFIX_RE.sub('', sample)


def main(args=None):
    """Merge the pipeline, GUNC and bin-QC summaries into one BIgMAG table.

    The pipeline summary is left-joined with the GUNC summary (on
    ``genome``) and with the BUSCO/CheckM/CheckM2 summary (on ``Name`` or
    ``Input_file``), using the bin name with its FASTA suffix removed as
    the key. The join-key columns are dropped afterwards, ``bin`` is
    renamed to ``Bin``, a ``sample`` column is derived from the bin name
    and the result is written as a tab-separated file to ``--out``.

    Args:
        args: Optional list of command-line strings; ``None`` means
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If any of the three summary files or the bin QC tool
            is not given on the command line.
    """
    args = parse_args(args)

    # All three tables are read unconditionally below, so every one of
    # them is mandatory (not just "at least one").
    if not (args.summary and args.gunc_summary and args.alt_summary):
        sys.exit(
            "No summary specified! "
            "Please specify the pipeline summary, the GUNC summary and BUSCO or CheckM2 summary."
        )
    # Without the tool the join column of the third table is unknown.
    if args.binqc_tool is None:
        sys.exit("No bin QC tool specified! Please set --binqc_tool to busco, checkm or checkm2.")

    df_summary = pd.read_csv(args.summary, sep='\t', index_col=0)
    # Element-wise map works for any index, unlike positional label lookups.
    df_summary["bin"] = df_summary["bin"].map(_strip_fasta_suffix).astype(str)
    df_summary = df_summary.sort_values(by='bin')

    df_gunc = pd.read_csv(args.gunc_summary, sep='\t')
    df_gunc["genome"] = df_gunc["genome"].astype(str)
    df_gunc = df_gunc.sort_values(by='genome')

    df_alt = pd.read_csv(args.alt_summary, sep='\t')
    if args.binqc_tool == "busco":
        alt_key = "Name"
        df_alt[alt_key] = df_alt[alt_key].astype(str)
    else:
        # "checkm" or "checkm2": argparse only admits these three choices.
        alt_key = "Input_file"
        df_alt[alt_key] = df_alt[alt_key].map(_strip_fasta_suffix).astype(str)
    df_alt = df_alt.sort_values(by=alt_key)

    # Left joins keep every bin of the pipeline summary, matched or not.
    for right_table, right_key in ((df_gunc, "genome"), (df_alt, alt_key)):
        df_summary = pd.merge(df_summary, right_table, left_on='bin', right_on=right_key, how='left')

    df_summary = df_summary.rename(columns={'bin': 'Bin'})

    # Remove the join keys and redundant identifier columns, if present.
    columns_to_remove = ['Name', "genome", 'Input_file', 'Assembly', 'Bin Id']
    present = [column for column in columns_to_remove if column in df_summary.columns]
    df_summary = df_summary.drop(columns=present)

    # Bins whose name does not match the expected pattern keep an empty sample.
    df_summary['sample'] = pd.Series(
        [_sample_from_bin(bin_name) for bin_name in df_summary["Bin"]],
        index=df_summary.index,
        dtype=object,
    )

    try:
        df_summary.to_csv(args.out, sep="\t", index=True)
    finally:
        # argparse opened the handle; close it so the data is flushed,
        # but never close the interpreter's stdout ("--out -").
        if args.out is not sys.stdout:
            args.out.close()
|
||
if __name__ == "__main__":
    # Script entry point: run the CLI and use main()'s return value as the
    # process exit status.
    raise SystemExit(main())
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// Runs bigmag_summary.py to combine the pipeline bin summary, the GUNC
// summary and the BUSCO/CheckM/CheckM2 summary into bigmag_summary.tsv.
process BIGMAG_SUMMARY {

    conda "conda-forge::pandas=1.4.3"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3'
        : 'biocontainers/pandas:1.4.3'}"

    input:
    path summary
    path gunc_sum
    path alt_sum
    val binqc_tool

    output:
    path "bigmag_summary.tsv", emit: bigmag_summary
    path "versions.yml"      , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // `args` is declared exactly once; the option strings are named *_arg
    // so that they do not shadow the process inputs of the same name.
    def args = task.ext.args ?: ''
    // Each option is only passed when the corresponding input is non-empty.
    def summary_arg = summary.sort().size() > 0 ? "--summary ${summary}" : ""
    def gunc_summary_arg = gunc_sum.sort().size() > 0 ? "--gunc_summary ${gunc_sum}" : ""
    def alt_summary_arg = alt_sum.sort().size() > 0 ? "--alt_summary ${alt_sum}" : ""
    """
    bigmag_summary.py \
        ${args} \
        ${summary_arg} \
        ${gunc_summary_arg} \
        ${alt_summary_arg} \
        --binqc_tool ${binqc_tool} \
        --out bigmag_summary.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version 2>&1 | sed 's/Python //g')
        pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)")
    END_VERSIONS
    """
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
// CONCAT_BIGMAG: concatenates the staged input tables with `csvtk concat`
// and writes a single "${prefix}.${out_extension}" file plus versions.yml.
process CONCAT_BIGMAG { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes and no. It does exactly the same, the difference relies on that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where would it re-write the file? You mean in Unless I misunderstand? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, you are right, it would overwrite the file in |
||
tag "$meta.id" | ||
label 'process_low' | ||
|
||
conda "${moduleDir}/environment.yml" | ||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
'https://depot.galaxyproject.org/singularity/csvtk:0.31.0--h9ee0642_0' : | ||
'biocontainers/csvtk:0.31.0--h9ee0642_0' }" | ||
|
||
// Inputs: a meta map with the files to concatenate (staged under
// inputs/csv*/), followed by the input and output format names.
input: | ||
tuple val(meta), path(csv, name: 'inputs/csv*/*') | ||
val in_format | ||
val out_format | ||
|
||
// Outputs: the concatenated table (emitted with its meta map) and versions.yml.
output: | ||
tuple val(meta), path("${prefix}.${out_extension}"), emit: csv | ||
path "versions.yml" , emit: versions | ||
|
||
when: | ||
task.ext.when == null || task.ext.when | ||
|
||
script: | ||
def args = task.ext.args ?: '' | ||
// Output base name: 'checkm2_summary' when params.binqc_tool is 'busco',
// otherwise 'busco_summary'.
// NOTE(review): presumably named after the complementary QC tool — confirm.
prefix = params.binqc_tool == 'busco' ? 'checkm2_summary' : 'busco_summary' | ||
// "tsv" maps to a tab and "csv" to a comma; any other value is used
// verbatim as the delimiter.
def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) | ||
def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) | ||
// Only "tsv" yields a .tsv extension; every other out_format yields .csv.
out_extension = out_format == "tsv" ? 'tsv' : 'csv' | ||
""" | ||
csvtk \\ | ||
concat \\ | ||
$args \\ | ||
--num-cpus $task.cpus \\ | ||
--delimiter "${delimiter}" \\ | ||
--out-delimiter "${out_delimiter}" \\ | ||
--out-file ${prefix}.${out_extension} \\ | ||
$csv | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) | ||
END_VERSIONS | ||
""" | ||
|
||
// Stub run: creates an empty output file with the same name as the real
// run and still records the csvtk version.
stub: | ||
prefix = params.binqc_tool == 'busco' ? 'checkm2_summary' : 'busco_summary' | ||
out_extension = out_format == "tsv" ? 'tsv' : 'csv' | ||
""" | ||
touch ${prefix}.${out_extension} | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) | ||
END_VERSIONS | ||
""" | ||
} |
Uh oh!
There was an error while loading. Please reload this page.