From 611c6c3bd888c0f3787a88b2ca6117ab32109e7f Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 17 Aug 2023 14:05:54 -0400 Subject: [PATCH 01/62] Added CoverageStats.wdl --- .dockstore.yml | 3 + .../TechAgnostic/Utility/CoverageStats.wdl | 196 ++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl diff --git a/.dockstore.yml b/.dockstore.yml index ff958fe50..8193435fa 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -96,3 +96,6 @@ workflows: - name: CleanupIntermediate subclass: wdl primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CleanupIntermediate.wdl +- name: CoverageStats + subclass: wdl + primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl new file mode 100644 index 000000000..39b65309b --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -0,0 +1,196 @@ +version 1.0 + +workflow MosdepthCoverageStats { + + meta { + description: "Calculate coverage statistics using mosdepth." + } + parameter_meta { + aligned_bam: "Aligned BAM file." + aligned_bai: "Aligned BAM index file." + bed_file: "BED file containing regions of interest." + bin_length: "Length of bins to use for coverage calculation." + preemptible_tries: "Number of times to retry a preempted task." + } + + input { + File aligned_bam + File aligned_bai + File? bed_file + + Int bin_length = 1000 + + # Runtime parameters + Int? preemptible_tries = 3 + } + + call MosDepthOverBed { + input: + bam = aligned_bam, + bai = aligned_bai, + bed = bed_file, + bin_length = bin_length, + preemptible_tries = preemptible_tries, + } + + String basename = basename(aligned_bam, ".bam") + + call CoverageStats { + input: + mosdepth_regions = MosDepthOverBed.regions, + basename_input = basename, + preemptible_tries = preemptible_tries, + } + + output { + File cov_stat_summary_file = CoverageStats.cov_stat_summary_file + Map[String, String] cov_stat_summary = CoverageStats.cov_stat_summary + } + +} + + +task MosDepthOverBed { + + meta { + description: "Calculate coverage using mosdepth." + } + parameter_meta { + bam: "Aligned BAM file." + bai: "Aligned BAM index file." + bed: "BED file containing regions of interest." + bin_length: "Length of bins to use for coverage calculation." + preemptible_tries: "Number of times to retry a preempted task." + } + + input { + File bam + File bai + File? bed + Int bin_length + + # Runtime parameters + Int? preemptible_tries = 3 + } + + String mosdepth_by_region = select_first([bed, bin_length]) + + Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) + String basename = basename(bam, ".bam") + String prefix = "~{basename}.coverage_over_bed" + + command { + set -euxo pipefail + + # Create symbolic links for bam and bai in the current working directory + ln -s ~{bam} ./~{basename}.bam + ln -s ~{bai} ./~{basename}.bai + + mosdepth -t 4 -b ~{mosdepth_by_region} -n -x -Q 1 ~{prefix} ./~{basename}.bam + } + + output { + File global_dist = "~{prefix}.mosdepth.global.dist.txt" + File region_dist = "~{prefix}.mosdepth.region.dist.txt" + File regions = "~{prefix}.regions.bed.gz" + File regions_csi = "~{prefix}.regions.bed.gz.csi" + } + + runtime { + cpu: 4 + memory: 8 + " GiB" + disks: "local-disk " + 100 + " HDD" + preemptible: preemptible_tries + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest" + } +} + +task CoverageStats { + + meta { + description: "Calculate coverage statistics from mosdepth output." + } + parameter_meta { + mosdepth_regions: "Mosdepth output file containing coverage values." + cov_col: "Column holding the coverage values." + basename_input: "Basename to use for output files." + preemptible_tries: "Number of times to retry a preempted task." + } + + input { + File mosdepth_regions + Int cov_col = 4 # column holding the coverage values + String? basename_input + + # Runtime parameters + Int? preemptible_tries = 3 + } + + Int disk_size = 2*ceil(size(mosdepth_regions, "GB")) + String basename = select_first([basename_input, basename(mosdepth_regions)]) + String prefix = "~{basename}.coverage_over_bed" + + command <<< + set -euxo pipefail + + # Use Datamash to calculate summary statistics + zcat ~{mosdepth_regions} | datamash -H \ + mean ~{cov_col} \ + q1 ~{cov_col} \ + median ~{cov_col} \ + q3 ~{cov_col} \ + iqr ~{cov_col} \ + sstdev ~{cov_col} \ + mad ~{cov_col} > ~{prefix}.cov_stat_summary.txt + + cat ~{prefix}.cov_stat_summary.txt + + # Calculate covrage percentage with greater than 4x coverage + total_bases=$(zcat ~{mosdepth_regions} | wc -l) + bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l) + percent_above_4x=$(python3 -c "print($bases_above_4x/$total_bases)") + + # Append the percentage to the summary tsv file + sed -i "1s/$/\tpercent_above_4x/" ~{prefix}.cov_stat_summary.txt + sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt + + # Remove floating-point numbers suffix from field names + sed -i '1s/([0-9]*\.[0-9]*)//g' ~{prefix}.cov_stat_summary.txt + + # Extract field names from the header + header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) + IFS=$'\t' read -ra field_names <<< "$header" + + # Extract data values + data=$(tail -n 1 ~{prefix}.cov_stat_summary.txt) + data_values=($(echo "$data" | awk '{ for(i=1; i<=NF; i++) print $i }')) + + # Create the JSON object + json="{" + for (( i=0; i<${#field_names[@]}; i++ )); do + json+="\"${field_names[i]}\":${data_values[i]}" + if [[ $i -lt $(( ${#field_names[@]} - 1 )) ]]; then + json+=", " + fi + done + json+="}" + + # Write the JSON object to the output file + echo "$json" > ~{prefix}.cov_stat_summary.json + >>> + + output { + File cov_stat_summary_file = "~{prefix}.cov_stat_summary.txt" + Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") + } + + runtime { + cpu: 2 + memory: 4 + " GiB" + disks: "local-disk " + 100 + " HDD" + preemptible: preemptible_tries + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } +} From b825dc92b211858f7eecc577f1b775076b5c4025 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 17 Aug 2023 14:25:00 -0400 Subject: [PATCH 02/62] added "_coverage" to headers --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 39b65309b..275a816ee 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -146,18 +146,18 @@ task CoverageStats { cat ~{prefix}.cov_stat_summary.txt + # Replace floating-point numbers suffix ing field names with '_coverage + sed -i '1s/([0-9]*\.[0-9]*)/_coverage/g' ~{prefix}.cov_stat_summary.txt + # Calculate covrage percentage with greater than 4x coverage total_bases=$(zcat ~{mosdepth_regions} | wc -l) bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l) percent_above_4x=$(python3 -c "print($bases_above_4x/$total_bases)") # Append the percentage to the summary tsv file - sed -i "1s/$/\tpercent_above_4x/" ~{prefix}.cov_stat_summary.txt + sed -i "1s/$/\tpercent_above_4x_coverage/" ~{prefix}.cov_stat_summary.txt sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt - # Remove floating-point numbers suffix from field names - sed -i '1s/([0-9]*\.[0-9]*)//g' ~{prefix}.cov_stat_summary.txt - # Extract field names from the header header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) IFS=$'\t' read -ra field_names <<< "$header" From fa49a9da5a5609b6cd43717ea3d909efe6fecb4b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 18 Aug 2023 14:58:36 -0400 Subject: [PATCH 03/62] Rounding to the nearest hundreth and centralized header suffix --- .../TechAgnostic/Utility/CoverageStats.wdl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 275a816ee..f87c5362d 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -126,6 +126,8 @@ task CoverageStats { # Runtime parameters Int? preemptible_tries = 3 } + Int round = 2 + String header_suffix = "_coverage" Int disk_size = 2*ceil(size(mosdepth_regions, "GB")) String basename = select_first([basename_input, basename(mosdepth_regions)]) @@ -135,7 +137,7 @@ task CoverageStats { set -euxo pipefail # Use Datamash to calculate summary statistics - zcat ~{mosdepth_regions} | datamash -H \ + zcat ~{mosdepth_regions} | datamash -H -R ~{round} \ mean ~{cov_col} \ q1 ~{cov_col} \ median ~{cov_col} \ @@ -147,17 +149,20 @@ task CoverageStats { cat ~{prefix}.cov_stat_summary.txt # Replace floating-point numbers suffix ing field names with '_coverage - sed -i '1s/([0-9]*\.[0-9]*)/_coverage/g' ~{prefix}.cov_stat_summary.txt + sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt # Calculate covrage percentage with greater than 4x coverage total_bases=$(zcat ~{mosdepth_regions} | wc -l) bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l) - percent_above_4x=$(python3 -c "print($bases_above_4x/$total_bases)") + percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))") # Append the percentage to the summary tsv file - sed -i "1s/$/\tpercent_above_4x_coverage/" ~{prefix}.cov_stat_summary.txt + sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt + # Round Floats seperated by tab in the second line to the nearest hundredth + sed -i '2s/[0-9]*\.[0-9]*/&\n/g' ~{prefix}.cov_stat_summary.txt + # Extract field names from the header header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) IFS=$'\t' read -ra field_names <<< "$header" From dffa95ef30680d8c6234278121318349503ca1f8 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 18 Aug 2023 15:35:58 -0400 Subject: [PATCH 04/62] removed old rounding line --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 3 --- 1 file changed, 3 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index f87c5362d..6643adf64 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -160,9 +160,6 @@ task CoverageStats { sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt - # Round Floats seperated by tab in the second line to the nearest hundredth - sed -i '2s/[0-9]*\.[0-9]*/&\n/g' ~{prefix}.cov_stat_summary.txt - # Extract field names from the header header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) IFS=$'\t' read -ra field_names <<< "$header" From be08fcc96dda9ce03c8d45caac5dec4677f8deaf Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 2 May 2024 11:03:34 -0400 Subject: [PATCH 05/62] More parameters to mosdepthoverbed task --- .../TechAgnostic/Utility/CoverageStats.wdl | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 6643adf64..766c7ba59 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -21,7 +21,7 @@ workflow MosdepthCoverageStats { Int bin_length = 1000 # Runtime parameters - Int? preemptible_tries = 3 + Int preemptible_tries = 3 } call MosDepthOverBed { @@ -44,7 +44,7 @@ workflow MosdepthCoverageStats { output { File cov_stat_summary_file = CoverageStats.cov_stat_summary_file - Map[String, String] cov_stat_summary = CoverageStats.cov_stat_summary + Map[String, Float] cov_stat_summary = CoverageStats.cov_stat_summary } } @@ -59,7 +59,11 @@ task MosDepthOverBed { bam: "Aligned BAM file." bai: "Aligned BAM index file." bed: "BED file containing regions of interest." - bin_length: "Length of bins to use for coverage calculation." + bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored." + chrom: "Chromosome to calculate coverage for." + no_per_base: "Do not calculate per-base coverage." + fast_mode: "Use fast mode." + mapq: "Minimum mapping quality to consider." preemptible_tries: "Number of times to retry a preempted task." } @@ -67,10 +71,14 @@ task MosDepthOverBed { File bam File bai File? bed - Int bin_length + String? chrom + Int? bin_length + Boolean no_per_base = true + Boolean fast_mode = true + Int mapq = 1 # Runtime parameters - Int? preemptible_tries = 3 + Int preemptible_tries = 3 } String mosdepth_by_region = select_first([bed, bin_length]) @@ -86,7 +94,14 @@ task MosDepthOverBed { ln -s ~{bam} ./~{basename}.bam ln -s ~{bai} ./~{basename}.bai - mosdepth -t 4 -b ~{mosdepth_by_region} -n -x -Q 1 ~{prefix} ./~{basename}.bam + mosdepth \ + ~{true="-n" false="" no_per_base} \ + ~{true="-x" false="" fast_mode} \ + -t 4 \ + ~{"-c " + chrom} \ + ~{"-b " + mosdepth_by_region} \ + ~{"-Q " + mapq} \ + ~{prefix} ./~{basename}.bam } output { @@ -99,7 +114,7 @@ task MosDepthOverBed { runtime { cpu: 4 memory: 8 + " GiB" - disks: "local-disk " + 100 + " HDD" + disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries maxRetries: 1 docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest" @@ -124,7 +139,7 @@ task CoverageStats { String? basename_input # Runtime parameters - Int? preemptible_tries = 3 + Int preemptible_tries = 3 } Int round = 2 String header_suffix = "_coverage" @@ -190,7 +205,7 @@ task CoverageStats { runtime { cpu: 2 memory: 4 + " GiB" - disks: "local-disk " + 100 + " HDD" + disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries maxRetries: 1 docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" From 9598e346b83e2ba30937507936d2434f86ba34e9 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 9 May 2024 17:02:56 -0400 Subject: [PATCH 06/62] More parameters to mosdepthoverbed task, Added evenness score to CoverageStats --- .../TechAgnostic/Utility/CoverageStats.wdl | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 766c7ba59..9f937ee11 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -64,6 +64,7 @@ task MosDepthOverBed { no_per_base: "Do not calculate per-base coverage." fast_mode: "Use fast mode." mapq: "Minimum mapping quality to consider." + thresholds: "Comma-separated list of thresholds to use for coverage calculation. (e.g. 1,10,20,30)." preemptible_tries: "Number of times to retry a preempted task." } @@ -73,6 +74,7 @@ task MosDepthOverBed { File? bed String? chrom Int? bin_length + String? thresholds Boolean no_per_base = true Boolean fast_mode = true Int mapq = 1 @@ -101,6 +103,7 @@ task MosDepthOverBed { ~{"-c " + chrom} \ ~{"-b " + mosdepth_by_region} \ ~{"-Q " + mapq} \ + ~{"-T " + thresholds} \ ~{prefix} ./~{basename}.bam } @@ -166,7 +169,9 @@ task CoverageStats { # Replace floating-point numbers suffix ing field names with '_coverage sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt + ### # Calculate covrage percentage with greater than 4x coverage + ### total_bases=$(zcat ~{mosdepth_regions} | wc -l) bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l) percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))") @@ -175,6 +180,38 @@ task CoverageStats { sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt + ### + # Calculate Evenness Score + # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS. + # https://www.nature.com/articles/jhg201621 + ### + + # get mean from the summary file, under the column 'mean_coverage' + mean_coverage=$(awk -F"\t" '{for(i=1; i<=NF; i++) if ($i == "mean_coverage") {getline; getline; printf "%.0f", $(i); exit}}' ~{prefix}.cov_stat_summary.txt) + coverage_count=$(zcat ~{mosdepth_regions} | wc -l) + + # get list of coverages that are less than the mean coverage from the mosdepth output file + D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}')) + D2_count=${#D2[@]} + D2_sum=$(IFS=+; echo "$((${D2[*]}))") + + # print the values for debugging + echo "mean_coverage: $mean_coverage" + echo "coverage_count: $coverage_count" + echo "D2_count: $D2_count" + echo "D2_sum: $D2_sum" + + # calculate the evenness score + evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count) + + # Append the evenness score to the summary tsv file + sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt + sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt + + ### + # Convert the summary statistics to a JSON object + ### + # Extract field names from the header header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) IFS=$'\t' read -ra field_names <<< "$header" From 24e0a3f64ae1d3cf3de5332fc23d61ca7b7e1fc2 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 10 May 2024 09:35:51 -0400 Subject: [PATCH 07/62] Added subheaders in CoverageStats task command, use bc to get D2_sum, if statement to avoid evenness score if mean is 0 --- .../TechAgnostic/Utility/CoverageStats.wdl | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 9f937ee11..9b7e3942d 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -169,9 +169,12 @@ task CoverageStats { # Replace floating-point numbers suffix ing field names with '_coverage sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt - ### + ############################################################ # Calculate covrage percentage with greater than 4x coverage - ### + ############################################################ + + echo "Calculating coverage percentage with greater than 4x coverage" + total_bases=$(zcat ~{mosdepth_regions} | wc -l) bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l) percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))") @@ -180,11 +183,13 @@ task CoverageStats { sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt - ### + ############################################################ # Calculate Evenness Score # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS. # https://www.nature.com/articles/jhg201621 - ### + ############################################################ + + echo "Calculating Evenness Score" # get mean from the summary file, under the column 'mean_coverage' mean_coverage=$(awk -F"\t" '{for(i=1; i<=NF; i++) if ($i == "mean_coverage") {getline; getline; printf "%.0f", $(i); exit}}' ~{prefix}.cov_stat_summary.txt) @@ -193,7 +198,7 @@ task CoverageStats { # get list of coverages that are less than the mean coverage from the mosdepth output file D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}')) D2_count=${#D2[@]} - D2_sum=$(IFS=+; echo "$((${D2[*]}))") + D2_sum=$(echo "${D2[@]}" | tr ' ' '+' | bc) # print the values for debugging echo "mean_coverage: $mean_coverage" @@ -201,16 +206,22 @@ task CoverageStats { echo "D2_count: $D2_count" echo "D2_sum: $D2_sum" - # calculate the evenness score - evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count) + if [ $mean_coverage -ne 0 ]; then + # calculate the evenness score + evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count) - # Append the evenness score to the summary tsv file - sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt - sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt + # Append the evenness score to the summary tsv file + sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt + sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt + else + echo "mean_coverage is 0, cannot calculate evenness score." + fi - ### + ############################################################ # Convert the summary statistics to a JSON object - ### + ############################################################ + + echo "Converting summary statistics to JSON object" # Extract field names from the header header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) From d33b8cce20dbb23a171fd2ee13d0f6c9c8ae0a39 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 10 May 2024 10:14:48 -0400 Subject: [PATCH 08/62] use awk to get D2_sum --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 9b7e3942d..c62bcf64b 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -198,7 +198,7 @@ task CoverageStats { # get list of coverages that are less than the mean coverage from the mosdepth output file D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}')) D2_count=${#D2[@]} - D2_sum=$(echo "${D2[@]}" | tr ' ' '+' | bc) + D2_sum=$(echo "$D2" | tr ' ' '\n' | awk '{sum+=$1};END{print sum}') # print the values for debugging echo "mean_coverage: $mean_coverage" From f5663824f249209a400fbf428b9829ce1094efa9 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 10 May 2024 11:00:55 -0400 Subject: [PATCH 09/62] updated awk command to get D2_sum --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index c62bcf64b..514d79cf5 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -198,7 +198,7 @@ task CoverageStats { # get list of coverages that are less than the mean coverage from the mosdepth output file D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}')) D2_count=${#D2[@]} - D2_sum=$(echo "$D2" | tr ' ' '\n' | awk '{sum+=$1};END{print sum}') + D2_sum=$(printf "%s\n" "${D2[@]}" | awk 'BEGIN {sum = 0} {sum += $1} END {print sum}') # print the values for debugging echo "mean_coverage: $mean_coverage" @@ -208,7 +208,7 @@ task CoverageStats { if [ $mean_coverage -ne 0 ]; then # calculate the evenness score - evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count) + evenness_score=$((1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count)) # Append the evenness score to the summary tsv file sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt From 9e42b15c0e370446f2977b66243e958255382a3a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 10 May 2024 11:33:51 -0400 Subject: [PATCH 10/62] updated awk command to get D2_sum with $ --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 514d79cf5..70e2867ae 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -208,7 +208,7 @@ task CoverageStats { if [ $mean_coverage -ne 0 ]; then # calculate the evenness score - evenness_score=$((1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count)) + evenness_score=$(( 1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count )) # Append the evenness score to the summary tsv file sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt From 8d47439645037a7d2fd234f52e1c5bcdc3929662 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 10 May 2024 12:21:41 -0400 Subject: [PATCH 11/62] using python for evenness calculation because awk bash doesn't support float arithmetic --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 70e2867ae..2c46f1a4b 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -208,7 +208,7 @@ task CoverageStats { if [ $mean_coverage -ne 0 ]; then # calculate the evenness score - evenness_score=$(( 1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count )) + evenness_score=$(python3 -c "print(1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count)") # Append the evenness score to the summary tsv file sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt From a6df7de712e326859e60b3fb779dc06079dfb77e Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 16 May 2024 09:03:26 -0400 Subject: [PATCH 12/62] fixed e score formula --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 2c46f1a4b..a72550659 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -196,7 +196,7 @@ task CoverageStats { coverage_count=$(zcat ~{mosdepth_regions} | wc -l) # get list of coverages that are less than the mean coverage from the mosdepth output file - D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}')) + D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} <= mean) print $~{cov_col}}')) D2_count=${#D2[@]} D2_sum=$(printf "%s\n" "${D2[@]}" | awk 'BEGIN {sum = 0} {sum += $1} END {print sum}') @@ -208,7 +208,7 @@ task CoverageStats { if [ $mean_coverage -ne 0 ]; then # calculate the evenness score - evenness_score=$(python3 -c "print(1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count)") + evenness_score=$(python3 -c "print(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count)") # Append the evenness score to the summary tsv file sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt From 93ba31a13825076b6ed109c63211a9837e2e9502 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 16 May 2024 10:27:55 -0400 Subject: [PATCH 13/62] round e-score to hundredth --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index a72550659..3952fadd4 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -208,7 +208,7 @@ task CoverageStats { if [ $mean_coverage -ne 0 ]; then # calculate the evenness score - evenness_score=$(python3 -c "print(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count)") + evenness_score=$(python3 -c "print(round(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count, 2))") # Append the evenness score to the summary tsv file sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt From 310f66fdb3ec98106b61c729eaec386e93673c50 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 23 May 2024 10:03:29 -0400 Subject: [PATCH 14/62] removed `mosdepth_by_region` intermediate variable --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 3952fadd4..06718392f 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -83,8 +83,6 @@ task MosDepthOverBed { Int preemptible_tries = 3 } - String mosdepth_by_region = select_first([bed, bin_length]) - Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" @@ -101,7 +99,7 @@ task MosDepthOverBed { ~{true="-x" false="" fast_mode} \ -t 4 \ ~{"-c " + chrom} \ - ~{"-b " + mosdepth_by_region} \ + ~{"-b " + select_first([bed, bin_length])} \ ~{"-Q " + mapq} \ ~{"-T " + thresholds} \ ~{prefix} ./~{basename}.bam From 8d6fba6c47363993e58bc66163726b9bf32eb0fb Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 23 May 2024 11:07:00 -0400 Subject: [PATCH 15/62] threads option added --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 06718392f..9dc91c6cb 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -72,6 +72,7 @@ task MosDepthOverBed { File bam File bai File? bed + Int? threads = 4 String? chrom Int? bin_length String? thresholds @@ -97,7 +98,7 @@ task MosDepthOverBed { mosdepth \ ~{true="-n" false="" no_per_base} \ ~{true="-x" false="" fast_mode} \ - -t 4 \ + -t ~{threads} \ ~{"-c " + chrom} \ ~{"-b " + select_first([bed, bin_length])} \ ~{"-Q " + mapq} \ From 5cf9f47c192dd164b660b9ad75d708eea1d0de30 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 23 May 2024 11:08:40 -0400 Subject: [PATCH 16/62] minor fix --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 9dc91c6cb..a0b9e311b 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -72,7 +72,7 @@ task MosDepthOverBed { File bam File bai File? bed - Int? threads = 4 + Int threads = 4 String? chrom Int? bin_length String? thresholds From f3f6dd6e9d87031bbc576b53a5958045b3d737d0 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 23 Jul 2024 15:34:35 -0400 Subject: [PATCH 17/62] if mean zero, evenness score is none --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index a0b9e311b..9b4c4b110 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -182,6 +182,8 @@ task CoverageStats { sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt + cat ~{prefix}.cov_stat_summary.txt + ############################################################ # Calculate Evenness Score # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS. @@ -214,8 +216,13 @@ task CoverageStats { sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt else echo "mean_coverage is 0, cannot calculate evenness score." + # Set evenness score to None + sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt + sed -i "2s/$/\tNone/" ~{prefix}.cov_stat_summary.txt fi + cat ~{prefix}.cov_stat_summary.txt + ############################################################ # Convert the summary statistics to a JSON object ############################################################ From fe5254374123ba7110104a6beae3ae363b8ed65f Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 23 Jul 2024 16:51:14 -0400 Subject: [PATCH 18/62] graceful exit if datamash calculation failed --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 9b4c4b110..e4e2770aa 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -165,6 +165,12 @@ task CoverageStats { cat ~{prefix}.cov_stat_summary.txt + wc_sum=$(wc -l ~{prefix}.cov_stat_summary.txt | awk '{print $1}') + if [ "$wc_sum" -lt 2 ]; then + echo "Error calculating summary statistics. datamash may not have enough data." + exit 1 + fi + # Replace floating-point numbers suffix ing field names with '_coverage sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt From d15d9f7773288f0ac8fc64150790b764e6745493 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 24 Jul 2024 11:27:05 -0400 Subject: [PATCH 19/62] coverage evenness gets 'null' if unable to calculate --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index e4e2770aa..44695c1d1 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -224,7 +224,7 @@ task CoverageStats { echo "mean_coverage is 0, cannot calculate evenness score." # Set evenness score to None sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt - sed -i "2s/$/\tNone/" ~{prefix}.cov_stat_summary.txt + sed -i "2s/$/\tnull/" ~{prefix}.cov_stat_summary.txt fi cat ~{prefix}.cov_stat_summary.txt From 3118aa7428f86307cd8fec15862312c6815982fb Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 24 Jul 2024 17:55:03 -0400 Subject: [PATCH 20/62] transformed shell script to python and added to mosdepth Dockerfile --- docker/lr-mosdepth/Dockerfile | 3 + docker/lr-mosdepth/coverage_stats.py | 157 ++++++++++++++++++ .../TechAgnostic/Utility/CoverageStats.wdl | 114 +------------ 3 files changed, 168 insertions(+), 106 deletions(-) create mode 100644 docker/lr-mosdepth/coverage_stats.py diff --git a/docker/lr-mosdepth/Dockerfile b/docker/lr-mosdepth/Dockerfile index 9a5acff9d..b72e5468c 100644 --- a/docker/lr-mosdepth/Dockerfile +++ b/docker/lr-mosdepth/Dockerfile @@ -18,3 +18,6 @@ RUN curl https://sdk.cloud.google.com | bash # activate conda environment RUN echo "source activate lr-mosdepth" > ~/.bashrc + +# copy the script +COPY coverage_stats.py / diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py new file mode 100644 index 000000000..97ff05a80 --- /dev/null +++ b/docker/lr-mosdepth/coverage_stats.py @@ -0,0 +1,157 @@ +import gzip +import pandas as pd +import numpy as np +import json +import argparse +import logging + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Process coverage values.') + parser.add_argument('mosdepth_regions', type=str, + help='Mosdepth output file containing coverage values.') + parser.add_argument('--cov_col', type=int, default=4, + help='Column holding the coverage values.') + parser.add_argument('--output_prefix', type=str, + help='Prefix to use for output files.') + parser.add_argument('--round', type=int, default=2, help='Rounding precision.') + parser.add_argument('--debug', action='store_true', help='Debug mode.') + return parser.parse_args() + + +def calculate_summary_statistics(df, cov_col, round_precision): + """ + Calculate summary statistics for the coverage values. + @param df: Dataframe containing coverage values + @param cov_col: Column containing coverage values + @param round_precision: Rounding precision + @return: + """ + + cov_data = df.iloc[:, cov_col - 1] + mean_val = cov_data.mean() + mad_val = np.mean(np.abs(cov_data - mean_val)) + + statistics = { + 'mean': round(mean_val, round_precision), + 'q1': round(cov_data.quantile(0.25), round_precision), + 'median': round(cov_data.median(), round_precision), + 'q3': round(cov_data.quantile(0.75), round_precision), + 'iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25), + round_precision), + 'sstdev': round(cov_data.std(), round_precision), + 'mad': round(mad_val, round_precision) + } + return statistics + + +def percentage_greater_than_4x(df, cov_col, round_precision): + """ + Calculate the percentage of coverage values greater than 4x. + @param df: Dataframe containing coverage values + @param cov_col: Column containing coverage values + @param round_precision: Rounding precision + @return: + """ + logging.debug("Calculating percentage of coverage values greater than 4x") + total_bases = len(df) + bases_above_4x = df[df.iloc[:, cov_col - 1] > 4].shape[0] + percent_above_4x = round(bases_above_4x / total_bases, round_precision) + + return percent_above_4x + + +def calculate_evenness_score(df, cov_col, round_precision): + """ + Calculate the evenness score. + @param df: Dataframe containing coverage values + @param cov_col: Column containing coverage values + @param round_precision: Rounding precision + @return: + """ + logging.debug("Calculating evenness score") + mean_coverage = df.iloc[:, cov_col - 1].mean() + # Get the coverage values that are less than or equal to the mean coverage + D2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist() + # count of coverage values that are less than or equal to the mean coverage + D2_count = len(D2) + # sum of coverage values that are less than or equal to the mean coverage + D2_sum = sum(D2) + # total number of coverage + coverage_count = len(df) + + logging.debug("Mean coverage: %s", mean_coverage) + logging.debug("D2 count: %s", D2_count) + logging.debug("D2 sum: %s", D2_sum) + logging.debug("Coverage count: %s", coverage_count) + + if mean_coverage != 0: + evenness_score = round(1 - (D2_count - (D2_sum / mean_coverage)) / coverage_count, round_precision) + else: + logging.warning("Mean coverage is zero. Evenness score will be set to null.") + evenness_score = 'null' + + return evenness_score + + +def open_file(file_path): + """ + Open a file and return a dataframe + @param file_path: Path to the file + @return: + """ + + if file_path.endswith('.gz'): + with gzip.open(file_path, 'rt') as f: + df = pd.read_csv(f, sep='\t', header=None) + else: + with open(file_path, 'rt') as f: + df = pd.read_csv(f, sep='\t', header=None) + + return df + + +def main(): + args = parse_arguments() + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + logging.info("Arguments: %s", args) + logging.info("Calculating coverage statistics") + + prefix = args.output_prefix if args.output_prefix else args.mosdepth_regions.split('.')[0] + + df: pd.DataFrame = open_file(args.mosdepth_regions) + logging.info("Opened file: %s", args.mosdepth_regions) + + logging.debug("Dataframe shape: %s", df.shape) + logging.debug("Dataframe columns: %s", df.columns) + logging.debug(f"Dataframe Head\n{df.head()}") + + statistics = calculate_summary_statistics(df, args.cov_col, args.round) + + logging.debug("Summary statistics: %s", statistics) + + percent_above_4x = percentage_greater_than_4x(df, args.cov_col, args.round) + statistics['percent_above_4x'] = percent_above_4x + + logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x) + + evenness_score = calculate_evenness_score(df, args.cov_col, args.round) + statistics['evenness_score'] = evenness_score + + logging.info("Evenness score: %s", evenness_score) + + logging.info("Summary statistics: %s", statistics) + + summary_file = f"{prefix}.cov_stat_summary.json" + + logging.info("Writing summary statistics to file: %s", summary_file) + with open(summary_file, 'w') as f: + json.dump(statistics, f) + + +if __name__ == "__main__": + main() diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 44695c1d1..75b260f7a 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -150,115 +150,17 @@ task CoverageStats { String basename = select_first([basename_input, basename(mosdepth_regions)]) String prefix = "~{basename}.coverage_over_bed" - command <<< + command { set -euxo pipefail - # Use Datamash to calculate summary statistics - zcat ~{mosdepth_regions} | datamash -H -R ~{round} \ - mean ~{cov_col} \ - q1 ~{cov_col} \ - median ~{cov_col} \ - q3 ~{cov_col} \ - iqr ~{cov_col} \ - sstdev ~{cov_col} \ - mad ~{cov_col} > ~{prefix}.cov_stat_summary.txt - - cat ~{prefix}.cov_stat_summary.txt - - wc_sum=$(wc -l ~{prefix}.cov_stat_summary.txt | awk '{print $1}') - if [ "$wc_sum" -lt 2 ]; then - echo "Error calculating summary statistics. datamash may not have enough data." - exit 1 - fi - - # Replace floating-point numbers suffix ing field names with '_coverage - sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt - - ############################################################ - # Calculate covrage percentage with greater than 4x coverage - ############################################################ - - echo "Calculating coverage percentage with greater than 4x coverage" - - total_bases=$(zcat ~{mosdepth_regions} | wc -l) - bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l) - percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))") - - # Append the percentage to the summary tsv file - sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt - sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt - - cat ~{prefix}.cov_stat_summary.txt - - ############################################################ - # Calculate Evenness Score - # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS. - # https://www.nature.com/articles/jhg201621 - ############################################################ - - echo "Calculating Evenness Score" - - # get mean from the summary file, under the column 'mean_coverage' - mean_coverage=$(awk -F"\t" '{for(i=1; i<=NF; i++) if ($i == "mean_coverage") {getline; getline; printf "%.0f", $(i); exit}}' ~{prefix}.cov_stat_summary.txt) - coverage_count=$(zcat ~{mosdepth_regions} | wc -l) - - # get list of coverages that are less than the mean coverage from the mosdepth output file - D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} <= mean) print $~{cov_col}}')) - D2_count=${#D2[@]} - D2_sum=$(printf "%s\n" "${D2[@]}" | awk 'BEGIN {sum = 0} {sum += $1} END {print sum}') - - # print the values for debugging - echo "mean_coverage: $mean_coverage" - echo "coverage_count: $coverage_count" - echo "D2_count: $D2_count" - echo "D2_sum: $D2_sum" - - if [ $mean_coverage -ne 0 ]; then - # calculate the evenness score - evenness_score=$(python3 -c "print(round(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count, 2))") - - # Append the evenness score to the summary tsv file - sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt - sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt - else - echo "mean_coverage is 0, cannot calculate evenness score." - # Set evenness score to None - sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt - sed -i "2s/$/\tnull/" ~{prefix}.cov_stat_summary.txt - fi - - cat ~{prefix}.cov_stat_summary.txt - - ############################################################ - # Convert the summary statistics to a JSON object - ############################################################ - - echo "Converting summary statistics to JSON object" - - # Extract field names from the header - header=$(head -n 1 ~{prefix}.cov_stat_summary.txt) - IFS=$'\t' read -ra field_names <<< "$header" - - # Extract data values - data=$(tail -n 1 ~{prefix}.cov_stat_summary.txt) - data_values=($(echo "$data" | awk '{ for(i=1; i<=NF; i++) print $i }')) - - # Create the JSON object - json="{" - for (( i=0; i<${#field_names[@]}; i++ )); do - json+="\"${field_names[i]}\":${data_values[i]}" - if [[ $i -lt $(( ${#field_names[@]} - 1 )) ]]; then - json+=", " - fi - done - json+="}" - - # Write the JSON object to the output file - echo "$json" > ~{prefix}.cov_stat_summary.json - >>> + python3 coverage_stats.py \ + --mosdepth_regions ~{mosdepth_regions} \ + --cov_col ~{cov_col} \ + --round ~{round} \ + --prefix ~{prefix} + } output { - File cov_stat_summary_file = "~{prefix}.cov_stat_summary.txt" Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") } @@ -268,6 +170,6 @@ task CoverageStats { disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries maxRetries: 1 - docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest" } } From 2d1e64ec7de65a7bf1faa97b230748cd17169d85 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 26 Jul 2024 11:17:38 -0400 Subject: [PATCH 21/62] fixed coverage_stats.py option names --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 75b260f7a..e0927845b 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -154,13 +154,14 @@ task CoverageStats { set -euxo pipefail python3 coverage_stats.py \ - --mosdepth_regions ~{mosdepth_regions} \ --cov_col ~{cov_col} \ --round ~{round} \ - --prefix ~{prefix} + --output_prefix ~{prefix} \ + ~{mosdepth_regions} } output { + File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json" Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") } From 9ce9549ba3bccf94cc87efcc307e7a98cb2e482f Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 26 Jul 2024 11:17:59 -0400 Subject: [PATCH 22/62] changed column names --- docker/lr-mosdepth/coverage_stats.py | 40 +++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py index 97ff05a80..d0282c11d 100644 --- a/docker/lr-mosdepth/coverage_stats.py +++ b/docker/lr-mosdepth/coverage_stats.py @@ -1,4 +1,6 @@ import gzip + +import pandas import pandas as pd import numpy as np import json @@ -28,19 +30,19 @@ def calculate_summary_statistics(df, cov_col, round_precision): @return: """ - cov_data = df.iloc[:, cov_col - 1] - mean_val = cov_data.mean() - mad_val = np.mean(np.abs(cov_data - mean_val)) + cov_data: pandas.Series = df.iloc[:, cov_col - 1] + mean_val: float = cov_data.mean() + mad_val: float = np.mean(np.abs(cov_data - mean_val)) statistics = { - 'mean': round(mean_val, round_precision), - 'q1': round(cov_data.quantile(0.25), round_precision), - 'median': round(cov_data.median(), round_precision), - 'q3': round(cov_data.quantile(0.75), round_precision), - 'iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25), + 'cov_mean': round(mean_val, round_precision), + 'cov_q1': round(cov_data.quantile(0.25), round_precision), + 'cov_median': round(cov_data.median(), round_precision), + 'cov_q3': round(cov_data.quantile(0.75), round_precision), + 'cov_iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25), round_precision), - 'sstdev': round(cov_data.std(), round_precision), - 'mad': round(mad_val, round_precision) + 'cov_sstdev': round(cov_data.std(), round_precision), + 'cov_mad': round(mad_val, round_precision) } return statistics @@ -72,21 +74,21 @@ def calculate_evenness_score(df, cov_col, round_precision): logging.debug("Calculating evenness score") mean_coverage = df.iloc[:, cov_col - 1].mean() # Get the coverage values that are less than or equal to the mean coverage - D2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist() + d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist() # count of coverage values that are less than or equal to the mean coverage - D2_count = len(D2) + d2_count = len(d2) # sum of coverage values that are less than or equal to the mean coverage - D2_sum = sum(D2) - # total number of coverage + d2_sum = sum(d2) + # total number of coverages coverage_count = len(df) logging.debug("Mean coverage: %s", mean_coverage) - logging.debug("D2 count: %s", D2_count) - logging.debug("D2 sum: %s", D2_sum) + logging.debug("D2 count: %s", d2_count) + logging.debug("D2 sum: %s", d2_sum) logging.debug("Coverage count: %s", coverage_count) if mean_coverage != 0: - evenness_score = round(1 - (D2_count - (D2_sum / mean_coverage)) / coverage_count, round_precision) + evenness_score = round(1 - (d2_count - (d2_sum / mean_coverage)) / coverage_count, round_precision) else: logging.warning("Mean coverage is zero. Evenness score will be set to null.") evenness_score = 'null' @@ -135,12 +137,12 @@ def main(): logging.debug("Summary statistics: %s", statistics) percent_above_4x = percentage_greater_than_4x(df, args.cov_col, args.round) - statistics['percent_above_4x'] = percent_above_4x + statistics['cov_percent_above_4x'] = percent_above_4x logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x) evenness_score = calculate_evenness_score(df, args.cov_col, args.round) - statistics['evenness_score'] = evenness_score + statistics['cov_evenness_score'] = evenness_score logging.info("Evenness score: %s", evenness_score) From 49445d8173755856bd792344498a5ff79206dc65 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 26 Jul 2024 14:10:58 -0400 Subject: [PATCH 23/62] new line eof for output file --- docker/lr-mosdepth/coverage_stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py index d0282c11d..733979e38 100644 --- a/docker/lr-mosdepth/coverage_stats.py +++ b/docker/lr-mosdepth/coverage_stats.py @@ -153,6 +153,7 @@ def main(): logging.info("Writing summary statistics to file: %s", summary_file) with open(summary_file, 'w') as f: json.dump(statistics, f) + f.write('\n') if __name__ == "__main__": From 6431ff2106bf51c5edc36db94a75d556a56a3446 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 26 Jul 2024 14:22:37 -0400 Subject: [PATCH 24/62] moved changes to bs_coverage_stats_script_mosdepth branch --- docker/lr-mosdepth/Dockerfile | 3 - docker/lr-mosdepth/coverage_stats.py | 160 --------------------------- 2 files changed, 163 deletions(-) delete mode 100644 docker/lr-mosdepth/coverage_stats.py diff --git a/docker/lr-mosdepth/Dockerfile b/docker/lr-mosdepth/Dockerfile index b72e5468c..9a5acff9d 100644 --- a/docker/lr-mosdepth/Dockerfile +++ b/docker/lr-mosdepth/Dockerfile @@ -18,6 +18,3 @@ RUN curl https://sdk.cloud.google.com | bash # activate conda environment RUN echo "source activate lr-mosdepth" > ~/.bashrc - -# copy the script -COPY coverage_stats.py / diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py deleted file mode 100644 index 733979e38..000000000 --- a/docker/lr-mosdepth/coverage_stats.py +++ /dev/null @@ -1,160 +0,0 @@ -import gzip - -import pandas -import pandas as pd -import numpy as np -import json -import argparse -import logging - - -def parse_arguments(): - parser = argparse.ArgumentParser(description='Process coverage values.') - parser.add_argument('mosdepth_regions', type=str, - help='Mosdepth output file containing coverage values.') - parser.add_argument('--cov_col', type=int, default=4, - help='Column holding the coverage values.') - parser.add_argument('--output_prefix', type=str, - help='Prefix to use for output files.') - parser.add_argument('--round', type=int, default=2, help='Rounding precision.') - parser.add_argument('--debug', action='store_true', help='Debug mode.') - return parser.parse_args() - - -def calculate_summary_statistics(df, cov_col, round_precision): - """ - Calculate summary statistics for the coverage values. - @param df: Dataframe containing coverage values - @param cov_col: Column containing coverage values - @param round_precision: Rounding precision - @return: - """ - - cov_data: pandas.Series = df.iloc[:, cov_col - 1] - mean_val: float = cov_data.mean() - mad_val: float = np.mean(np.abs(cov_data - mean_val)) - - statistics = { - 'cov_mean': round(mean_val, round_precision), - 'cov_q1': round(cov_data.quantile(0.25), round_precision), - 'cov_median': round(cov_data.median(), round_precision), - 'cov_q3': round(cov_data.quantile(0.75), round_precision), - 'cov_iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25), - round_precision), - 'cov_sstdev': round(cov_data.std(), round_precision), - 'cov_mad': round(mad_val, round_precision) - } - return statistics - - -def percentage_greater_than_4x(df, cov_col, round_precision): - """ - Calculate the percentage of coverage values greater than 4x. - @param df: Dataframe containing coverage values - @param cov_col: Column containing coverage values - @param round_precision: Rounding precision - @return: - """ - logging.debug("Calculating percentage of coverage values greater than 4x") - total_bases = len(df) - bases_above_4x = df[df.iloc[:, cov_col - 1] > 4].shape[0] - percent_above_4x = round(bases_above_4x / total_bases, round_precision) - - return percent_above_4x - - -def calculate_evenness_score(df, cov_col, round_precision): - """ - Calculate the evenness score. - @param df: Dataframe containing coverage values - @param cov_col: Column containing coverage values - @param round_precision: Rounding precision - @return: - """ - logging.debug("Calculating evenness score") - mean_coverage = df.iloc[:, cov_col - 1].mean() - # Get the coverage values that are less than or equal to the mean coverage - d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist() - # count of coverage values that are less than or equal to the mean coverage - d2_count = len(d2) - # sum of coverage values that are less than or equal to the mean coverage - d2_sum = sum(d2) - # total number of coverages - coverage_count = len(df) - - logging.debug("Mean coverage: %s", mean_coverage) - logging.debug("D2 count: %s", d2_count) - logging.debug("D2 sum: %s", d2_sum) - logging.debug("Coverage count: %s", coverage_count) - - if mean_coverage != 0: - evenness_score = round(1 - (d2_count - (d2_sum / mean_coverage)) / coverage_count, round_precision) - else: - logging.warning("Mean coverage is zero. Evenness score will be set to null.") - evenness_score = 'null' - - return evenness_score - - -def open_file(file_path): - """ - Open a file and return a dataframe - @param file_path: Path to the file - @return: - """ - - if file_path.endswith('.gz'): - with gzip.open(file_path, 'rt') as f: - df = pd.read_csv(f, sep='\t', header=None) - else: - with open(file_path, 'rt') as f: - df = pd.read_csv(f, sep='\t', header=None) - - return df - - -def main(): - args = parse_arguments() - if args.debug: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - logging.info("Arguments: %s", args) - logging.info("Calculating coverage statistics") - - prefix = args.output_prefix if args.output_prefix else args.mosdepth_regions.split('.')[0] - - df: pd.DataFrame = open_file(args.mosdepth_regions) - logging.info("Opened file: %s", args.mosdepth_regions) - - logging.debug("Dataframe shape: %s", df.shape) - logging.debug("Dataframe columns: %s", df.columns) - logging.debug(f"Dataframe Head\n{df.head()}") - - statistics = calculate_summary_statistics(df, args.cov_col, args.round) - - logging.debug("Summary statistics: %s", statistics) - - percent_above_4x = percentage_greater_than_4x(df, args.cov_col, args.round) - statistics['cov_percent_above_4x'] = percent_above_4x - - logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x) - - evenness_score = calculate_evenness_score(df, args.cov_col, args.round) - statistics['cov_evenness_score'] = evenness_score - - logging.info("Evenness score: %s", evenness_score) - - logging.info("Summary statistics: %s", statistics) - - summary_file = f"{prefix}.cov_stat_summary.json" - - logging.info("Writing summary statistics to file: %s", summary_file) - with open(summary_file, 'w') as f: - json.dump(statistics, f) - f.write('\n') - - -if __name__ == "__main__": - main() From ac916c0d4a0b65a55993e0aeb4b8ba1cb2b164ab Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 1 Aug 2024 11:29:55 -0400 Subject: [PATCH 25/62] "Addition: logic for bed_file with one inteval in MosdepthCoverageStats workflow" --- .../TechAgnostic/Utility/CoverageStats.wdl | 55 ++++++++++++++++++- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index e0927845b..64df0bc26 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -18,17 +18,37 @@ workflow MosdepthCoverageStats { File aligned_bai File? bed_file - Int bin_length = 1000 + Int? bin_length # Runtime parameters Int preemptible_tries = 3 } + if (defined(bed_file) && defined(bin_length)) { + call BinBed { + input: + bed_file = bed_file, + bin_length = bin_length + } + } + + # if the line number in the bed file is less than 2 the run BinBed + if (defined(bed_file)){ + Int file_lines = length(read_lines(select_first([bed_file]))) + if (file_lines < 2) { + call BinBed as BinSingleBed { + input: + bed_file = bed_file, + bin_length = 1 + } + } + } + call MosDepthOverBed { input: bam = aligned_bam, bai = aligned_bai, - bed = bed_file, + bed = select_first([BinBed.binned_bed, BinSingleBed.binned_bed, bed_file]), bin_length = bin_length, preemptible_tries = preemptible_tries, } @@ -74,7 +94,7 @@ task MosDepthOverBed { File? bed Int threads = 4 String? chrom - Int? bin_length + Int bin_length = 1000 String? thresholds Boolean no_per_base = true Boolean fast_mode = true @@ -108,9 +128,12 @@ task MosDepthOverBed { output { File global_dist = "~{prefix}.mosdepth.global.dist.txt" + File summary = "~{prefix}.mosdepth.summary.txt" + File? per_base = "~{prefix}.per-base.bed.gz" File region_dist = "~{prefix}.mosdepth.region.dist.txt" File regions = "~{prefix}.regions.bed.gz" File regions_csi = "~{prefix}.regions.bed.gz.csi" + } runtime { @@ -174,3 +197,29 @@ task CoverageStats { docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest" } } + + +task BinBed { + input { + File? bed_file + Int? bin_length + } + + command { + set -euxo pipefail + + bedtools makewindows -b ~{bed_file} -w ~{bin_length} > binned.bed + + } + + output { + File binned_bed = "binned.bed" + } + + runtime { + cpu: 1 + memory: "1 GiB" + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest" + } +} \ No newline at end of file From f46a66bbaf8fa6211178ff82c2e0d657193fafcc Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 5 Aug 2024 18:23:00 -0400 Subject: [PATCH 26/62] "Addition: reformated logic for bed_file with one inteval in MosdepthCoverageStats workflow, shifted to using feature branch for docker of mosdepth" --- .../TechAgnostic/Utility/CoverageStats.wdl | 64 ++++++++++++------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 64df0bc26..933993500 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -25,39 +25,43 @@ workflow MosdepthCoverageStats { } if (defined(bed_file) && defined(bin_length)) { - call BinBed { + call BinBed { input: bed_file = bed_file, bin_length = bin_length } } - # if the line number in the bed file is less than 2 the run BinBed - if (defined(bed_file)){ - Int file_lines = length(read_lines(select_first([bed_file]))) - if (file_lines < 2) { - call BinBed as BinSingleBed { - input: - bed_file = bed_file, - bin_length = 1 - } + # Runs in cases where bed file is provided or bed and bin_length are provided + if (defined(bed_file) || (defined(bin_length) && defined(BinBed.binned_bed))) { + call MosDepthOverBed { + input: + bam = aligned_bam, + bai = aligned_bai, + bed = select_first([BinBed.binned_bed, bed_file]), + bin_length = bin_length, + preemptible_tries = preemptible_tries, } + File cov_stat_input_bed = select_first([MosDepthOverBed.per_base, MosDepthOverBed.regions]) } - call MosDepthOverBed { - input: - bam = aligned_bam, - bai = aligned_bai, - bed = select_first([BinBed.binned_bed, BinSingleBed.binned_bed, bed_file]), - bin_length = bin_length, - preemptible_tries = preemptible_tries, + # Runs in cases where no bed file is provided, will run regardless of bin_length + if (!defined(bed_file) ) { + call MosDepthOverBed as MosDepthNoBed { + input: + bam = aligned_bam, + bai = aligned_bai, + bin_length = bin_length, + preemptible_tries = preemptible_tries, + } + File cov_stat_input_no_bed = select_first([MosDepthNoBed.per_base, MosDepthNoBed.regions]) } String basename = basename(aligned_bam, ".bam") call CoverageStats { input: - mosdepth_regions = MosDepthOverBed.regions, + mosdepth_regions = select_first([cov_stat_input_bed, cov_stat_input_no_bed]), basename_input = basename, preemptible_tries = preemptible_tries, } @@ -96,8 +100,8 @@ task MosDepthOverBed { String? chrom Int bin_length = 1000 String? thresholds - Boolean no_per_base = true - Boolean fast_mode = true + Boolean no_per_base = false + Boolean fast_mode = false Int mapq = 1 # Runtime parameters @@ -142,7 +146,7 @@ task MosDepthOverBed { disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries maxRetries: 1 - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1" } } @@ -194,7 +198,7 @@ task CoverageStats { disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries maxRetries: 1 - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest" + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1" } } @@ -222,4 +226,18 @@ task BinBed { disks: "local-disk 10 HDD" docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest" } -} \ No newline at end of file +} + + + +# # if the line number in the bed file is less than 2 the run BinBed +# if (defined(bed_file)){ +# Int file_lines = length(read_lines(select_first([bed_file]))) +# if (file_lines < 2) { +# call BinBed as BinSingleBed { +# input: +# bed_file = bed_file, +# bin_length = 1 +# } +# } +# } From 2608f4835a84269b5dafd6a7b19f70d33c03ac50 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 5 Aug 2024 18:31:28 -0400 Subject: [PATCH 27/62] "Addition: set parameters to mosdepth, so readability is improved in Terra" --- .../TechAgnostic/Utility/CoverageStats.wdl | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 933993500..24848267e 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -85,10 +85,6 @@ task MosDepthOverBed { bed: "BED file containing regions of interest." bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored." chrom: "Chromosome to calculate coverage for." - no_per_base: "Do not calculate per-base coverage." - fast_mode: "Use fast mode." - mapq: "Minimum mapping quality to consider." - thresholds: "Comma-separated list of thresholds to use for coverage calculation. (e.g. 1,10,20,30)." preemptible_tries: "Number of times to retry a preempted task." } @@ -96,18 +92,19 @@ task MosDepthOverBed { File bam File bai File? bed - Int threads = 4 + String? chrom Int bin_length = 1000 - String? thresholds - Boolean no_per_base = false - Boolean fast_mode = false - Int mapq = 1 # Runtime parameters Int preemptible_tries = 3 } + Int threads = 4 + Boolean no_per_base = false + Boolean fast_mode = false + Int mapq = 1 + Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" @@ -126,7 +123,6 @@ task MosDepthOverBed { ~{"-c " + chrom} \ ~{"-b " + select_first([bed, bin_length])} \ ~{"-Q " + mapq} \ - ~{"-T " + thresholds} \ ~{prefix} ./~{basename}.bam } From a641a060355401f89b7f58593aaa55b99c3e0937 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 08:24:09 -0400 Subject: [PATCH 28/62] "Addition: add absolute path to coverage_stats.py" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 24848267e..3ea845c70 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -176,7 +176,7 @@ task CoverageStats { command { set -euxo pipefail - python3 coverage_stats.py \ + python3 /coverage_stats.py \ --cov_col ~{cov_col} \ --round ~{round} \ --output_prefix ~{prefix} \ From b6eb2e7598efdeb29fce9dab02190866c7c0b7c7 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 09:22:12 -0400 Subject: [PATCH 29/62] "Addition: combined mosdepth task and coverage" --- .../TechAgnostic/Utility/CoverageStats.wdl | 121 +++++++----------- 1 file changed, 45 insertions(+), 76 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 3ea845c70..0a5f4518d 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -11,6 +11,7 @@ workflow MosdepthCoverageStats { bed_file: "BED file containing regions of interest." bin_length: "Length of bins to use for coverage calculation." preemptible_tries: "Number of times to retry a preempted task." +# stats_per_interval: "If true, calculate coverage statistics per interval." } input { @@ -19,11 +20,35 @@ workflow MosdepthCoverageStats { File? bed_file Int? bin_length +# +# Boolean stats_per_interval = false # Runtime parameters Int preemptible_tries = 3 } +# if (stats_per_interval) { +# scatter ( bed_line in read_lines(bed_file) ) { +# call MosDepthOverBed as MosDepthPerInterval { +# input: +# bam = aligned_bam, +# bai = aligned_bai, +# bed = write_lines([bed_line]), +# preemptible_tries = preemptible_tries, +# } +# File cov_stat_input_bed = select_first([MosDepthPerInterval.per_base, MosDepthPerInterval.regions]) +# String basename = basename(aligned_bam, ".bam") +# call CoverageStats { +# input: +# mosdepth_regions = cov_stat_input_bed, +# basename_input = basename, +# preemptible_tries = preemptible_tries, +# } +# } +# } + + + if (defined(bed_file) && defined(bin_length)) { call BinBed { input: @@ -42,7 +67,6 @@ workflow MosdepthCoverageStats { bin_length = bin_length, preemptible_tries = preemptible_tries, } - File cov_stat_input_bed = select_first([MosDepthOverBed.per_base, MosDepthOverBed.regions]) } # Runs in cases where no bed file is provided, will run regardless of bin_length @@ -54,21 +78,12 @@ workflow MosdepthCoverageStats { bin_length = bin_length, preemptible_tries = preemptible_tries, } - File cov_stat_input_no_bed = select_first([MosDepthNoBed.per_base, MosDepthNoBed.regions]) } - String basename = basename(aligned_bam, ".bam") - - call CoverageStats { - input: - mosdepth_regions = select_first([cov_stat_input_bed, cov_stat_input_no_bed]), - basename_input = basename, - preemptible_tries = preemptible_tries, - } output { - File cov_stat_summary_file = CoverageStats.cov_stat_summary_file - Map[String, Float] cov_stat_summary = CoverageStats.cov_stat_summary + File cov_stat_summary_file = MosDepthOverBed.cov_stat_summary_file + Map[String, Float] cov_stat_summary = MosDepthOverBed.cov_stat_summary } } @@ -99,12 +114,18 @@ task MosDepthOverBed { # Runtime parameters Int preemptible_tries = 3 } - + # mosdepth parameters Int threads = 4 Boolean no_per_base = false Boolean fast_mode = false Int mapq = 1 + # coverage_stats.py parameters + Int cov_col = 4 # column holding the coverage values + Int round = 2 + String header_suffix = "_coverage" + + Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" @@ -124,15 +145,26 @@ task MosDepthOverBed { ~{"-b " + select_first([bed, bin_length])} \ ~{"-Q " + mapq} \ ~{prefix} ./~{basename}.bam + + # Run coverage_stats.py + python3 /coverage_stats.py \ + --cov_col ~{cov_col} \ + --round ~{round} \ + --output_prefix ~{prefix} \ + ~{prefix}.per-base.bed.gz } output { + # mosdepth output File global_dist = "~{prefix}.mosdepth.global.dist.txt" File summary = "~{prefix}.mosdepth.summary.txt" File? per_base = "~{prefix}.per-base.bed.gz" File region_dist = "~{prefix}.mosdepth.region.dist.txt" File regions = "~{prefix}.regions.bed.gz" File regions_csi = "~{prefix}.regions.bed.gz.csi" + # coverage_stats.py output + File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json" + Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") } @@ -146,57 +178,7 @@ task MosDepthOverBed { } } -task CoverageStats { - - meta { - description: "Calculate coverage statistics from mosdepth output." - } - parameter_meta { - mosdepth_regions: "Mosdepth output file containing coverage values." - cov_col: "Column holding the coverage values." - basename_input: "Basename to use for output files." - preemptible_tries: "Number of times to retry a preempted task." - } - - input { - File mosdepth_regions - Int cov_col = 4 # column holding the coverage values - String? basename_input - - # Runtime parameters - Int preemptible_tries = 3 - } - Int round = 2 - String header_suffix = "_coverage" - - Int disk_size = 2*ceil(size(mosdepth_regions, "GB")) - String basename = select_first([basename_input, basename(mosdepth_regions)]) - String prefix = "~{basename}.coverage_over_bed" - command { - set -euxo pipefail - - python3 /coverage_stats.py \ - --cov_col ~{cov_col} \ - --round ~{round} \ - --output_prefix ~{prefix} \ - ~{mosdepth_regions} - } - - output { - File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json" - Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") - } - - runtime { - cpu: 2 - memory: 4 + " GiB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - maxRetries: 1 - docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1" - } -} task BinBed { @@ -224,16 +206,3 @@ task BinBed { } } - - -# # if the line number in the bed file is less than 2 the run BinBed -# if (defined(bed_file)){ -# Int file_lines = length(read_lines(select_first([bed_file]))) -# if (file_lines < 2) { -# call BinBed as BinSingleBed { -# input: -# bed_file = bed_file, -# bin_length = 1 -# } -# } -# } From 8e3e63028af836695c1d75f56f8edcff0a1d7e52 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 09:39:29 -0400 Subject: [PATCH 30/62] "Addition: fix optional output" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 0a5f4518d..d31429ad8 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -82,8 +82,8 @@ workflow MosdepthCoverageStats { output { - File cov_stat_summary_file = MosDepthOverBed.cov_stat_summary_file - Map[String, Float] cov_stat_summary = MosDepthOverBed.cov_stat_summary + File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) + Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) } } From 4a5b11f0b78295b3b72772af153fef69602c638f Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 09:42:02 -0400 Subject: [PATCH 31/62] "Addition: fix optional output" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index d31429ad8..655b70c17 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -82,8 +82,8 @@ workflow MosdepthCoverageStats { output { - File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) - Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) + File? cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) + Map[String, Float]? cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) } } From 1131d859980a7a3ee3c2727410e1ce751ee5ceb3 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 10:45:00 -0400 Subject: [PATCH 32/62] "Addition: added a scatter option" --- .../TechAgnostic/Utility/CoverageStats.wdl | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 655b70c17..6623a39ad 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -11,7 +11,7 @@ workflow MosdepthCoverageStats { bed_file: "BED file containing regions of interest." bin_length: "Length of bins to use for coverage calculation." preemptible_tries: "Number of times to retry a preempted task." -# stats_per_interval: "If true, calculate coverage statistics per interval." + stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file." } input { @@ -20,32 +20,32 @@ workflow MosdepthCoverageStats { File? bed_file Int? bin_length -# -# Boolean stats_per_interval = false + + Boolean stats_per_interval = false # Runtime parameters Int preemptible_tries = 3 } -# if (stats_per_interval) { -# scatter ( bed_line in read_lines(bed_file) ) { -# call MosDepthOverBed as MosDepthPerInterval { -# input: -# bam = aligned_bam, -# bai = aligned_bai, -# bed = write_lines([bed_line]), -# preemptible_tries = preemptible_tries, -# } -# File cov_stat_input_bed = select_first([MosDepthPerInterval.per_base, MosDepthPerInterval.regions]) -# String basename = basename(aligned_bam, ".bam") -# call CoverageStats { -# input: -# mosdepth_regions = cov_stat_input_bed, -# basename_input = basename, -# preemptible_tries = preemptible_tries, -# } -# } -# } + if (stats_per_interval ) { + if (!defined(bed_file)) { + call FailWorkflow { + input: + message = "stats_per_interval is set to true, but no bed file is provided." + } + } + scatter ( bed_line in read_lines(select_first([bed_file])) ) { + call MosDepthOverBed as MosDepthPerInterval { + input: + bam = aligned_bam, + bai = aligned_bai, + bed = write_lines([bed_line]), + preemptible_tries = preemptible_tries, + } + } + Array[File] cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_file + Array[Map[String, Float]] cov_stat_summaries = MosDepthPerInterval.cov_stat_summary + } @@ -82,8 +82,10 @@ workflow MosdepthCoverageStats { output { - File? cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) - Map[String, Float]? cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) + File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) + Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) + Array[File]? stats_per_interval_cov_stat_summary_files = cov_stat_summary_files + Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = cov_stat_summaries } } @@ -206,3 +208,20 @@ task BinBed { } } +task FailWorkflow { + input{ + String message + } + command { + echo "Failing workflow" + echo ~{message} + exit 1 + } + output { + String out_message = message + } + runtime { + docker: "marketplace.gcr.io/google/ubuntu2004" + } +} + From 08a2bb4dc644e708997098553193634e1d3d2e4b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 13:35:11 -0400 Subject: [PATCH 33/62] "Addition: memory parameter added " --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 6623a39ad..f05f349fa 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -114,6 +114,7 @@ task MosDepthOverBed { Int bin_length = 1000 # Runtime parameters + Int mem = 8 Int preemptible_tries = 3 } # mosdepth parameters @@ -127,7 +128,7 @@ task MosDepthOverBed { Int round = 2 String header_suffix = "_coverage" - + # Calculate disk size Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" @@ -172,7 +173,7 @@ task MosDepthOverBed { runtime { cpu: 4 - memory: 8 + " GiB" + memory: mem + " GiB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries maxRetries: 1 From ec15f626c5b3bf659fe1fbbd901349ed7db71751 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 13:42:04 -0400 Subject: [PATCH 34/62] "Addition: memory parameter added to workflow parameter" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index f05f349fa..ad68b1f98 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -25,6 +25,7 @@ workflow MosdepthCoverageStats { # Runtime parameters Int preemptible_tries = 3 + Int mosdepth_mem = 8 } if (stats_per_interval ) { @@ -41,6 +42,7 @@ workflow MosdepthCoverageStats { bai = aligned_bai, bed = write_lines([bed_line]), preemptible_tries = preemptible_tries, + mem = mosdepth_mem, } } Array[File] cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_file @@ -66,6 +68,7 @@ workflow MosdepthCoverageStats { bed = select_first([BinBed.binned_bed, bed_file]), bin_length = bin_length, preemptible_tries = preemptible_tries, + mem = mosdepth_mem, } } @@ -77,6 +80,7 @@ workflow MosdepthCoverageStats { bai = aligned_bai, bin_length = bin_length, preemptible_tries = preemptible_tries, + mem = mosdepth_mem, } } From 39b33314d4894801396ddfe2e68fd964a911e1a1 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 14:44:04 -0400 Subject: [PATCH 35/62] "Addition: fail if scatter interval is over 200" --- .../TechAgnostic/Utility/CoverageStats.wdl | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index ad68b1f98..a8ceb848d 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -8,10 +8,10 @@ workflow MosdepthCoverageStats { parameter_meta { aligned_bam: "Aligned BAM file." aligned_bai: "Aligned BAM index file." - bed_file: "BED file containing regions of interest." + bed_file: "BED file containing regions of interest. Workflow will fail if bed file is empty" bin_length: "Length of bins to use for coverage calculation." preemptible_tries: "Number of times to retry a preempted task." - stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file." + stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file. Workflow will fail if bed file contains more than 200 intervals to avoid accidently over spending." } input { @@ -28,13 +28,30 @@ workflow MosdepthCoverageStats { Int mosdepth_mem = 8 } + if (defined(bed_file)) { + if (length(read_lines(select_first([bed_file]))) == 0) { + call FailWorkflow as EmptyBedFile { + input: + message = "stats_per_interval is set to true, but the provided bed file is empty." + } + } + } + if (stats_per_interval ) { if (!defined(bed_file)) { - call FailWorkflow { + call FailWorkflow as NoBedFile { input: message = "stats_per_interval is set to true, but no bed file is provided." } } + if (defined(bed_file)) { + if (length(read_lines(select_first([bed_file]))) > 200) { + call FailWorkflow as TooManyIntervals { + input: + message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals." + } + } + } scatter ( bed_line in read_lines(select_first([bed_file])) ) { call MosDepthOverBed as MosDepthPerInterval { input: From 087788d4233477803a890f0dced74384e513348f Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 14:56:27 -0400 Subject: [PATCH 36/62] "Addition: check bed length in scatter regardless" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index a8ceb848d..475a2d3b8 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -44,12 +44,10 @@ workflow MosdepthCoverageStats { message = "stats_per_interval is set to true, but no bed file is provided." } } - if (defined(bed_file)) { - if (length(read_lines(select_first([bed_file]))) > 200) { - call FailWorkflow as TooManyIntervals { - input: - message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals." - } + if (length(read_lines(select_first([bed_file]))) > 200) { + call FailWorkflow as TooManyIntervals { + input: + message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals." } } scatter ( bed_line in read_lines(select_first([bed_file])) ) { From 5e49318667fa5873bf59fc8616d7f399207ac91b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 16:31:21 -0400 Subject: [PATCH 37/62] "Addition: mosdepth summarizes regions" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 475a2d3b8..56660bf80 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -58,6 +58,7 @@ workflow MosdepthCoverageStats { bed = write_lines([bed_line]), preemptible_tries = preemptible_tries, mem = mosdepth_mem, + summarize_regions = true } } Array[File] cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_file @@ -84,6 +85,7 @@ workflow MosdepthCoverageStats { bin_length = bin_length, preemptible_tries = preemptible_tries, mem = mosdepth_mem, + summarize_regions = true, } } @@ -131,6 +133,7 @@ task MosDepthOverBed { String? chrom Int bin_length = 1000 + boolean summarize_regions = false # Runtime parameters Int mem = 8 @@ -152,6 +155,8 @@ task MosDepthOverBed { String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" + String cov_file_to_summarize = if summarize_regions then "~{prefix}.regions.bed.gz" else "~{prefix}.per-base.bed.gz" + command { set -euxo pipefail @@ -173,7 +178,7 @@ task MosDepthOverBed { --cov_col ~{cov_col} \ --round ~{round} \ --output_prefix ~{prefix} \ - ~{prefix}.per-base.bed.gz + ~{cov_file_to_summarize} } output { From f798f33a96d1620381801157f03d1f704444e0c2 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 16:47:43 -0400 Subject: [PATCH 38/62] "Addition: mini fix" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 56660bf80..21be19591 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -133,7 +133,7 @@ task MosDepthOverBed { String? chrom Int bin_length = 1000 - boolean summarize_regions = false + Boolean summarize_regions = false # Runtime parameters Int mem = 8 From 8bfcfa76593a4a4b1f943d64f823ddf5c39f987b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 20:40:15 -0400 Subject: [PATCH 39/62] "Addition: created separate task for cov per interval" --- .../TechAgnostic/Utility/CoverageStats.wdl | 115 ++++++++++++++++-- 1 file changed, 104 insertions(+), 11 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 21be19591..98772becc 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -37,7 +37,7 @@ workflow MosdepthCoverageStats { } } - if (stats_per_interval ) { + if (stats_per_interval) { if (!defined(bed_file)) { call FailWorkflow as NoBedFile { input: @@ -50,19 +50,14 @@ workflow MosdepthCoverageStats { message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals." } } - scatter ( bed_line in read_lines(select_first([bed_file])) ) { - call MosDepthOverBed as MosDepthPerInterval { + call MosDepthPerInterval { input: bam = aligned_bam, bai = aligned_bai, - bed = write_lines([bed_line]), + bed = bed_file, preemptible_tries = preemptible_tries, mem = mosdepth_mem, - summarize_regions = true } - } - Array[File] cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_file - Array[Map[String, Float]] cov_stat_summaries = MosDepthPerInterval.cov_stat_summary } @@ -105,8 +100,8 @@ workflow MosdepthCoverageStats { output { File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) - Array[File]? stats_per_interval_cov_stat_summary_files = cov_stat_summary_files - Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = cov_stat_summaries + File? stats_per_interval_cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_all_file + Map[String, Float]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all } } @@ -148,7 +143,6 @@ task MosDepthOverBed { # coverage_stats.py parameters Int cov_col = 4 # column holding the coverage values Int round = 2 - String header_suffix = "_coverage" # Calculate disk size Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) @@ -205,8 +199,105 @@ task MosDepthOverBed { } } +task MosDepthPerInterval { + + meta { + description: "Calculate coverage using mosdepth for each interval in bed." + } + parameter_meta { + bam: "Aligned BAM file." + bai: "Aligned BAM index file." + bed: "BED file containing regions of interest." + bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored." + preemptible_tries: "Number of times to retry a preempted task." + } + + input { + File bam + File bai + File? bed + + Int bin_length = 1000 + + # Runtime parameters + Int mem = 8 + Int preemptible_tries = 3 + } + # mosdepth parameters + Int threads = 4 + Boolean no_per_base = false + Boolean fast_mode = false + Int mapq = 1 + + # coverage_stats.py parameters + Int cov_col = 4 # column holding the coverage values + Int round = 2 + + # Calculate disk size + Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) + String basename = basename(bam, ".bam") + String prefix = "~{basename}.coverage_over_bed" + + String cov_file_to_summarize = "~{prefix}.regions.bed.gz" + + command <<< + set -euxo pipefail + + # Create symbolic links for bam and bai in the current working directory + ln -s ~{bam} ./~{basename}.bam + ln -s ~{bai} ./~{basename}.bai + + # create file for coverage stats summary of all intervals + touch ~{prefix}.cov_stat_summary_all.json + + # Create a temporary directory for intermediate files + tmp_dir=$(mktemp -d) + trap "rm -rf $tmp_dir" EXIT + + for bed_line in $(cat ~{bed}); do + bed_file="$tmp_dir/bed_line.bed" + echo $bed_line > $bed_file + + mosdepth \ + ~{true="-n" false="" no_per_base} \ + ~{true="-x" false="" fast_mode} \ + -t ~{threads} \ + -b $bed_file \ + ~{"-Q " + mapq} \ + ~{prefix} ./~{basename}.bam + + # Run coverage_stats.py + python3 /coverage_stats.py \ + --cov_col ~{cov_col} \ + --round ~{round} \ + --output_prefix ~{prefix} \ + ~{cov_file_to_summarize} + + # Append the coverage stats summary of the current interval to the file containing the summary of all intervals + cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json + + done + >>> + + output { + # coverage_stats.py output + File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json" + Map[String, Float] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") + + } + + runtime { + cpu: 4 + memory: mem + " GiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1" + } +} + task BinBed { input { @@ -238,6 +329,8 @@ task FailWorkflow { String message } command { + set -e + echo "Failing workflow" echo ~{message} exit 1 From d6a5ea13c5a9b79cb5134c9c85ecd1847ec465e5 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 21:07:47 -0400 Subject: [PATCH 40/62] "Addition: changed for loop for per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 98772becc..872ce1e23 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -254,7 +254,9 @@ task MosDepthPerInterval { tmp_dir=$(mktemp -d) trap "rm -rf $tmp_dir" EXIT - for bed_line in $(cat ~{bed}); do + mapfile -t lines < "~{bed}" + + for line in "${lines[@]}"; do bed_file="$tmp_dir/bed_line.bed" echo $bed_line > $bed_file From c988ddfae02445002ad2bf8ac179084dde3373d6 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 21:14:51 -0400 Subject: [PATCH 41/62] "Addition: fix bed_line to line in for loop for per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 872ce1e23..9328e2d7a 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -247,7 +247,7 @@ task MosDepthPerInterval { ln -s ~{bam} ./~{basename}.bam ln -s ~{bai} ./~{basename}.bai - # create file for coverage stats summary of all intervals + # Create file for coverage stats summary of all intervals touch ~{prefix}.cov_stat_summary_all.json # Create a temporary directory for intermediate files @@ -258,7 +258,7 @@ task MosDepthPerInterval { for line in "${lines[@]}"; do bed_file="$tmp_dir/bed_line.bed" - echo $bed_line > $bed_file + echo $line > $bed_file mosdepth \ ~{true="-n" false="" no_per_base} \ From 9f7fe0ec0d2b89351b15ce8339c641f3fc2b83f4 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 6 Aug 2024 21:36:52 -0400 Subject: [PATCH 42/62] "Addition: fix white space to be tab in for loop for per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 9328e2d7a..1a089a8c7 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -258,7 +258,7 @@ task MosDepthPerInterval { for line in "${lines[@]}"; do bed_file="$tmp_dir/bed_line.bed" - echo $line > $bed_file + echo $line | tr ' ' '\t' > $bed_file mosdepth \ ~{true="-n" false="" no_per_base} \ From 23f29c820853c5d3abf902384352dbdecb13ba25 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 10:29:34 -0400 Subject: [PATCH 43/62] "Addition: creating json list for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 1a089a8c7..780e4a2a5 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -249,6 +249,7 @@ task MosDepthPerInterval { # Create file for coverage stats summary of all intervals touch ~{prefix}.cov_stat_summary_all.json + cat "[\n" > ~{prefix}.cov_stat_summary_all.json # Create a temporary directory for intermediate files tmp_dir=$(mktemp -d) @@ -276,9 +277,15 @@ task MosDepthPerInterval { ~{cov_file_to_summarize} # Append the coverage stats summary of the current interval to the file containing the summary of all intervals - cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json + if [[ -s ~{prefix}.cov_stat_summary.json ]]; then + cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json + echo "," >> ~{prefix}.cov_stat_summary_all.json + fi done + # remove the last comma and close the json array + sed -i '$ s/,$//' ~{prefix}.cov_stat_summary_all.json + echo "]" >> ~{prefix}.cov_stat_summary_all.json >>> From 39e4c80499e147a0c9508938a06b14faacb13ccf Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 10:50:48 -0400 Subject: [PATCH 44/62] "Addition: fix for first line json list for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 780e4a2a5..3bd5e32b9 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -249,7 +249,7 @@ task MosDepthPerInterval { # Create file for coverage stats summary of all intervals touch ~{prefix}.cov_stat_summary_all.json - cat "[\n" > ~{prefix}.cov_stat_summary_all.json + echo "[" > ~{prefix}.cov_stat_summary_all.json # Create a temporary directory for intermediate files tmp_dir=$(mktemp -d) From a763344f9f2a854283e3c87dda6d036d72d74548 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 11:05:38 -0400 Subject: [PATCH 45/62] "Addition: fix output to be list of dict for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 3bd5e32b9..29dd68392 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -101,7 +101,7 @@ workflow MosdepthCoverageStats { File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) File? stats_per_interval_cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_all_file - Map[String, Float]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all + Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all } } @@ -293,7 +293,7 @@ task MosDepthPerInterval { output { # coverage_stats.py output File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json" - Map[String, Float] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") + Array[Map[String, Float]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") } From a3d06e2bf63f39168e90751375dbda25895f6f05 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 11:29:58 -0400 Subject: [PATCH 46/62] "Addition: fix output to be object value for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 29dd68392..29c7303ae 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -185,7 +185,7 @@ task MosDepthOverBed { File regions_csi = "~{prefix}.regions.bed.gz.csi" # coverage_stats.py output File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json" - Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") + Map[String, Object] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") } @@ -293,7 +293,7 @@ task MosDepthPerInterval { output { # coverage_stats.py output File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json" - Array[Map[String, Float]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") + Array[Map[String, Object]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") } From e5daa8af91b708c40bab4df79ec1d53b1e60942d Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 11:35:22 -0400 Subject: [PATCH 47/62] "Addition: fix output to be object value for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 29c7303ae..4c27f8d72 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -101,7 +101,7 @@ workflow MosdepthCoverageStats { File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) File? stats_per_interval_cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_all_file - Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all + Array[Map[String, Object]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all } } @@ -185,7 +185,7 @@ task MosDepthOverBed { File regions_csi = "~{prefix}.regions.bed.gz.csi" # coverage_stats.py output File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json" - Map[String, Object] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") + Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json") } From 13b035033c2bd0ee972d08c23de69ffad5674b28 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 12:32:03 -0400 Subject: [PATCH 48/62] "Addition: fix output to be object value for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 4c27f8d72..d75a45892 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -101,7 +101,7 @@ workflow MosdepthCoverageStats { File cov_stat_summary_file = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file]) Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary]) File? stats_per_interval_cov_stat_summary_files = MosDepthPerInterval.cov_stat_summary_all_file - Array[Map[String, Object]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all +# Array[Map[String, Object]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all } } @@ -293,7 +293,7 @@ task MosDepthPerInterval { output { # coverage_stats.py output File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json" - Array[Map[String, Object]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") +# Array[Map[String, Object]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json") } From 1e8a13f005e9cb283e5bf33fb01429d169165826 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 13:23:23 -0400 Subject: [PATCH 49/62] "Addition: made max bed interval value a parameter for cover per interval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index d75a45892..33b794d81 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -12,6 +12,7 @@ workflow MosdepthCoverageStats { bin_length: "Length of bins to use for coverage calculation." preemptible_tries: "Number of times to retry a preempted task." stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file. Workflow will fail if bed file contains more than 200 intervals to avoid accidently over spending." + max_bed_intervals: "Maximum number of intervals allowed in the bed file for MosdpethPerInterval. Default is 200." } input { @@ -22,6 +23,7 @@ workflow MosdepthCoverageStats { Int? bin_length Boolean stats_per_interval = false + Int max_bed_intervals = 200 # Runtime parameters Int preemptible_tries = 3 @@ -44,7 +46,7 @@ workflow MosdepthCoverageStats { message = "stats_per_interval is set to true, but no bed file is provided." } } - if (length(read_lines(select_first([bed_file]))) > 200) { + if (length(read_lines(select_first([bed_file]))) > max_bed_intervals) { call FailWorkflow as TooManyIntervals { input: message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals." From b58a0ed28c26fcf2374bbe06600d83c0b3517809 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 14:31:20 -0400 Subject: [PATCH 50/62] "Addition: added extra disk parameter" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 33b794d81..c2fa9442d 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -28,6 +28,7 @@ workflow MosdepthCoverageStats { # Runtime parameters Int preemptible_tries = 3 Int mosdepth_mem = 8 + Int mosdepth_extra_disk = 0 } if (defined(bed_file)) { @@ -59,6 +60,7 @@ workflow MosdepthCoverageStats { bed = bed_file, preemptible_tries = preemptible_tries, mem = mosdepth_mem, + extra_disk = mosdepth_extra_disk } } @@ -82,6 +84,7 @@ workflow MosdepthCoverageStats { bin_length = bin_length, preemptible_tries = preemptible_tries, mem = mosdepth_mem, + extra_disk = mosdepth_extra_disk, summarize_regions = true, } } @@ -95,6 +98,7 @@ workflow MosdepthCoverageStats { bin_length = bin_length, preemptible_tries = preemptible_tries, mem = mosdepth_mem, + extra_disk = mosdepth_extra_disk, } } @@ -121,6 +125,7 @@ task MosDepthOverBed { bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored." chrom: "Chromosome to calculate coverage for." preemptible_tries: "Number of times to retry a preempted task." + extra_disk: "Extra disk space to allocate for intermediate files." } input { @@ -135,6 +140,7 @@ task MosDepthOverBed { # Runtime parameters Int mem = 8 Int preemptible_tries = 3 + Int extra_disk = 0 } # mosdepth parameters Int threads = 4 @@ -147,7 +153,7 @@ task MosDepthOverBed { Int round = 2 # Calculate disk size - Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) + Int disk_size = (2 * ceil(size(bam, "GB") + size(bai, "GB"))) + 100 + extra_disk String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" @@ -212,6 +218,7 @@ task MosDepthPerInterval { bed: "BED file containing regions of interest." bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored." preemptible_tries: "Number of times to retry a preempted task." + extra_disk: "Extra disk space to allocate for intermediate files." } input { @@ -224,6 +231,7 @@ task MosDepthPerInterval { # Runtime parameters Int mem = 8 Int preemptible_tries = 3 + Int extra_disk = 0 } # mosdepth parameters Int threads = 4 @@ -236,7 +244,7 @@ task MosDepthPerInterval { Int round = 2 # Calculate disk size - Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB")) + Int disk_size = (2 * ceil(size(bam, "GB") + size(bai, "GB"))) + 100 + extra_disk String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" From 3ba18d485ad47e8fd61fdd07e9f8d4ddaa15e68b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 7 Aug 2024 14:59:45 -0400 Subject: [PATCH 51/62] "Addition: removed cpu details from runtime to avoid expensive custom vms, this should automatically be scaled by mem" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index c2fa9442d..17d6e3968 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -198,7 +198,6 @@ task MosDepthOverBed { } runtime { - cpu: 4 memory: mem + " GiB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries @@ -308,7 +307,6 @@ task MosDepthPerInterval { } runtime { - cpu: 4 memory: mem + " GiB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries From 8a8db154d27e96295f6e819e59ea7133c3c67149 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 15 Aug 2024 13:38:11 -0400 Subject: [PATCH 52/62] "Addition: added interval to per interval json --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 17d6e3968..7e3da8c8d 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -285,6 +285,10 @@ task MosDepthPerInterval { --output_prefix ~{prefix} \ ~{cov_file_to_summarize} + # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value + sed -i '' 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json + + # Append the coverage stats summary of the current interval to the file containing the summary of all intervals if [[ -s ~{prefix}.cov_stat_summary.json ]]; then cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json From 2cb6d449ccf80575b3d5122d8209415a89f5c2cd Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 15 Aug 2024 14:37:29 -0400 Subject: [PATCH 53/62] "Addition: fix sed command --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 7e3da8c8d..117793977 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -286,7 +286,7 @@ task MosDepthPerInterval { ~{cov_file_to_summarize} # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value - sed -i '' 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json + sed -i 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json # Append the coverage stats summary of the current interval to the file containing the summary of all intervals From a2622476c63e740aeacb711221be448f8fa6811a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 15 Aug 2024 14:50:56 -0400 Subject: [PATCH 54/62] "Addition: fix sed command with backslash for double quotes --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 117793977..19fe08f42 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -286,7 +286,7 @@ task MosDepthPerInterval { ~{cov_file_to_summarize} # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value - sed -i 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json + sed -i 's/{/\{"interval": \"$line\", /' ~{prefix}.cov_stat_summary.json # Append the coverage stats summary of the current interval to the file containing the summary of all intervals From 2cf2d12a485ccaea76e1d09aaf31acae3108c689 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 19 Aug 2024 11:06:19 -0400 Subject: [PATCH 55/62] "Fix: updated quotation for sed in mosdepthperinterval" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 19fe08f42..92567553e 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -286,7 +286,7 @@ task MosDepthPerInterval { ~{cov_file_to_summarize} # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value - sed -i 's/{/\{"interval": \"$line\", /' ~{prefix}.cov_stat_summary.json + sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json # Append the coverage stats summary of the current interval to the file containing the summary of all intervals From 8d7f19713d2dc6e14546ac76bb3cdc4fd4c51849 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 19 Aug 2024 14:18:55 -0400 Subject: [PATCH 56/62] "Fix: updated mosdepthperinterval to skip interval if mosdepth failed" --- .../TechAgnostic/Utility/CoverageStats.wdl | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 92567553e..92cc24b39 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -266,7 +266,7 @@ task MosDepthPerInterval { mapfile -t lines < "~{bed}" - for line in "${lines[@]}"; do + for line in "${lines[@]}"; then bed_file="$tmp_dir/bed_line.bed" echo $line | tr ' ' '\t' > $bed_file @@ -278,24 +278,29 @@ task MosDepthPerInterval { ~{"-Q " + mapq} \ ~{prefix} ./~{basename}.bam - # Run coverage_stats.py - python3 /coverage_stats.py \ - --cov_col ~{cov_col} \ - --round ~{round} \ - --output_prefix ~{prefix} \ - ~{cov_file_to_summarize} - - # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value - sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json - - - # Append the coverage stats summary of the current interval to the file containing the summary of all intervals - if [[ -s ~{prefix}.cov_stat_summary.json ]]; then - cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json - echo "," >> ~{prefix}.cov_stat_summary_all.json + # if mosdepth fails, skip the current interval + if [[ $? -ne 0 ]]; then + # Run coverage_stats.py + python3 /coverage_stats.py \ + --cov_col ~{cov_col} \ + --round ~{round} \ + --output_prefix ~{prefix} \ + ~{cov_file_to_summarize} + + # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value + sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json + + + # Append the coverage stats summary of the current interval to the file containing the summary of all intervals + if [[ -s ~{prefix}.cov_stat_summary.json ]]; then + cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json + echo "," >> ~{prefix}.cov_stat_summary_all.json + fi + else + echo "Mosdepth failed for interval: $line. Coverage stats script was not run." fi - done + # remove the last comma and close the json array sed -i '$ s/,$//' ~{prefix}.cov_stat_summary_all.json echo "]" >> ~{prefix}.cov_stat_summary_all.json From 8a13b1098bccef05b5100ec6eae5bea5e0d6fcca Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 19 Aug 2024 16:02:05 -0400 Subject: [PATCH 57/62] "Fix: updated mosdepthperinterval fail loud if mosdepth fails" --- .../TechAgnostic/Utility/CoverageStats.wdl | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 92cc24b39..7212a47cf 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -266,7 +266,7 @@ task MosDepthPerInterval { mapfile -t lines < "~{bed}" - for line in "${lines[@]}"; then + for line in "${lines[@]}"; do bed_file="$tmp_dir/bed_line.bed" echo $line | tr ' ' '\t' > $bed_file @@ -278,29 +278,30 @@ task MosDepthPerInterval { ~{"-Q " + mapq} \ ~{prefix} ./~{basename}.bam - # if mosdepth fails, skip the current interval + # if mosdepth fails, exit with failure if [[ $? -ne 0 ]]; then - # Run coverage_stats.py - python3 /coverage_stats.py \ - --cov_col ~{cov_col} \ - --round ~{round} \ - --output_prefix ~{prefix} \ - ~{cov_file_to_summarize} - - # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value - sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json - - - # Append the coverage stats summary of the current interval to the file containing the summary of all intervals - if [[ -s ~{prefix}.cov_stat_summary.json ]]; then - cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json - echo "," >> ~{prefix}.cov_stat_summary_all.json - fi - else - echo "Mosdepth failed for interval: $line. Coverage stats script was not run." + echo "Mosdepth failed for interval $line" + exit 1 fi - done + # Run coverage_stats.py + python3 /coverage_stats.py \ + --cov_col ~{cov_col} \ + --round ~{round} \ + --output_prefix ~{prefix} \ + ~{cov_file_to_summarize} + + # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value + sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json + + + # Append the coverage stats summary of the current interval to the file containing the summary of all intervals + if [[ -s ~{prefix}.cov_stat_summary.json ]]; then + cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json + echo "," >> ~{prefix}.cov_stat_summary_all.json + fi + + done # remove the last comma and close the json array sed -i '$ s/,$//' ~{prefix}.cov_stat_summary_all.json echo "]" >> ~{prefix}.cov_stat_summary_all.json From e0cf7d3a45dd7b47a4ed7c54e2feb2077fef828b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 22 Aug 2024 11:44:01 -0400 Subject: [PATCH 58/62] "Fix: updated mosdepthperinterval to disable per base reads " --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 7212a47cf..4e8d3de64 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -234,7 +234,7 @@ task MosDepthPerInterval { } # mosdepth parameters Int threads = 4 - Boolean no_per_base = false + Boolean no_per_base = true Boolean fast_mode = false Int mapq = 1 From 35ea1d393e23105e62d35191e2b2f469b27a5fca Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 26 Aug 2024 10:45:32 -0400 Subject: [PATCH 59/62] "Fix: updated mosdepthperinterval to use samtools to create new bam interval to be processed my mosdepth per-base" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 4e8d3de64..4153e4166 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -234,7 +234,7 @@ task MosDepthPerInterval { } # mosdepth parameters Int threads = 4 - Boolean no_per_base = true + Boolean no_per_base = false Boolean fast_mode = false Int mapq = 1 @@ -247,7 +247,7 @@ task MosDepthPerInterval { String basename = basename(bam, ".bam") String prefix = "~{basename}.coverage_over_bed" - String cov_file_to_summarize = "~{prefix}.regions.bed.gz" + String cov_file_to_summarize = "~{prefix}.per-base.bed.gz" command <<< set -euxo pipefail @@ -270,13 +270,15 @@ task MosDepthPerInterval { bed_file="$tmp_dir/bed_line.bed" echo $line | tr ' ' '\t' > $bed_file + # Use samtools to extract reads overlapping the interval + samtools view -b -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam + mosdepth \ ~{true="-n" false="" no_per_base} \ ~{true="-x" false="" fast_mode} \ -t ~{threads} \ - -b $bed_file \ ~{"-Q " + mapq} \ - ~{prefix} ./~{basename}.bam + ~{prefix} $tmp_dir/interval.bam # if mosdepth fails, exit with failure if [[ $? -ne 0 ]]; then @@ -286,6 +288,7 @@ task MosDepthPerInterval { # Run coverage_stats.py python3 /coverage_stats.py \ + --debug \ --cov_col ~{cov_col} \ --round ~{round} \ --output_prefix ~{prefix} \ From e6c2f8be2914dc23f75b13908a9fd025372e8a40 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 26 Aug 2024 14:34:51 -0400 Subject: [PATCH 60/62] "Fix: updated mosdepthperinterval to use samtools to create new bam index" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 4153e4166..046653ab5 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -272,6 +272,7 @@ task MosDepthPerInterval { # Use samtools to extract reads overlapping the interval samtools view -b -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam + samtools index $tmp_dir/interval.bam mosdepth \ ~{true="-n" false="" no_per_base} \ From 9cff0552cf95abcbf34decbefdf90fb301c42a95 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 26 Aug 2024 18:39:05 -0400 Subject: [PATCH 61/62] "Fix: updated mosdepthperinterval to use samtools --region-file" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 046653ab5..1c6c167c0 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -271,7 +271,7 @@ task MosDepthPerInterval { echo $line | tr ' ' '\t' > $bed_file # Use samtools to extract reads overlapping the interval - samtools view -b -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam + samtools view -b --region-file $bed_file ~{basename}.bam > $tmp_dir/interval.bam samtools index $tmp_dir/interval.bam mosdepth \ From f177effddf297bf6780ba67437ed62bac6905124 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 26 Aug 2024 19:56:11 -0400 Subject: [PATCH 62/62] "Fix: updated mosdepthperinterval to use samtools -M and -L" --- wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl index 1c6c167c0..cd496f532 100644 --- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl +++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl @@ -271,7 +271,7 @@ task MosDepthPerInterval { echo $line | tr ' ' '\t' > $bed_file # Use samtools to extract reads overlapping the interval - samtools view -b --region-file $bed_file ~{basename}.bam > $tmp_dir/interval.bam + samtools view -bM -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam samtools index $tmp_dir/interval.bam mosdepth \