From 611c6c3bd888c0f3787a88b2ca6117ab32109e7f Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 17 Aug 2023 14:05:54 -0400
Subject: [PATCH 01/62] Added CoverageStats.wdl

---
 .dockstore.yml                                |   3 +
 .../TechAgnostic/Utility/CoverageStats.wdl    | 196 ++++++++++++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl

diff --git a/.dockstore.yml b/.dockstore.yml
index ff958fe50..8193435fa 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -96,3 +96,6 @@ workflows:
 - name: CleanupIntermediate
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CleanupIntermediate.wdl
+- name: CoverageStats
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
new file mode 100644
index 000000000..39b65309b
--- /dev/null
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -0,0 +1,196 @@
+version 1.0
+
+workflow MosdepthCoverageStats {
+
+    meta {
+        description: "Calculate coverage statistics using mosdepth."
+    }
+    parameter_meta {
+        aligned_bam: "Aligned BAM file."
+        aligned_bai: "Aligned BAM index file."
+        bed_file: "BED file containing regions of interest."
+        bin_length: "Length of bins to use for coverage calculation."
+        preemptible_tries: "Number of times to retry a preempted task."
+    }
+
+    input {
+        File aligned_bam
+        File aligned_bai
+        File? bed_file
+
+        Int bin_length = 1000
+
+        # Runtime parameters
+        Int? preemptible_tries = 3
+    }
+
+    call MosDepthOverBed {
+        input:
+            bam = aligned_bam,
+            bai = aligned_bai,
+            bed = bed_file,
+            bin_length = bin_length,
+            preemptible_tries = preemptible_tries,
+    }
+
+    String basename = basename(aligned_bam, ".bam")
+
+    call CoverageStats {
+        input:
+            mosdepth_regions = MosDepthOverBed.regions,
+            basename_input = basename,
+            preemptible_tries = preemptible_tries,
+    }
+
+    output {
+        File cov_stat_summary_file      = CoverageStats.cov_stat_summary_file
+        Map[String, String] cov_stat_summary = CoverageStats.cov_stat_summary
+    }
+
+}
+
+
+task MosDepthOverBed {
+
+    meta {
+        description: "Calculate coverage using mosdepth."
+    }
+    parameter_meta {
+        bam: "Aligned BAM file."
+        bai: "Aligned BAM index file."
+        bed: "BED file containing regions of interest."
+        bin_length: "Length of bins to use for coverage calculation."
+        preemptible_tries: "Number of times to retry a preempted task."
+    }
+
+    input {
+        File bam
+        File bai
+        File? bed
+        Int bin_length
+
+        # Runtime parameters
+        Int? preemptible_tries = 3
+    }
+
+    String mosdepth_by_region = select_first([bed, bin_length])
+
+    Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
+    String basename = basename(bam, ".bam")
+    String prefix = "~{basename}.coverage_over_bed"
+
+    command {
+        set -euxo pipefail
+
+        # Create symbolic links for bam and bai in the current working directory
+        ln -s ~{bam} ./~{basename}.bam
+        ln -s ~{bai} ./~{basename}.bai
+
+        mosdepth -t 4 -b ~{mosdepth_by_region} -n -x -Q 1 ~{prefix} ./~{basename}.bam
+    }
+
+    output {
+        File global_dist      = "~{prefix}.mosdepth.global.dist.txt"
+        File region_dist      = "~{prefix}.mosdepth.region.dist.txt"
+        File regions          = "~{prefix}.regions.bed.gz"
+        File regions_csi      = "~{prefix}.regions.bed.gz.csi"
+    }
+
+    runtime {
+        cpu:                    4
+        memory:                 8 + " GiB"
+        disks: "local-disk " +  100 + " HDD"
+        preemptible:            preemptible_tries
+        maxRetries:             1
+        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest"
+    }
+}
+
+task CoverageStats {
+
+    meta {
+        description: "Calculate coverage statistics from mosdepth output."
+    }
+    parameter_meta {
+        mosdepth_regions: "Mosdepth output file containing coverage values."
+        cov_col: "Column holding the coverage values."
+        basename_input: "Basename to use for output files."
+        preemptible_tries: "Number of times to retry a preempted task."
+    }
+
+    input {
+        File mosdepth_regions
+        Int cov_col = 4 # column holding the coverage values
+        String? basename_input
+
+        # Runtime parameters
+        Int? preemptible_tries = 3
+    }
+
+    Int disk_size = 2*ceil(size(mosdepth_regions, "GB"))
+    String basename = select_first([basename_input, basename(mosdepth_regions)])
+    String prefix = "~{basename}.coverage_over_bed"
+
+    command <<<
+        set -euxo pipefail
+
+        # Use Datamash to calculate summary statistics
+        zcat ~{mosdepth_regions} | datamash -H \
+        mean ~{cov_col} \
+        q1 ~{cov_col} \
+        median ~{cov_col} \
+        q3 ~{cov_col} \
+        iqr ~{cov_col} \
+        sstdev ~{cov_col} \
+        mad ~{cov_col} > ~{prefix}.cov_stat_summary.txt
+
+        cat ~{prefix}.cov_stat_summary.txt
+
+        # Calculate covrage percentage with greater than 4x coverage
+        total_bases=$(zcat ~{mosdepth_regions} | wc -l)
+        bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l)
+        percent_above_4x=$(python3 -c "print($bases_above_4x/$total_bases)")
+
+        # Append the percentage to the summary tsv file
+        sed -i "1s/$/\tpercent_above_4x/" ~{prefix}.cov_stat_summary.txt
+        sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
+
+        # Remove floating-point numbers suffix from field names
+        sed -i '1s/([0-9]*\.[0-9]*)//g' ~{prefix}.cov_stat_summary.txt
+
+        # Extract field names from the header
+        header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)
+        IFS=$'\t' read -ra field_names <<< "$header"
+
+        # Extract data values
+        data=$(tail -n 1 ~{prefix}.cov_stat_summary.txt)
+        data_values=($(echo "$data" | awk '{ for(i=1; i<=NF; i++) print $i }'))
+
+        # Create the JSON object
+        json="{"
+        for (( i=0; i<${#field_names[@]}; i++ )); do
+            json+="\"${field_names[i]}\":${data_values[i]}"
+            if [[ $i -lt $(( ${#field_names[@]} - 1 )) ]]; then
+                json+=", "
+            fi
+        done
+        json+="}"
+
+        # Write the JSON object to the output file
+        echo "$json" > ~{prefix}.cov_stat_summary.json
+    >>>
+
+    output {
+        File cov_stat_summary_file      = "~{prefix}.cov_stat_summary.txt"
+        Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
+    }
+
+    runtime {
+        cpu:                    2
+        memory:                 4 + " GiB"
+        disks: "local-disk " +  100 + " HDD"
+        preemptible:            preemptible_tries
+        maxRetries:             1
+        docker:                 "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11"
+    }
+}

From b825dc92b211858f7eecc577f1b775076b5c4025 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 17 Aug 2023 14:25:00 -0400
Subject: [PATCH 02/62] added "_coverage" to headers

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 39b65309b..275a816ee 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -146,18 +146,18 @@ task CoverageStats {
 
         cat ~{prefix}.cov_stat_summary.txt
 
+        # Replace floating-point numbers suffix ing field names with '_coverage
+        sed -i '1s/([0-9]*\.[0-9]*)/_coverage/g' ~{prefix}.cov_stat_summary.txt
+
         # Calculate covrage percentage with greater than 4x coverage
         total_bases=$(zcat ~{mosdepth_regions} | wc -l)
         bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l)
         percent_above_4x=$(python3 -c "print($bases_above_4x/$total_bases)")
 
         # Append the percentage to the summary tsv file
-        sed -i "1s/$/\tpercent_above_4x/" ~{prefix}.cov_stat_summary.txt
+        sed -i "1s/$/\tpercent_above_4x_coverage/" ~{prefix}.cov_stat_summary.txt
         sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
 
-        # Remove floating-point numbers suffix from field names
-        sed -i '1s/([0-9]*\.[0-9]*)//g' ~{prefix}.cov_stat_summary.txt
-
         # Extract field names from the header
         header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)
         IFS=$'\t' read -ra field_names <<< "$header"

From fa49a9da5a5609b6cd43717ea3d909efe6fecb4b Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 18 Aug 2023 14:58:36 -0400
Subject: [PATCH 03/62] Rounding to the nearest hundreth and centralized header
 suffix

---
 .../TechAgnostic/Utility/CoverageStats.wdl          | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 275a816ee..f87c5362d 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -126,6 +126,8 @@ task CoverageStats {
         # Runtime parameters
         Int? preemptible_tries = 3
     }
+    Int round = 2
+    String header_suffix = "_coverage"
 
     Int disk_size = 2*ceil(size(mosdepth_regions, "GB"))
     String basename = select_first([basename_input, basename(mosdepth_regions)])
@@ -135,7 +137,7 @@ task CoverageStats {
         set -euxo pipefail
 
         # Use Datamash to calculate summary statistics
-        zcat ~{mosdepth_regions} | datamash -H \
+        zcat ~{mosdepth_regions} | datamash -H -R ~{round} \
         mean ~{cov_col} \
         q1 ~{cov_col} \
         median ~{cov_col} \
@@ -147,17 +149,20 @@ task CoverageStats {
         cat ~{prefix}.cov_stat_summary.txt
 
         # Replace floating-point numbers suffix ing field names with '_coverage
-        sed -i '1s/([0-9]*\.[0-9]*)/_coverage/g' ~{prefix}.cov_stat_summary.txt
+        sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt
 
         # Calculate covrage percentage with greater than 4x coverage
         total_bases=$(zcat ~{mosdepth_regions} | wc -l)
         bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l)
-        percent_above_4x=$(python3 -c "print($bases_above_4x/$total_bases)")
+        percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))")
 
         # Append the percentage to the summary tsv file
-        sed -i "1s/$/\tpercent_above_4x_coverage/" ~{prefix}.cov_stat_summary.txt
+        sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
         sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
 
+        # Round Floats seperated by tab in the second line to the nearest hundredth
+        sed -i '2s/[0-9]*\.[0-9]*/&\n/g' ~{prefix}.cov_stat_summary.txt
+
         # Extract field names from the header
         header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)
         IFS=$'\t' read -ra field_names <<< "$header"

From dffa95ef30680d8c6234278121318349503ca1f8 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 18 Aug 2023 15:35:58 -0400
Subject: [PATCH 04/62] removed old rounding line

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index f87c5362d..6643adf64 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -160,9 +160,6 @@ task CoverageStats {
         sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
         sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
 
-        # Round Floats seperated by tab in the second line to the nearest hundredth
-        sed -i '2s/[0-9]*\.[0-9]*/&\n/g' ~{prefix}.cov_stat_summary.txt
-
         # Extract field names from the header
         header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)
         IFS=$'\t' read -ra field_names <<< "$header"

From be08fcc96dda9ce03c8d45caac5dec4677f8deaf Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 2 May 2024 11:03:34 -0400
Subject: [PATCH 05/62] More parameters to mosdepthoverbed task

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 33 ++++++++++++++-----
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 6643adf64..766c7ba59 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -21,7 +21,7 @@ workflow MosdepthCoverageStats {
         Int bin_length = 1000
 
         # Runtime parameters
-        Int? preemptible_tries = 3
+        Int preemptible_tries = 3
     }
 
     call MosDepthOverBed {
@@ -44,7 +44,7 @@ workflow MosdepthCoverageStats {
 
     output {
         File cov_stat_summary_file      = CoverageStats.cov_stat_summary_file
-        Map[String, String] cov_stat_summary = CoverageStats.cov_stat_summary
+        Map[String, Float] cov_stat_summary = CoverageStats.cov_stat_summary
     }
 
 }
@@ -59,7 +59,11 @@ task MosDepthOverBed {
         bam: "Aligned BAM file."
         bai: "Aligned BAM index file."
         bed: "BED file containing regions of interest."
-        bin_length: "Length of bins to use for coverage calculation."
+        bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored."
+        chrom: "Chromosome to calculate coverage for."
+        no_per_base: "Do not calculate per-base coverage."
+        fast_mode: "Use fast mode."
+        mapq: "Minimum mapping quality to consider."
         preemptible_tries: "Number of times to retry a preempted task."
     }
 
@@ -67,10 +71,14 @@ task MosDepthOverBed {
         File bam
         File bai
         File? bed
-        Int bin_length
+        String? chrom
+        Int? bin_length
+        Boolean no_per_base = true
+        Boolean fast_mode = true
+        Int mapq = 1
 
         # Runtime parameters
-        Int? preemptible_tries = 3
+        Int preemptible_tries = 3
     }
 
     String mosdepth_by_region = select_first([bed, bin_length])
@@ -86,7 +94,14 @@ task MosDepthOverBed {
         ln -s ~{bam} ./~{basename}.bam
         ln -s ~{bai} ./~{basename}.bai
 
-        mosdepth -t 4 -b ~{mosdepth_by_region} -n -x -Q 1 ~{prefix} ./~{basename}.bam
+        mosdepth \
+        ~{true="-n" false="" no_per_base} \
+        ~{true="-x" false="" fast_mode} \
+        -t 4 \
+        ~{"-c " + chrom} \
+        ~{"-b " + mosdepth_by_region} \
+        ~{"-Q " + mapq} \
+        ~{prefix} ./~{basename}.bam
     }
 
     output {
@@ -99,7 +114,7 @@ task MosDepthOverBed {
     runtime {
         cpu:                    4
         memory:                 8 + " GiB"
-        disks: "local-disk " +  100 + " HDD"
+        disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
         maxRetries:             1
         docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest"
@@ -124,7 +139,7 @@ task CoverageStats {
         String? basename_input
 
         # Runtime parameters
-        Int? preemptible_tries = 3
+        Int preemptible_tries = 3
     }
     Int round = 2
     String header_suffix = "_coverage"
@@ -190,7 +205,7 @@ task CoverageStats {
     runtime {
         cpu:                    2
         memory:                 4 + " GiB"
-        disks: "local-disk " +  100 + " HDD"
+        disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
         maxRetries:             1
         docker:                 "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11"

From 9598e346b83e2ba30937507936d2434f86ba34e9 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 9 May 2024 17:02:56 -0400
Subject: [PATCH 06/62] More parameters to mosdepthoverbed task, Added evenness
 score to CoverageStats

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 766c7ba59..9f937ee11 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -64,6 +64,7 @@ task MosDepthOverBed {
         no_per_base: "Do not calculate per-base coverage."
         fast_mode: "Use fast mode."
         mapq: "Minimum mapping quality to consider."
+        thresholds: "Comma-separated list of thresholds to use for coverage calculation. (e.g. 1,10,20,30)."
         preemptible_tries: "Number of times to retry a preempted task."
     }
 
@@ -73,6 +74,7 @@ task MosDepthOverBed {
         File? bed
         String? chrom
         Int? bin_length
+        String? thresholds
         Boolean no_per_base = true
         Boolean fast_mode = true
         Int mapq = 1
@@ -101,6 +103,7 @@ task MosDepthOverBed {
         ~{"-c " + chrom} \
         ~{"-b " + mosdepth_by_region} \
         ~{"-Q " + mapq} \
+        ~{"-T " + thresholds} \
         ~{prefix} ./~{basename}.bam
     }
 
@@ -166,7 +169,9 @@ task CoverageStats {
         # Replace floating-point numbers suffix ing field names with '_coverage
         sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt
 
+        ###
         # Calculate covrage percentage with greater than 4x coverage
+        ###
         total_bases=$(zcat ~{mosdepth_regions} | wc -l)
         bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l)
         percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))")
@@ -175,6 +180,38 @@ task CoverageStats {
         sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
         sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
 
+        ###
+        # Calculate Evenness Score
+        # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS.
+        # https://www.nature.com/articles/jhg201621
+        ###
+
+        # get mean from the summary file, under the column 'mean_coverage'
+        mean_coverage=$(awk -F"\t" '{for(i=1; i<=NF; i++) if ($i == "mean_coverage") {getline; getline; printf "%.0f", $(i); exit}}' ~{prefix}.cov_stat_summary.txt)
+        coverage_count=$(zcat ~{mosdepth_regions} | wc -l)
+
+        # get list of coverages that are less than the mean coverage from the mosdepth output file
+        D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}'))
+        D2_count=${#D2[@]}
+        D2_sum=$(IFS=+; echo "$((${D2[*]}))")
+
+        # print the values for debugging
+        echo "mean_coverage: $mean_coverage"
+        echo "coverage_count: $coverage_count"
+        echo "D2_count: $D2_count"
+        echo "D2_sum: $D2_sum"
+
+        # calculate the evenness score
+        evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count)
+
+        # Append the evenness score to the summary tsv file
+        sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
+        sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt
+
+        ###
+        # Convert the summary statistics to a JSON object
+        ###
+
         # Extract field names from the header
         header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)
         IFS=$'\t' read -ra field_names <<< "$header"

From 24e0a3f64ae1d3cf3de5332fc23d61ca7b7e1fc2 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 10 May 2024 09:35:51 -0400
Subject: [PATCH 07/62] Added subheaders in CoverageStats task command, use bc
 to get D2_sum, if statement to avoid evenness score if mean is 0

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 9f937ee11..9b7e3942d 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -169,9 +169,12 @@ task CoverageStats {
         # Replace floating-point numbers suffix ing field names with '_coverage
         sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt
 
-        ###
+        ############################################################
         # Calculate covrage percentage with greater than 4x coverage
-        ###
+        ############################################################
+
+        echo "Calculating coverage percentage with greater than 4x coverage"
+
         total_bases=$(zcat ~{mosdepth_regions} | wc -l)
         bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l)
         percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))")
@@ -180,11 +183,13 @@ task CoverageStats {
         sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
         sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
 
-        ###
+        ############################################################
         # Calculate Evenness Score
         # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS.
         # https://www.nature.com/articles/jhg201621
-        ###
+        ############################################################
+
+        echo "Calculating Evenness Score"
 
         # get mean from the summary file, under the column 'mean_coverage'
         mean_coverage=$(awk -F"\t" '{for(i=1; i<=NF; i++) if ($i == "mean_coverage") {getline; getline; printf "%.0f", $(i); exit}}' ~{prefix}.cov_stat_summary.txt)
@@ -193,7 +198,7 @@ task CoverageStats {
         # get list of coverages that are less than the mean coverage from the mosdepth output file
         D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}'))
         D2_count=${#D2[@]}
-        D2_sum=$(IFS=+; echo "$((${D2[*]}))")
+        D2_sum=$(echo "${D2[@]}" | tr ' ' '+' | bc)
 
         # print the values for debugging
         echo "mean_coverage: $mean_coverage"
@@ -201,16 +206,22 @@ task CoverageStats {
         echo "D2_count: $D2_count"
         echo "D2_sum: $D2_sum"
 
-        # calculate the evenness score
-        evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count)
+        if [ $mean_coverage -ne 0 ]; then
+            # calculate the evenness score
+            evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count)
 
-        # Append the evenness score to the summary tsv file
-        sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
-        sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt
+            # Append the evenness score to the summary tsv file
+            sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
+            sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt
+        else
+            echo "mean_coverage is 0, cannot calculate evenness score."
+        fi
 
-        ###
+        ############################################################
         # Convert the summary statistics to a JSON object
-        ###
+        ############################################################
+
+        echo "Converting summary statistics to JSON object"
 
         # Extract field names from the header
         header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)

From d33b8cce20dbb23a171fd2ee13d0f6c9c8ae0a39 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 10 May 2024 10:14:48 -0400
Subject: [PATCH 08/62] use awk to get D2_sum

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 9b7e3942d..c62bcf64b 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -198,7 +198,7 @@ task CoverageStats {
         # get list of coverages that are less than the mean coverage from the mosdepth output file
         D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}'))
         D2_count=${#D2[@]}
-        D2_sum=$(echo "${D2[@]}" | tr ' ' '+' | bc)
+        D2_sum=$(echo "$D2" | tr ' ' '\n' | awk '{sum+=$1};END{print sum}')
 
         # print the values for debugging
         echo "mean_coverage: $mean_coverage"

From f5663824f249209a400fbf428b9829ce1094efa9 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 10 May 2024 11:00:55 -0400
Subject: [PATCH 09/62] updated awk command to get D2_sum

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index c62bcf64b..514d79cf5 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -198,7 +198,7 @@ task CoverageStats {
         # get list of coverages that are less than the mean coverage from the mosdepth output file
         D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}'))
         D2_count=${#D2[@]}
-        D2_sum=$(echo "$D2" | tr ' ' '\n' | awk '{sum+=$1};END{print sum}')
+        D2_sum=$(printf "%s\n" "${D2[@]}" | awk 'BEGIN {sum = 0} {sum += $1} END {print sum}')
 
         # print the values for debugging
         echo "mean_coverage: $mean_coverage"
@@ -208,7 +208,7 @@ task CoverageStats {
 
         if [ $mean_coverage -ne 0 ]; then
             # calculate the evenness score
-            evenness_score=$(1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count)
+            evenness_score=$((1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count))
 
             # Append the evenness score to the summary tsv file
             sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt

From 9e42b15c0e370446f2977b66243e958255382a3a Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 10 May 2024 11:33:51 -0400
Subject: [PATCH 10/62] updated awk command to get D2_sum with $

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 514d79cf5..70e2867ae 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -208,7 +208,7 @@ task CoverageStats {
 
         if [ $mean_coverage -ne 0 ]; then
             # calculate the evenness score
-            evenness_score=$((1 - ((D2_count - D2_sum)/mean_coverage)/coverage_count))
+            evenness_score=$(( 1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count ))
 
             # Append the evenness score to the summary tsv file
             sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt

From 8d47439645037a7d2fd234f52e1c5bcdc3929662 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 10 May 2024 12:21:41 -0400
Subject: [PATCH 11/62] using python for evenness calculation because awk bash
 doesn't support float arithmetic

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 70e2867ae..2c46f1a4b 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -208,7 +208,7 @@ task CoverageStats {
 
         if [ $mean_coverage -ne 0 ]; then
             # calculate the evenness score
-            evenness_score=$(( 1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count ))
+            evenness_score=$(python3 -c "print(1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count)")
 
             # Append the evenness score to the summary tsv file
             sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt

From a6df7de712e326859e60b3fb779dc06079dfb77e Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 16 May 2024 09:03:26 -0400
Subject: [PATCH 12/62] fixed e score formula

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 2c46f1a4b..a72550659 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -196,7 +196,7 @@ task CoverageStats {
         coverage_count=$(zcat ~{mosdepth_regions} | wc -l)
 
         # get list of coverages that are less than the mean coverage from the mosdepth output file
-        D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} < mean) print $~{cov_col}}'))
+        D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} <= mean) print $~{cov_col}}'))
         D2_count=${#D2[@]}
         D2_sum=$(printf "%s\n" "${D2[@]}" | awk 'BEGIN {sum = 0} {sum += $1} END {print sum}')
 
@@ -208,7 +208,7 @@ task CoverageStats {
 
         if [ $mean_coverage -ne 0 ]; then
             # calculate the evenness score
-            evenness_score=$(python3 -c "print(1 - (($D2_count - $D2_sum)/$mean_coverage)/$coverage_count)")
+            evenness_score=$(python3 -c "print(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count)")
 
             # Append the evenness score to the summary tsv file
             sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt

From 93ba31a13825076b6ed109c63211a9837e2e9502 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 16 May 2024 10:27:55 -0400
Subject: [PATCH 13/62] round e-score to hundredth

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index a72550659..3952fadd4 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -208,7 +208,7 @@ task CoverageStats {
 
         if [ $mean_coverage -ne 0 ]; then
             # calculate the evenness score
-            evenness_score=$(python3 -c "print(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count)")
+            evenness_score=$(python3 -c "print(round(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count, 2))")
 
             # Append the evenness score to the summary tsv file
             sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt

From 310f66fdb3ec98106b61c729eaec386e93673c50 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 23 May 2024 10:03:29 -0400
Subject: [PATCH 14/62] removed `mosdepth_by_region` intermediate variable

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 3952fadd4..06718392f 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -83,8 +83,6 @@ task MosDepthOverBed {
         Int preemptible_tries = 3
     }
 
-    String mosdepth_by_region = select_first([bed, bin_length])
-
     Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
@@ -101,7 +99,7 @@ task MosDepthOverBed {
         ~{true="-x" false="" fast_mode} \
         -t 4 \
         ~{"-c " + chrom} \
-        ~{"-b " + mosdepth_by_region} \
+        ~{"-b " + select_first([bed, bin_length])} \
         ~{"-Q " + mapq} \
         ~{"-T " + thresholds} \
         ~{prefix} ./~{basename}.bam

From 8d6fba6c47363993e58bc66163726b9bf32eb0fb Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 23 May 2024 11:07:00 -0400
Subject: [PATCH 15/62] threads option added

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 06718392f..9dc91c6cb 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -72,6 +72,7 @@ task MosDepthOverBed {
         File bam
         File bai
         File? bed
+        Int? threads = 4
         String? chrom
         Int? bin_length
         String? thresholds
@@ -97,7 +98,7 @@ task MosDepthOverBed {
         mosdepth \
         ~{true="-n" false="" no_per_base} \
         ~{true="-x" false="" fast_mode} \
-        -t 4 \
+        -t ~{threads} \
         ~{"-c " + chrom} \
         ~{"-b " + select_first([bed, bin_length])} \
         ~{"-Q " + mapq} \

From 5cf9f47c192dd164b660b9ad75d708eea1d0de30 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 23 May 2024 11:08:40 -0400
Subject: [PATCH 16/62] minor fix

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 9dc91c6cb..a0b9e311b 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -72,7 +72,7 @@ task MosDepthOverBed {
         File bam
         File bai
         File? bed
-        Int? threads = 4
+        Int threads = 4
         String? chrom
         Int? bin_length
         String? thresholds

From f3f6dd6e9d87031bbc576b53a5958045b3d737d0 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 23 Jul 2024 15:34:35 -0400
Subject: [PATCH 17/62] if mean zero, evenness score is none

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index a0b9e311b..9b4c4b110 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -182,6 +182,8 @@ task CoverageStats {
         sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
         sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
 
+        cat ~{prefix}.cov_stat_summary.txt
+
         ############################################################
         # Calculate Evenness Score
         # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS.
@@ -214,8 +216,13 @@ task CoverageStats {
             sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt
         else
             echo "mean_coverage is 0, cannot calculate evenness score."
+            # Set evenness score to None
+            sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
+            sed -i "2s/$/\tNone/" ~{prefix}.cov_stat_summary.txt
         fi
 
+        cat ~{prefix}.cov_stat_summary.txt
+
         ############################################################
         # Convert the summary statistics to a JSON object
         ############################################################

From fe5254374123ba7110104a6beae3ae363b8ed65f Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 23 Jul 2024 16:51:14 -0400
Subject: [PATCH 18/62] graceful exit if datamash calculation failed

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 9b4c4b110..e4e2770aa 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -165,6 +165,12 @@ task CoverageStats {
 
         cat ~{prefix}.cov_stat_summary.txt
 
+        wc_sum=$(wc -l ~{prefix}.cov_stat_summary.txt | awk '{print $1}')
+        if [ "$wc_sum" -lt 2 ]; then
+            echo "Error calculating summary statistics. datamash may not have enough data."
+            exit 1
+        fi
+
         # Replace floating-point numbers suffix ing field names with '_coverage
         sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt
 

From d15d9f7773288f0ac8fc64150790b764e6745493 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 24 Jul 2024 11:27:05 -0400
Subject: [PATCH 19/62] coverage evenness gets 'null' if unable to calculate

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index e4e2770aa..44695c1d1 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -224,7 +224,7 @@ task CoverageStats {
             echo "mean_coverage is 0, cannot calculate evenness score."
             # Set evenness score to None
             sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
-            sed -i "2s/$/\tNone/" ~{prefix}.cov_stat_summary.txt
+            sed -i "2s/$/\tnull/" ~{prefix}.cov_stat_summary.txt
         fi
 
         cat ~{prefix}.cov_stat_summary.txt

From 3118aa7428f86307cd8fec15862312c6815982fb Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 24 Jul 2024 17:55:03 -0400
Subject: [PATCH 20/62] transformed shell script to python and added to
 mosdepth Dockerfile

---
 docker/lr-mosdepth/Dockerfile                 |   3 +
 docker/lr-mosdepth/coverage_stats.py          | 157 ++++++++++++++++++
 .../TechAgnostic/Utility/CoverageStats.wdl    | 114 +------------
 3 files changed, 168 insertions(+), 106 deletions(-)
 create mode 100644 docker/lr-mosdepth/coverage_stats.py

diff --git a/docker/lr-mosdepth/Dockerfile b/docker/lr-mosdepth/Dockerfile
index 9a5acff9d..b72e5468c 100644
--- a/docker/lr-mosdepth/Dockerfile
+++ b/docker/lr-mosdepth/Dockerfile
@@ -18,3 +18,6 @@ RUN curl https://sdk.cloud.google.com | bash
 
 # activate conda environment
 RUN echo "source activate lr-mosdepth" > ~/.bashrc
+
+# copy the script
+COPY coverage_stats.py /
diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py
new file mode 100644
index 000000000..97ff05a80
--- /dev/null
+++ b/docker/lr-mosdepth/coverage_stats.py
@@ -0,0 +1,157 @@
+import gzip
+import pandas as pd
+import numpy as np
+import json
+import argparse
+import logging
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Process coverage values.')
+    parser.add_argument('mosdepth_regions', type=str,
+                        help='Mosdepth output file containing coverage values.')
+    parser.add_argument('--cov_col', type=int, default=4,
+                        help='Column holding the coverage values.')
+    parser.add_argument('--output_prefix', type=str,
+                        help='Prefix to use for output files.')
+    parser.add_argument('--round', type=int, default=2, help='Rounding precision.')
+    parser.add_argument('--debug', action='store_true', help='Debug mode.')
+    return parser.parse_args()
+
+
+def calculate_summary_statistics(df, cov_col, round_precision):
+    """
+    Calculate summary statistics for the coverage values.
+    @param df: Dataframe containing coverage values
+    @param cov_col: Column containing coverage values
+    @param round_precision: Rounding precision
+    @return:
+    """
+
+    cov_data = df.iloc[:, cov_col - 1]
+    mean_val = cov_data.mean()
+    mad_val = np.mean(np.abs(cov_data - mean_val))
+
+    statistics = {
+        'mean': round(mean_val, round_precision),
+        'q1': round(cov_data.quantile(0.25), round_precision),
+        'median': round(cov_data.median(), round_precision),
+        'q3': round(cov_data.quantile(0.75), round_precision),
+        'iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25),
+                     round_precision),
+        'sstdev': round(cov_data.std(), round_precision),
+        'mad': round(mad_val, round_precision)
+    }
+    return statistics
+
+
+def percentage_greater_than_4x(df, cov_col, round_precision):
+    """
+    Calculate the percentage of coverage values greater than 4x.
+    @param df: Dataframe containing coverage values
+    @param cov_col: Column containing coverage values
+    @param round_precision: Rounding precision
+    @return:
+    """
+    logging.debug("Calculating percentage of coverage values greater than 4x")
+    total_bases = len(df)
+    bases_above_4x = df[df.iloc[:, cov_col - 1] > 4].shape[0]
+    percent_above_4x = round(bases_above_4x / total_bases, round_precision)
+
+    return percent_above_4x
+
+
+def calculate_evenness_score(df, cov_col, round_precision):
+    """
+    Calculate the evenness score.
+    @param df: Dataframe containing coverage values
+    @param cov_col: Column containing coverage values
+    @param round_precision: Rounding precision
+    @return:
+    """
+    logging.debug("Calculating evenness score")
+    mean_coverage = df.iloc[:, cov_col - 1].mean()
+    # Get the coverage values that are less than or equal to the mean coverage
+    D2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
+    # count of coverage values that are less than or equal to the mean coverage
+    D2_count = len(D2)
+    # sum of coverage values that are less than or equal to the mean coverage
+    D2_sum = sum(D2)
+    # total number of coverage
+    coverage_count = len(df)
+
+    logging.debug("Mean coverage: %s", mean_coverage)
+    logging.debug("D2 count: %s", D2_count)
+    logging.debug("D2 sum: %s", D2_sum)
+    logging.debug("Coverage count: %s", coverage_count)
+
+    if mean_coverage != 0:
+        evenness_score = round(1 - (D2_count - (D2_sum / mean_coverage)) / coverage_count, round_precision)
+    else:
+        logging.warning("Mean coverage is zero. Evenness score will be set to null.")
+        evenness_score = 'null'
+
+    return evenness_score
+
+
+def open_file(file_path):
+    """
+    Open a file and return a dataframe
+    @param file_path: Path to the file
+    @return:
+    """
+
+    if file_path.endswith('.gz'):
+        with gzip.open(file_path, 'rt') as f:
+            df = pd.read_csv(f, sep='\t', header=None)
+    else:
+        with open(file_path, 'rt') as f:
+            df = pd.read_csv(f, sep='\t', header=None)
+
+    return df
+
+
+def main():
+    args = parse_arguments()
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    logging.info("Arguments: %s", args)
+    logging.info("Calculating coverage statistics")
+
+    prefix = args.output_prefix if args.output_prefix else args.mosdepth_regions.split('.')[0]
+
+    df: pd.DataFrame = open_file(args.mosdepth_regions)
+    logging.info("Opened file: %s", args.mosdepth_regions)
+
+    logging.debug("Dataframe shape: %s", df.shape)
+    logging.debug("Dataframe columns: %s", df.columns)
+    logging.debug(f"Dataframe Head\n{df.head()}")
+
+    statistics = calculate_summary_statistics(df, args.cov_col, args.round)
+
+    logging.debug("Summary statistics: %s", statistics)
+
+    percent_above_4x = percentage_greater_than_4x(df, args.cov_col, args.round)
+    statistics['percent_above_4x'] = percent_above_4x
+
+    logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x)
+
+    evenness_score = calculate_evenness_score(df, args.cov_col, args.round)
+    statistics['evenness_score'] = evenness_score
+
+    logging.info("Evenness score: %s", evenness_score)
+
+    logging.info("Summary statistics: %s", statistics)
+
+    summary_file = f"{prefix}.cov_stat_summary.json"
+
+    logging.info("Writing summary statistics to file: %s", summary_file)
+    with open(summary_file, 'w') as f:
+        json.dump(statistics, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 44695c1d1..75b260f7a 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -150,115 +150,17 @@ task CoverageStats {
     String basename = select_first([basename_input, basename(mosdepth_regions)])
     String prefix = "~{basename}.coverage_over_bed"
 
-    command <<<
+    command {
         set -euxo pipefail
 
-        # Use Datamash to calculate summary statistics
-        zcat ~{mosdepth_regions} | datamash -H -R ~{round} \
-        mean ~{cov_col} \
-        q1 ~{cov_col} \
-        median ~{cov_col} \
-        q3 ~{cov_col} \
-        iqr ~{cov_col} \
-        sstdev ~{cov_col} \
-        mad ~{cov_col} > ~{prefix}.cov_stat_summary.txt
-
-        cat ~{prefix}.cov_stat_summary.txt
-
-        wc_sum=$(wc -l ~{prefix}.cov_stat_summary.txt | awk '{print $1}')
-        if [ "$wc_sum" -lt 2 ]; then
-            echo "Error calculating summary statistics. datamash may not have enough data."
-            exit 1
-        fi
-
-        # Replace floating-point numbers suffix ing field names with '_coverage
-        sed -i '1s/([0-9]*\.[0-9]*)/~{header_suffix}/g' ~{prefix}.cov_stat_summary.txt
-
-        ############################################################
-        # Calculate covrage percentage with greater than 4x coverage
-        ############################################################
-
-        echo "Calculating coverage percentage with greater than 4x coverage"
-
-        total_bases=$(zcat ~{mosdepth_regions} | wc -l)
-        bases_above_4x=$(zcat ~{mosdepth_regions} | awk -v cov_col=~{cov_col} '$cov_col > 4' | wc -l)
-        percent_above_4x=$(python3 -c "print(round($bases_above_4x/$total_bases, ~{round}))")
-
-        # Append the percentage to the summary tsv file
-        sed -i "1s/$/\tpercent_above_4x~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
-        sed -i "2s/$/\t$percent_above_4x/" ~{prefix}.cov_stat_summary.txt
-
-        cat ~{prefix}.cov_stat_summary.txt
-
-        ############################################################
-        # Calculate Evenness Score
-        # Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS.
-        # https://www.nature.com/articles/jhg201621
-        ############################################################
-
-        echo "Calculating Evenness Score"
-
-        # get mean from the summary file, under the column 'mean_coverage'
-        mean_coverage=$(awk -F"\t" '{for(i=1; i<=NF; i++) if ($i == "mean_coverage") {getline; getline; printf "%.0f", $(i); exit}}' ~{prefix}.cov_stat_summary.txt)
-        coverage_count=$(zcat ~{mosdepth_regions} | wc -l)
-
-        # get list of coverages that are less than the mean coverage from the mosdepth output file
-        D2=($(zcat ~{mosdepth_regions} | awk -v mean=$mean_coverage '{if ($~{cov_col} <= mean) print $~{cov_col}}'))
-        D2_count=${#D2[@]}
-        D2_sum=$(printf "%s\n" "${D2[@]}" | awk 'BEGIN {sum = 0} {sum += $1} END {print sum}')
-
-        # print the values for debugging
-        echo "mean_coverage: $mean_coverage"
-        echo "coverage_count: $coverage_count"
-        echo "D2_count: $D2_count"
-        echo "D2_sum: $D2_sum"
-
-        if [ $mean_coverage -ne 0 ]; then
-            # calculate the evenness score
-            evenness_score=$(python3 -c "print(round(1 - ($D2_count - ($D2_sum/$mean_coverage))/$coverage_count, 2))")
-
-            # Append the evenness score to the summary tsv file
-            sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
-            sed -i "2s/$/\t$evenness_score/" ~{prefix}.cov_stat_summary.txt
-        else
-            echo "mean_coverage is 0, cannot calculate evenness score."
-            # Set evenness score to None
-            sed -i "1s/$/\tevenness_score~{header_suffix}/" ~{prefix}.cov_stat_summary.txt
-            sed -i "2s/$/\tnull/" ~{prefix}.cov_stat_summary.txt
-        fi
-
-        cat ~{prefix}.cov_stat_summary.txt
-
-        ############################################################
-        # Convert the summary statistics to a JSON object
-        ############################################################
-
-        echo "Converting summary statistics to JSON object"
-
-        # Extract field names from the header
-        header=$(head -n 1 ~{prefix}.cov_stat_summary.txt)
-        IFS=$'\t' read -ra field_names <<< "$header"
-
-        # Extract data values
-        data=$(tail -n 1 ~{prefix}.cov_stat_summary.txt)
-        data_values=($(echo "$data" | awk '{ for(i=1; i<=NF; i++) print $i }'))
-
-        # Create the JSON object
-        json="{"
-        for (( i=0; i<${#field_names[@]}; i++ )); do
-            json+="\"${field_names[i]}\":${data_values[i]}"
-            if [[ $i -lt $(( ${#field_names[@]} - 1 )) ]]; then
-                json+=", "
-            fi
-        done
-        json+="}"
-
-        # Write the JSON object to the output file
-        echo "$json" > ~{prefix}.cov_stat_summary.json
-    >>>
+        python3 coverage_stats.py \
+        --mosdepth_regions ~{mosdepth_regions} \
+        --cov_col ~{cov_col} \
+        --round ~{round} \
+        --prefix ~{prefix}
+    }
 
     output {
-        File cov_stat_summary_file      = "~{prefix}.cov_stat_summary.txt"
         Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
     }
 
@@ -268,6 +170,6 @@ task CoverageStats {
         disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
         maxRetries:             1
-        docker:                 "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11"
+        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest"
     }
 }

From 2d1e64ec7de65a7bf1faa97b230748cd17169d85 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 26 Jul 2024 11:17:38 -0400
Subject: [PATCH 21/62] fixed coverage_stats.py option names

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 75b260f7a..e0927845b 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -154,13 +154,14 @@ task CoverageStats {
         set -euxo pipefail
 
         python3 coverage_stats.py \
-        --mosdepth_regions ~{mosdepth_regions} \
         --cov_col ~{cov_col} \
         --round ~{round} \
-        --prefix ~{prefix}
+        --output_prefix ~{prefix} \
+        ~{mosdepth_regions}
     }
 
     output {
+        File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json"
         Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
     }
 

From 9ce9549ba3bccf94cc87efcc307e7a98cb2e482f Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 26 Jul 2024 11:17:59 -0400
Subject: [PATCH 22/62] changed column names

---
 docker/lr-mosdepth/coverage_stats.py | 40 +++++++++++++++-------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py
index 97ff05a80..d0282c11d 100644
--- a/docker/lr-mosdepth/coverage_stats.py
+++ b/docker/lr-mosdepth/coverage_stats.py
@@ -1,4 +1,6 @@
 import gzip
+
+import pandas
 import pandas as pd
 import numpy as np
 import json
@@ -28,19 +30,19 @@ def calculate_summary_statistics(df, cov_col, round_precision):
     @return:
     """
 
-    cov_data = df.iloc[:, cov_col - 1]
-    mean_val = cov_data.mean()
-    mad_val = np.mean(np.abs(cov_data - mean_val))
+    cov_data: pandas.Series = df.iloc[:, cov_col - 1]
+    mean_val: float = cov_data.mean()
+    mad_val: float = np.mean(np.abs(cov_data - mean_val))
 
     statistics = {
-        'mean': round(mean_val, round_precision),
-        'q1': round(cov_data.quantile(0.25), round_precision),
-        'median': round(cov_data.median(), round_precision),
-        'q3': round(cov_data.quantile(0.75), round_precision),
-        'iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25),
+        'cov_mean': round(mean_val, round_precision),
+        'cov_q1': round(cov_data.quantile(0.25), round_precision),
+        'cov_median': round(cov_data.median(), round_precision),
+        'cov_q3': round(cov_data.quantile(0.75), round_precision),
+        'cov_iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25),
                      round_precision),
-        'sstdev': round(cov_data.std(), round_precision),
-        'mad': round(mad_val, round_precision)
+        'cov_sstdev': round(cov_data.std(), round_precision),
+        'cov_mad': round(mad_val, round_precision)
     }
     return statistics
 
@@ -72,21 +74,21 @@ def calculate_evenness_score(df, cov_col, round_precision):
     logging.debug("Calculating evenness score")
     mean_coverage = df.iloc[:, cov_col - 1].mean()
     # Get the coverage values that are less than or equal to the mean coverage
-    D2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
+    d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
     # count of coverage values that are less than or equal to the mean coverage
-    D2_count = len(D2)
+    d2_count = len(d2)
     # sum of coverage values that are less than or equal to the mean coverage
-    D2_sum = sum(D2)
-    # total number of coverage
+    d2_sum = sum(d2)
+    # total number of coverages
     coverage_count = len(df)
 
     logging.debug("Mean coverage: %s", mean_coverage)
-    logging.debug("D2 count: %s", D2_count)
-    logging.debug("D2 sum: %s", D2_sum)
+    logging.debug("D2 count: %s", d2_count)
+    logging.debug("D2 sum: %s", d2_sum)
     logging.debug("Coverage count: %s", coverage_count)
 
     if mean_coverage != 0:
-        evenness_score = round(1 - (D2_count - (D2_sum / mean_coverage)) / coverage_count, round_precision)
+        evenness_score = round(1 - (d2_count - (d2_sum / mean_coverage)) / coverage_count, round_precision)
     else:
         logging.warning("Mean coverage is zero. Evenness score will be set to null.")
         evenness_score = 'null'
@@ -135,12 +137,12 @@ def main():
     logging.debug("Summary statistics: %s", statistics)
 
     percent_above_4x = percentage_greater_than_4x(df, args.cov_col, args.round)
-    statistics['percent_above_4x'] = percent_above_4x
+    statistics['cov_percent_above_4x'] = percent_above_4x
 
     logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x)
 
     evenness_score = calculate_evenness_score(df, args.cov_col, args.round)
-    statistics['evenness_score'] = evenness_score
+    statistics['cov_evenness_score'] = evenness_score
 
     logging.info("Evenness score: %s", evenness_score)
 

From 49445d8173755856bd792344498a5ff79206dc65 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 26 Jul 2024 14:10:58 -0400
Subject: [PATCH 23/62] new line eof for output file

---
 docker/lr-mosdepth/coverage_stats.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py
index d0282c11d..733979e38 100644
--- a/docker/lr-mosdepth/coverage_stats.py
+++ b/docker/lr-mosdepth/coverage_stats.py
@@ -153,6 +153,7 @@ def main():
     logging.info("Writing summary statistics to file: %s", summary_file)
     with open(summary_file, 'w') as f:
         json.dump(statistics, f)
+        f.write('\n')
 
 
 if __name__ == "__main__":

From 6431ff2106bf51c5edc36db94a75d556a56a3446 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Fri, 26 Jul 2024 14:22:37 -0400
Subject: [PATCH 24/62] moved changes to bs_coverage_stats_script_mosdepth
 branch

---
 docker/lr-mosdepth/Dockerfile        |   3 -
 docker/lr-mosdepth/coverage_stats.py | 160 ---------------------------
 2 files changed, 163 deletions(-)
 delete mode 100644 docker/lr-mosdepth/coverage_stats.py

diff --git a/docker/lr-mosdepth/Dockerfile b/docker/lr-mosdepth/Dockerfile
index b72e5468c..9a5acff9d 100644
--- a/docker/lr-mosdepth/Dockerfile
+++ b/docker/lr-mosdepth/Dockerfile
@@ -18,6 +18,3 @@ RUN curl https://sdk.cloud.google.com | bash
 
 # activate conda environment
 RUN echo "source activate lr-mosdepth" > ~/.bashrc
-
-# copy the script
-COPY coverage_stats.py /
diff --git a/docker/lr-mosdepth/coverage_stats.py b/docker/lr-mosdepth/coverage_stats.py
deleted file mode 100644
index 733979e38..000000000
--- a/docker/lr-mosdepth/coverage_stats.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import gzip
-
-import pandas
-import pandas as pd
-import numpy as np
-import json
-import argparse
-import logging
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description='Process coverage values.')
-    parser.add_argument('mosdepth_regions', type=str,
-                        help='Mosdepth output file containing coverage values.')
-    parser.add_argument('--cov_col', type=int, default=4,
-                        help='Column holding the coverage values.')
-    parser.add_argument('--output_prefix', type=str,
-                        help='Prefix to use for output files.')
-    parser.add_argument('--round', type=int, default=2, help='Rounding precision.')
-    parser.add_argument('--debug', action='store_true', help='Debug mode.')
-    return parser.parse_args()
-
-
-def calculate_summary_statistics(df, cov_col, round_precision):
-    """
-    Calculate summary statistics for the coverage values.
-    @param df: Dataframe containing coverage values
-    @param cov_col: Column containing coverage values
-    @param round_precision: Rounding precision
-    @return:
-    """
-
-    cov_data: pandas.Series = df.iloc[:, cov_col - 1]
-    mean_val: float = cov_data.mean()
-    mad_val: float = np.mean(np.abs(cov_data - mean_val))
-
-    statistics = {
-        'cov_mean': round(mean_val, round_precision),
-        'cov_q1': round(cov_data.quantile(0.25), round_precision),
-        'cov_median': round(cov_data.median(), round_precision),
-        'cov_q3': round(cov_data.quantile(0.75), round_precision),
-        'cov_iqr': round(cov_data.quantile(0.75) - cov_data.quantile(0.25),
-                     round_precision),
-        'cov_sstdev': round(cov_data.std(), round_precision),
-        'cov_mad': round(mad_val, round_precision)
-    }
-    return statistics
-
-
-def percentage_greater_than_4x(df, cov_col, round_precision):
-    """
-    Calculate the percentage of coverage values greater than 4x.
-    @param df: Dataframe containing coverage values
-    @param cov_col: Column containing coverage values
-    @param round_precision: Rounding precision
-    @return:
-    """
-    logging.debug("Calculating percentage of coverage values greater than 4x")
-    total_bases = len(df)
-    bases_above_4x = df[df.iloc[:, cov_col - 1] > 4].shape[0]
-    percent_above_4x = round(bases_above_4x / total_bases, round_precision)
-
-    return percent_above_4x
-
-
-def calculate_evenness_score(df, cov_col, round_precision):
-    """
-    Calculate the evenness score.
-    @param df: Dataframe containing coverage values
-    @param cov_col: Column containing coverage values
-    @param round_precision: Rounding precision
-    @return:
-    """
-    logging.debug("Calculating evenness score")
-    mean_coverage = df.iloc[:, cov_col - 1].mean()
-    # Get the coverage values that are less than or equal to the mean coverage
-    d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
-    # count of coverage values that are less than or equal to the mean coverage
-    d2_count = len(d2)
-    # sum of coverage values that are less than or equal to the mean coverage
-    d2_sum = sum(d2)
-    # total number of coverages
-    coverage_count = len(df)
-
-    logging.debug("Mean coverage: %s", mean_coverage)
-    logging.debug("D2 count: %s", d2_count)
-    logging.debug("D2 sum: %s", d2_sum)
-    logging.debug("Coverage count: %s", coverage_count)
-
-    if mean_coverage != 0:
-        evenness_score = round(1 - (d2_count - (d2_sum / mean_coverage)) / coverage_count, round_precision)
-    else:
-        logging.warning("Mean coverage is zero. Evenness score will be set to null.")
-        evenness_score = 'null'
-
-    return evenness_score
-
-
-def open_file(file_path):
-    """
-    Open a file and return a dataframe
-    @param file_path: Path to the file
-    @return:
-    """
-
-    if file_path.endswith('.gz'):
-        with gzip.open(file_path, 'rt') as f:
-            df = pd.read_csv(f, sep='\t', header=None)
-    else:
-        with open(file_path, 'rt') as f:
-            df = pd.read_csv(f, sep='\t', header=None)
-
-    return df
-
-
-def main():
-    args = parse_arguments()
-    if args.debug:
-        logging.basicConfig(level=logging.DEBUG)
-    else:
-        logging.basicConfig(level=logging.INFO)
-
-    logging.info("Arguments: %s", args)
-    logging.info("Calculating coverage statistics")
-
-    prefix = args.output_prefix if args.output_prefix else args.mosdepth_regions.split('.')[0]
-
-    df: pd.DataFrame = open_file(args.mosdepth_regions)
-    logging.info("Opened file: %s", args.mosdepth_regions)
-
-    logging.debug("Dataframe shape: %s", df.shape)
-    logging.debug("Dataframe columns: %s", df.columns)
-    logging.debug(f"Dataframe Head\n{df.head()}")
-
-    statistics = calculate_summary_statistics(df, args.cov_col, args.round)
-
-    logging.debug("Summary statistics: %s", statistics)
-
-    percent_above_4x = percentage_greater_than_4x(df, args.cov_col, args.round)
-    statistics['cov_percent_above_4x'] = percent_above_4x
-
-    logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x)
-
-    evenness_score = calculate_evenness_score(df, args.cov_col, args.round)
-    statistics['cov_evenness_score'] = evenness_score
-
-    logging.info("Evenness score: %s", evenness_score)
-
-    logging.info("Summary statistics: %s", statistics)
-
-    summary_file = f"{prefix}.cov_stat_summary.json"
-
-    logging.info("Writing summary statistics to file: %s", summary_file)
-    with open(summary_file, 'w') as f:
-        json.dump(statistics, f)
-        f.write('\n')
-
-
-if __name__ == "__main__":
-    main()

From ac916c0d4a0b65a55993e0aeb4b8ba1cb2b164ab Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 1 Aug 2024 11:29:55 -0400
Subject: [PATCH 25/62] "Addition: logic for bed_file with one inteval in
 MosdepthCoverageStats workflow"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 55 ++++++++++++++++++-
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index e0927845b..64df0bc26 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -18,17 +18,37 @@ workflow MosdepthCoverageStats {
         File aligned_bai
         File? bed_file
 
-        Int bin_length = 1000
+        Int? bin_length
 
         # Runtime parameters
         Int preemptible_tries = 3
     }
 
+    if (defined(bed_file) && defined(bin_length)) {
+        call BinBed  {
+            input:
+                bed_file = bed_file,
+                bin_length = bin_length
+        }
+    }
+
+    # if the line number in the bed file is less than 2 the run BinBed
+    if (defined(bed_file)){
+        Int file_lines = length(read_lines(select_first([bed_file])))
+        if (file_lines < 2) {
+            call BinBed as BinSingleBed {
+                input:
+                    bed_file = bed_file,
+                    bin_length = 1
+            }
+        }
+    }
+
     call MosDepthOverBed {
         input:
             bam = aligned_bam,
             bai = aligned_bai,
-            bed = bed_file,
+            bed = select_first([BinBed.binned_bed, BinSingleBed.binned_bed, bed_file]),
             bin_length = bin_length,
             preemptible_tries = preemptible_tries,
     }
@@ -74,7 +94,7 @@ task MosDepthOverBed {
         File? bed
         Int threads = 4
         String? chrom
-        Int? bin_length
+        Int bin_length = 1000
         String? thresholds
         Boolean no_per_base = true
         Boolean fast_mode = true
@@ -108,9 +128,12 @@ task MosDepthOverBed {
 
     output {
         File global_dist      = "~{prefix}.mosdepth.global.dist.txt"
+        File summary          = "~{prefix}.mosdepth.summary.txt"
+        File? per_base        = "~{prefix}.per-base.bed.gz"
         File region_dist      = "~{prefix}.mosdepth.region.dist.txt"
         File regions          = "~{prefix}.regions.bed.gz"
         File regions_csi      = "~{prefix}.regions.bed.gz.csi"
+
     }
 
     runtime {
@@ -174,3 +197,29 @@ task CoverageStats {
         docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest"
     }
 }
+
+
+task BinBed {
+    input {
+        File? bed_file
+        Int? bin_length
+    }
+
+    command {
+        set -euxo pipefail
+
+        bedtools makewindows -b ~{bed_file} -w ~{bin_length} > binned.bed
+
+    }
+
+    output {
+        File binned_bed = "binned.bed"
+    }
+
+    runtime {
+        cpu: 1
+        memory: "1 GiB"
+        disks: "local-disk 10 HDD"
+        docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest"
+    }
+}
\ No newline at end of file

From f46a66bbaf8fa6211178ff82c2e0d657193fafcc Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 5 Aug 2024 18:23:00 -0400
Subject: [PATCH 26/62] "Addition: reformated logic for bed_file with one
 inteval in MosdepthCoverageStats workflow, shifted to using feature branch
 for docker of mosdepth"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 64 ++++++++++++-------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 64df0bc26..933993500 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -25,39 +25,43 @@ workflow MosdepthCoverageStats {
     }
 
     if (defined(bed_file) && defined(bin_length)) {
-        call BinBed  {
+        call BinBed {
             input:
                 bed_file = bed_file,
                 bin_length = bin_length
         }
     }
 
-    # if the line number in the bed file is less than 2 the run BinBed
-    if (defined(bed_file)){
-        Int file_lines = length(read_lines(select_first([bed_file])))
-        if (file_lines < 2) {
-            call BinBed as BinSingleBed {
-                input:
-                    bed_file = bed_file,
-                    bin_length = 1
-            }
+    # Runs in cases where bed file is provided or bed and bin_length are provided
+    if (defined(bed_file) || (defined(bin_length) && defined(BinBed.binned_bed))) {
+        call MosDepthOverBed {
+            input:
+                bam = aligned_bam,
+                bai = aligned_bai,
+                bed = select_first([BinBed.binned_bed, bed_file]),
+                bin_length = bin_length,
+                preemptible_tries = preemptible_tries,
         }
+        File cov_stat_input_bed = select_first([MosDepthOverBed.per_base, MosDepthOverBed.regions])
     }
 
-    call MosDepthOverBed {
-        input:
-            bam = aligned_bam,
-            bai = aligned_bai,
-            bed = select_first([BinBed.binned_bed, BinSingleBed.binned_bed, bed_file]),
-            bin_length = bin_length,
-            preemptible_tries = preemptible_tries,
+    # Runs in cases where no bed file is provided, will run regardless of bin_length
+    if (!defined(bed_file) ) {
+        call MosDepthOverBed as MosDepthNoBed {
+            input:
+                bam = aligned_bam,
+                bai = aligned_bai,
+                bin_length = bin_length,
+                preemptible_tries = preemptible_tries,
+        }
+        File cov_stat_input_no_bed = select_first([MosDepthNoBed.per_base, MosDepthNoBed.regions])
     }
 
     String basename = basename(aligned_bam, ".bam")
 
     call CoverageStats {
         input:
-            mosdepth_regions = MosDepthOverBed.regions,
+            mosdepth_regions = select_first([cov_stat_input_bed, cov_stat_input_no_bed]),
             basename_input = basename,
             preemptible_tries = preemptible_tries,
     }
@@ -96,8 +100,8 @@ task MosDepthOverBed {
         String? chrom
         Int bin_length = 1000
         String? thresholds
-        Boolean no_per_base = true
-        Boolean fast_mode = true
+        Boolean no_per_base = false
+        Boolean fast_mode = false
         Int mapq = 1
 
         # Runtime parameters
@@ -142,7 +146,7 @@ task MosDepthOverBed {
         disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
         maxRetries:             1
-        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest"
+        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1"
     }
 }
 
@@ -194,7 +198,7 @@ task CoverageStats {
         disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
         maxRetries:             1
-        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:latest"
+        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1"
     }
 }
 
@@ -222,4 +226,18 @@ task BinBed {
         disks: "local-disk 10 HDD"
         docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest"
     }
-}
\ No newline at end of file
+}
+
+
+
+#        # if the line number in the bed file is less than 2 the run BinBed
+#    if (defined(bed_file)){
+#        Int file_lines = length(read_lines(select_first([bed_file])))
+#        if (file_lines < 2) {
+#            call BinBed as BinSingleBed {
+#                input:
+#                    bed_file = bed_file,
+#                    bin_length = 1
+#            }
+#        }
+#    }

From 2608f4835a84269b5dafd6a7b19f70d33c03ac50 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 5 Aug 2024 18:31:28 -0400
Subject: [PATCH 27/62] "Addition: set parameters to mosdepth, so readability
 is improved in Terra"

---
 .../TechAgnostic/Utility/CoverageStats.wdl       | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 933993500..24848267e 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -85,10 +85,6 @@ task MosDepthOverBed {
         bed: "BED file containing regions of interest."
         bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored."
         chrom: "Chromosome to calculate coverage for."
-        no_per_base: "Do not calculate per-base coverage."
-        fast_mode: "Use fast mode."
-        mapq: "Minimum mapping quality to consider."
-        thresholds: "Comma-separated list of thresholds to use for coverage calculation. (e.g. 1,10,20,30)."
         preemptible_tries: "Number of times to retry a preempted task."
     }
 
@@ -96,18 +92,19 @@ task MosDepthOverBed {
         File bam
         File bai
         File? bed
-        Int threads = 4
+
         String? chrom
         Int bin_length = 1000
-        String? thresholds
-        Boolean no_per_base = false
-        Boolean fast_mode = false
-        Int mapq = 1
 
         # Runtime parameters
         Int preemptible_tries = 3
     }
 
+    Int threads = 4
+    Boolean no_per_base = false
+    Boolean fast_mode = false
+    Int mapq = 1
+
     Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
@@ -126,7 +123,6 @@ task MosDepthOverBed {
         ~{"-c " + chrom} \
         ~{"-b " + select_first([bed, bin_length])} \
         ~{"-Q " + mapq} \
-        ~{"-T " + thresholds} \
         ~{prefix} ./~{basename}.bam
     }
 

From a641a060355401f89b7f58593aaa55b99c3e0937 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 08:24:09 -0400
Subject: [PATCH 28/62] "Addition: add absolute path to coverage_stats.py"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 24848267e..3ea845c70 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -176,7 +176,7 @@ task CoverageStats {
     command {
         set -euxo pipefail
 
-        python3 coverage_stats.py \
+        python3 /coverage_stats.py \
         --cov_col ~{cov_col} \
         --round ~{round} \
         --output_prefix ~{prefix} \

From b6eb2e7598efdeb29fce9dab02190866c7c0b7c7 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 09:22:12 -0400
Subject: [PATCH 29/62] "Addition: combined mosdepth task and coverage"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 121 +++++++-----------
 1 file changed, 45 insertions(+), 76 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 3ea845c70..0a5f4518d 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -11,6 +11,7 @@ workflow MosdepthCoverageStats {
         bed_file: "BED file containing regions of interest."
         bin_length: "Length of bins to use for coverage calculation."
         preemptible_tries: "Number of times to retry a preempted task."
+#        stats_per_interval: "If true, calculate coverage statistics per interval."
     }
 
     input {
@@ -19,11 +20,35 @@ workflow MosdepthCoverageStats {
         File? bed_file
 
         Int? bin_length
+#
+#        Boolean stats_per_interval = false
 
         # Runtime parameters
         Int preemptible_tries = 3
     }
 
+#    if (stats_per_interval) {
+#        scatter ( bed_line in read_lines(bed_file) ) {
+#            call MosDepthOverBed as MosDepthPerInterval {
+#                input:
+#                    bam = aligned_bam,
+#                    bai = aligned_bai,
+#                    bed = write_lines([bed_line]),
+#                    preemptible_tries = preemptible_tries,
+#            }
+#            File cov_stat_input_bed = select_first([MosDepthPerInterval.per_base, MosDepthPerInterval.regions])
+#            String basename = basename(aligned_bam, ".bam")
+#            call CoverageStats {
+#                input:
+#                    mosdepth_regions = cov_stat_input_bed,
+#                    basename_input = basename,
+#                    preemptible_tries = preemptible_tries,
+#            }
+#        }
+#    }
+
+
+
     if (defined(bed_file) && defined(bin_length)) {
         call BinBed {
             input:
@@ -42,7 +67,6 @@ workflow MosdepthCoverageStats {
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
         }
-        File cov_stat_input_bed = select_first([MosDepthOverBed.per_base, MosDepthOverBed.regions])
     }
 
     # Runs in cases where no bed file is provided, will run regardless of bin_length
@@ -54,21 +78,12 @@ workflow MosdepthCoverageStats {
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
         }
-        File cov_stat_input_no_bed = select_first([MosDepthNoBed.per_base, MosDepthNoBed.regions])
     }
 
-    String basename = basename(aligned_bam, ".bam")
-
-    call CoverageStats {
-        input:
-            mosdepth_regions = select_first([cov_stat_input_bed, cov_stat_input_no_bed]),
-            basename_input = basename,
-            preemptible_tries = preemptible_tries,
-    }
 
     output {
-        File cov_stat_summary_file      = CoverageStats.cov_stat_summary_file
-        Map[String, Float] cov_stat_summary = CoverageStats.cov_stat_summary
+        File cov_stat_summary_file      = MosDepthOverBed.cov_stat_summary_file
+        Map[String, Float] cov_stat_summary = MosDepthOverBed.cov_stat_summary
     }
 
 }
@@ -99,12 +114,18 @@ task MosDepthOverBed {
         # Runtime parameters
         Int preemptible_tries = 3
     }
-
+    # mosdepth parameters
     Int threads = 4
     Boolean no_per_base = false
     Boolean fast_mode = false
     Int mapq = 1
 
+    # coverage_stats.py parameters
+    Int cov_col = 4 # column holding the coverage values
+    Int round = 2
+    String header_suffix = "_coverage"
+
+
     Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
@@ -124,15 +145,26 @@ task MosDepthOverBed {
         ~{"-b " + select_first([bed, bin_length])} \
         ~{"-Q " + mapq} \
         ~{prefix} ./~{basename}.bam
+
+        # Run coverage_stats.py
+        python3 /coverage_stats.py \
+        --cov_col ~{cov_col} \
+        --round ~{round} \
+        --output_prefix ~{prefix} \
+        ~{prefix}.per-base.bed.gz
     }
 
     output {
+        # mosdepth output
         File global_dist      = "~{prefix}.mosdepth.global.dist.txt"
         File summary          = "~{prefix}.mosdepth.summary.txt"
         File? per_base        = "~{prefix}.per-base.bed.gz"
         File region_dist      = "~{prefix}.mosdepth.region.dist.txt"
         File regions          = "~{prefix}.regions.bed.gz"
         File regions_csi      = "~{prefix}.regions.bed.gz.csi"
+        # coverage_stats.py output
+        File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json"
+        Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
 
     }
 
@@ -146,57 +178,7 @@ task MosDepthOverBed {
     }
 }
 
-task CoverageStats {
-
-    meta {
-        description: "Calculate coverage statistics from mosdepth output."
-    }
-    parameter_meta {
-        mosdepth_regions: "Mosdepth output file containing coverage values."
-        cov_col: "Column holding the coverage values."
-        basename_input: "Basename to use for output files."
-        preemptible_tries: "Number of times to retry a preempted task."
-    }
-
-    input {
-        File mosdepth_regions
-        Int cov_col = 4 # column holding the coverage values
-        String? basename_input
-
-        # Runtime parameters
-        Int preemptible_tries = 3
-    }
-    Int round = 2
-    String header_suffix = "_coverage"
-
-    Int disk_size = 2*ceil(size(mosdepth_regions, "GB"))
-    String basename = select_first([basename_input, basename(mosdepth_regions)])
-    String prefix = "~{basename}.coverage_over_bed"
 
-    command {
-        set -euxo pipefail
-
-        python3 /coverage_stats.py \
-        --cov_col ~{cov_col} \
-        --round ~{round} \
-        --output_prefix ~{prefix} \
-        ~{mosdepth_regions}
-    }
-
-    output {
-        File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json"
-        Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
-    }
-
-    runtime {
-        cpu:                    2
-        memory:                 4 + " GiB"
-        disks: "local-disk " +  disk_size + " HDD"
-        preemptible:            preemptible_tries
-        maxRetries:             1
-        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1"
-    }
-}
 
 
 task BinBed {
@@ -224,16 +206,3 @@ task BinBed {
     }
 }
 
-
-
-#        # if the line number in the bed file is less than 2 the run BinBed
-#    if (defined(bed_file)){
-#        Int file_lines = length(read_lines(select_first([bed_file])))
-#        if (file_lines < 2) {
-#            call BinBed as BinSingleBed {
-#                input:
-#                    bed_file = bed_file,
-#                    bin_length = 1
-#            }
-#        }
-#    }

From 8e3e63028af836695c1d75f56f8edcff0a1d7e52 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 09:39:29 -0400
Subject: [PATCH 30/62] "Addition: fix optional output"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 0a5f4518d..d31429ad8 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -82,8 +82,8 @@ workflow MosdepthCoverageStats {
 
 
     output {
-        File cov_stat_summary_file      = MosDepthOverBed.cov_stat_summary_file
-        Map[String, Float] cov_stat_summary = MosDepthOverBed.cov_stat_summary
+        File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
+        Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
     }
 
 }

From 4a5b11f0b78295b3b72772af153fef69602c638f Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 09:42:02 -0400
Subject: [PATCH 31/62] "Addition: fix optional output"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index d31429ad8..655b70c17 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -82,8 +82,8 @@ workflow MosdepthCoverageStats {
 
 
     output {
-        File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
-        Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
+        File? cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
+        Map[String, Float]? cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
     }
 
 }

From 1131d859980a7a3ee3c2727410e1ce751ee5ceb3 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 10:45:00 -0400
Subject: [PATCH 32/62] "Addition: added a scatter option"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 67 ++++++++++++-------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 655b70c17..6623a39ad 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -11,7 +11,7 @@ workflow MosdepthCoverageStats {
         bed_file: "BED file containing regions of interest."
         bin_length: "Length of bins to use for coverage calculation."
         preemptible_tries: "Number of times to retry a preempted task."
-#        stats_per_interval: "If true, calculate coverage statistics per interval."
+        stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file."
     }
 
     input {
@@ -20,32 +20,32 @@ workflow MosdepthCoverageStats {
         File? bed_file
 
         Int? bin_length
-#
-#        Boolean stats_per_interval = false
+
+        Boolean stats_per_interval = false
 
         # Runtime parameters
         Int preemptible_tries = 3
     }
 
-#    if (stats_per_interval) {
-#        scatter ( bed_line in read_lines(bed_file) ) {
-#            call MosDepthOverBed as MosDepthPerInterval {
-#                input:
-#                    bam = aligned_bam,
-#                    bai = aligned_bai,
-#                    bed = write_lines([bed_line]),
-#                    preemptible_tries = preemptible_tries,
-#            }
-#            File cov_stat_input_bed = select_first([MosDepthPerInterval.per_base, MosDepthPerInterval.regions])
-#            String basename = basename(aligned_bam, ".bam")
-#            call CoverageStats {
-#                input:
-#                    mosdepth_regions = cov_stat_input_bed,
-#                    basename_input = basename,
-#                    preemptible_tries = preemptible_tries,
-#            }
-#        }
-#    }
+    if (stats_per_interval ) {
+        if (!defined(bed_file)) {
+            call FailWorkflow {
+                input:
+                    message = "stats_per_interval is set to true, but no bed file is provided."
+            }
+        }
+        scatter ( bed_line in read_lines(select_first([bed_file])) ) {
+            call MosDepthOverBed as MosDepthPerInterval {
+                input:
+                    bam = aligned_bam,
+                    bai = aligned_bai,
+                    bed = write_lines([bed_line]),
+                    preemptible_tries = preemptible_tries,
+            }
+        }
+        Array[File] cov_stat_summary_files  = MosDepthPerInterval.cov_stat_summary_file
+        Array[Map[String, Float]] cov_stat_summaries = MosDepthPerInterval.cov_stat_summary
+    }
 
 
 
@@ -82,8 +82,10 @@ workflow MosdepthCoverageStats {
 
 
     output {
-        File? cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
-        Map[String, Float]? cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
+        File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
+        Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
+        Array[File]? stats_per_interval_cov_stat_summary_files      = cov_stat_summary_files
+        Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = cov_stat_summaries
     }
 
 }
@@ -206,3 +208,20 @@ task BinBed {
     }
 }
 
+task FailWorkflow {
+    input{
+        String message
+    }
+    command {
+        echo "Failing workflow"
+        echo ~{message}
+        exit 1
+    }
+    output {
+        String out_message = message
+    }
+    runtime {
+        docker: "marketplace.gcr.io/google/ubuntu2004"
+    }
+}
+

From 08a2bb4dc644e708997098553193634e1d3d2e4b Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 13:35:11 -0400
Subject: [PATCH 33/62] "Addition: memory parameter added "

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 6623a39ad..f05f349fa 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -114,6 +114,7 @@ task MosDepthOverBed {
         Int bin_length = 1000
 
         # Runtime parameters
+        Int mem = 8
         Int preemptible_tries = 3
     }
     # mosdepth parameters
@@ -127,7 +128,7 @@ task MosDepthOverBed {
     Int round = 2
     String header_suffix = "_coverage"
 
-
+    # Calculate disk size
     Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
@@ -172,7 +173,7 @@ task MosDepthOverBed {
 
     runtime {
         cpu:                    4
-        memory:                 8 + " GiB"
+        memory:                 mem + " GiB"
         disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
         maxRetries:             1

From ec15f626c5b3bf659fe1fbbd901349ed7db71751 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 13:42:04 -0400
Subject: [PATCH 34/62] "Addition: memory parameter added to workflow
 parameter"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index f05f349fa..ad68b1f98 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -25,6 +25,7 @@ workflow MosdepthCoverageStats {
 
         # Runtime parameters
         Int preemptible_tries = 3
+        Int mosdepth_mem = 8
     }
 
     if (stats_per_interval ) {
@@ -41,6 +42,7 @@ workflow MosdepthCoverageStats {
                     bai = aligned_bai,
                     bed = write_lines([bed_line]),
                     preemptible_tries = preemptible_tries,
+                    mem = mosdepth_mem,
             }
         }
         Array[File] cov_stat_summary_files  = MosDepthPerInterval.cov_stat_summary_file
@@ -66,6 +68,7 @@ workflow MosdepthCoverageStats {
                 bed = select_first([BinBed.binned_bed, bed_file]),
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
+                mem = mosdepth_mem,
         }
     }
 
@@ -77,6 +80,7 @@ workflow MosdepthCoverageStats {
                 bai = aligned_bai,
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
+                mem = mosdepth_mem,
         }
     }
 

From 39b33314d4894801396ddfe2e68fd964a911e1a1 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 14:44:04 -0400
Subject: [PATCH 35/62] "Addition: fail if scatter interval is over 200"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index ad68b1f98..a8ceb848d 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -8,10 +8,10 @@ workflow MosdepthCoverageStats {
     parameter_meta {
         aligned_bam: "Aligned BAM file."
         aligned_bai: "Aligned BAM index file."
-        bed_file: "BED file containing regions of interest."
+        bed_file: "BED file containing regions of interest. Workflow will fail if bed file is empty"
         bin_length: "Length of bins to use for coverage calculation."
         preemptible_tries: "Number of times to retry a preempted task."
-        stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file."
+        stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file. Workflow will fail if bed file contains more than 200 intervals to avoid accidently over spending."
     }
 
     input {
@@ -28,13 +28,30 @@ workflow MosdepthCoverageStats {
         Int mosdepth_mem = 8
     }
 
+    if (defined(bed_file)) {
+        if (length(read_lines(select_first([bed_file]))) == 0) {
+            call FailWorkflow as EmptyBedFile {
+                input:
+                    message = "stats_per_interval is set to true, but the provided bed file is empty."
+            }
+        }
+    }
+
     if (stats_per_interval ) {
         if (!defined(bed_file)) {
-            call FailWorkflow {
+            call FailWorkflow as NoBedFile {
                 input:
                     message = "stats_per_interval is set to true, but no bed file is provided."
             }
         }
+        if (defined(bed_file)) {
+            if (length(read_lines(select_first([bed_file]))) > 200) {
+                call FailWorkflow as TooManyIntervals {
+                    input:
+                        message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals."
+                }
+            }
+        }
         scatter ( bed_line in read_lines(select_first([bed_file])) ) {
             call MosDepthOverBed as MosDepthPerInterval {
                 input:

From 087788d4233477803a890f0dced74384e513348f Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 14:56:27 -0400
Subject: [PATCH 36/62] "Addition: check bed length in scatter regardless"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index a8ceb848d..475a2d3b8 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -44,12 +44,10 @@ workflow MosdepthCoverageStats {
                     message = "stats_per_interval is set to true, but no bed file is provided."
             }
         }
-        if (defined(bed_file)) {
-            if (length(read_lines(select_first([bed_file]))) > 200) {
-                call FailWorkflow as TooManyIntervals {
-                    input:
-                        message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals."
-                }
+        if (length(read_lines(select_first([bed_file]))) > 200) {
+            call FailWorkflow as TooManyIntervals {
+                input:
+                    message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals."
             }
         }
         scatter ( bed_line in read_lines(select_first([bed_file])) ) {

From 5e49318667fa5873bf59fc8616d7f399207ac91b Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 16:31:21 -0400
Subject: [PATCH 37/62] "Addition: mosdepth summarizes regions"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 475a2d3b8..56660bf80 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -58,6 +58,7 @@ workflow MosdepthCoverageStats {
                     bed = write_lines([bed_line]),
                     preemptible_tries = preemptible_tries,
                     mem = mosdepth_mem,
+                    summarize_regions = true
             }
         }
         Array[File] cov_stat_summary_files  = MosDepthPerInterval.cov_stat_summary_file
@@ -84,6 +85,7 @@ workflow MosdepthCoverageStats {
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
                 mem = mosdepth_mem,
+                summarize_regions = true,
         }
     }
 
@@ -131,6 +133,7 @@ task MosDepthOverBed {
 
         String? chrom
         Int bin_length = 1000
+        boolean summarize_regions = false
 
         # Runtime parameters
         Int mem = 8
@@ -152,6 +155,8 @@ task MosDepthOverBed {
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
 
+    String cov_file_to_summarize = if summarize_regions then "~{prefix}.regions.bed.gz" else "~{prefix}.per-base.bed.gz"
+
     command {
         set -euxo pipefail
 
@@ -173,7 +178,7 @@ task MosDepthOverBed {
         --cov_col ~{cov_col} \
         --round ~{round} \
         --output_prefix ~{prefix} \
-        ~{prefix}.per-base.bed.gz
+        ~{cov_file_to_summarize}
     }
 
     output {

From f798f33a96d1620381801157f03d1f704444e0c2 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 16:47:43 -0400
Subject: [PATCH 38/62] "Addition: mini fix"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 56660bf80..21be19591 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -133,7 +133,7 @@ task MosDepthOverBed {
 
         String? chrom
         Int bin_length = 1000
-        boolean summarize_regions = false
+        Boolean summarize_regions = false
 
         # Runtime parameters
         Int mem = 8

From 8bfcfa76593a4a4b1f943d64f823ddf5c39f987b Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 20:40:15 -0400
Subject: [PATCH 39/62] "Addition: created separate task for cov per interval"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 115 ++++++++++++++++--
 1 file changed, 104 insertions(+), 11 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 21be19591..98772becc 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -37,7 +37,7 @@ workflow MosdepthCoverageStats {
         }
     }
 
-    if (stats_per_interval ) {
+    if (stats_per_interval) {
         if (!defined(bed_file)) {
             call FailWorkflow as NoBedFile {
                 input:
@@ -50,19 +50,14 @@ workflow MosdepthCoverageStats {
                     message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals."
             }
         }
-        scatter ( bed_line in read_lines(select_first([bed_file])) ) {
-            call MosDepthOverBed as MosDepthPerInterval {
+            call MosDepthPerInterval {
                 input:
                     bam = aligned_bam,
                     bai = aligned_bai,
-                    bed = write_lines([bed_line]),
+                    bed = bed_file,
                     preemptible_tries = preemptible_tries,
                     mem = mosdepth_mem,
-                    summarize_regions = true
             }
-        }
-        Array[File] cov_stat_summary_files  = MosDepthPerInterval.cov_stat_summary_file
-        Array[Map[String, Float]] cov_stat_summaries = MosDepthPerInterval.cov_stat_summary
     }
 
 
@@ -105,8 +100,8 @@ workflow MosdepthCoverageStats {
     output {
         File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
         Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
-        Array[File]? stats_per_interval_cov_stat_summary_files      = cov_stat_summary_files
-        Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = cov_stat_summaries
+        File? stats_per_interval_cov_stat_summary_files      = MosDepthPerInterval.cov_stat_summary_all_file
+        Map[String, Float]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
     }
 
 }
@@ -148,7 +143,6 @@ task MosDepthOverBed {
     # coverage_stats.py parameters
     Int cov_col = 4 # column holding the coverage values
     Int round = 2
-    String header_suffix = "_coverage"
 
     # Calculate disk size
     Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
@@ -205,8 +199,105 @@ task MosDepthOverBed {
     }
 }
 
+task MosDepthPerInterval {
+
+    meta {
+        description: "Calculate coverage using mosdepth for each interval in bed."
+    }
+    parameter_meta {
+        bam: "Aligned BAM file."
+        bai: "Aligned BAM index file."
+        bed: "BED file containing regions of interest."
+        bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored."
+        preemptible_tries: "Number of times to retry a preempted task."
+    }
+
+    input {
+        File bam
+        File bai
+        File? bed
+
+        Int bin_length = 1000
+
+        # Runtime parameters
+        Int mem = 8
+        Int preemptible_tries = 3
+    }
+    # mosdepth parameters
+    Int threads = 4
+    Boolean no_per_base = false
+    Boolean fast_mode = false
+    Int mapq = 1
+
+    # coverage_stats.py parameters
+    Int cov_col = 4 # column holding the coverage values
+    Int round = 2
+
+    # Calculate disk size
+    Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
+    String basename = basename(bam, ".bam")
+    String prefix = "~{basename}.coverage_over_bed"
+
+    String cov_file_to_summarize = "~{prefix}.regions.bed.gz"
+
+    command <<<
+        set -euxo pipefail
+
+        # Create symbolic links for bam and bai in the current working directory
+        ln -s ~{bam} ./~{basename}.bam
+        ln -s ~{bai} ./~{basename}.bai
+
+        # create file for coverage stats summary of all intervals
+        touch ~{prefix}.cov_stat_summary_all.json
+
+        # Create a temporary directory for intermediate files
+        tmp_dir=$(mktemp -d)
+        trap "rm -rf $tmp_dir" EXIT
+
+        for bed_line in $(cat ~{bed}); do
+            bed_file="$tmp_dir/bed_line.bed"
+            echo $bed_line > $bed_file
+
+            mosdepth \
+            ~{true="-n" false="" no_per_base} \
+            ~{true="-x" false="" fast_mode} \
+            -t ~{threads} \
+            -b $bed_file \
+            ~{"-Q " + mapq} \
+            ~{prefix} ./~{basename}.bam
+
+            # Run coverage_stats.py
+            python3 /coverage_stats.py \
+            --cov_col ~{cov_col} \
+            --round ~{round} \
+            --output_prefix ~{prefix} \
+            ~{cov_file_to_summarize}
+
+            # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
+            cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
+
+        done
 
 
+    >>>
+
+    output {
+        # coverage_stats.py output
+        File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json"
+        Map[String, Float] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
+
+    }
+
+    runtime {
+        cpu:                    4
+        memory:                 mem + " GiB"
+        disks: "local-disk " +  disk_size + " HDD"
+        preemptible:            preemptible_tries
+        maxRetries:             1
+        docker:                 "us.gcr.io/broad-dsp-lrma/lr-mosdepth:bs-cov-sum-0.3.1"
+    }
+}
+
 
 task BinBed {
     input {
@@ -238,6 +329,8 @@ task FailWorkflow {
         String message
     }
     command {
+        set -e
+
         echo "Failing workflow"
         echo ~{message}
         exit 1

From d6a5ea13c5a9b79cb5134c9c85ecd1847ec465e5 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 21:07:47 -0400
Subject: [PATCH 40/62] "Addition: changed for loop for per interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 98772becc..872ce1e23 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -254,7 +254,9 @@ task MosDepthPerInterval {
         tmp_dir=$(mktemp -d)
         trap "rm -rf $tmp_dir" EXIT
 
-        for bed_line in $(cat ~{bed}); do
+        mapfile -t lines < "~{bed}"
+
+        for line in "${lines[@]}"; do
             bed_file="$tmp_dir/bed_line.bed"
             echo $bed_line > $bed_file
 

From c988ddfae02445002ad2bf8ac179084dde3373d6 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 21:14:51 -0400
Subject: [PATCH 41/62] "Addition: fix bed_line to line in for loop for per
 interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 872ce1e23..9328e2d7a 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -247,7 +247,7 @@ task MosDepthPerInterval {
         ln -s ~{bam} ./~{basename}.bam
         ln -s ~{bai} ./~{basename}.bai
 
-        # create file for coverage stats summary of all intervals
+        # Create file for coverage stats summary of all intervals
         touch ~{prefix}.cov_stat_summary_all.json
 
         # Create a temporary directory for intermediate files
@@ -258,7 +258,7 @@ task MosDepthPerInterval {
 
         for line in "${lines[@]}"; do
             bed_file="$tmp_dir/bed_line.bed"
-            echo $bed_line > $bed_file
+            echo $line > $bed_file
 
             mosdepth \
             ~{true="-n" false="" no_per_base} \

From 9f7fe0ec0d2b89351b15ce8339c641f3fc2b83f4 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Tue, 6 Aug 2024 21:36:52 -0400
Subject: [PATCH 42/62] "Addition: fix white space to be tab in for loop for
 per interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 9328e2d7a..1a089a8c7 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -258,7 +258,7 @@ task MosDepthPerInterval {
 
         for line in "${lines[@]}"; do
             bed_file="$tmp_dir/bed_line.bed"
-            echo $line > $bed_file
+            echo $line | tr ' ' '\t' > $bed_file
 
             mosdepth \
             ~{true="-n" false="" no_per_base} \

From 23f29c820853c5d3abf902384352dbdecb13ba25 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 10:29:34 -0400
Subject: [PATCH 43/62] "Addition: creating json list for cover per interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 1a089a8c7..780e4a2a5 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -249,6 +249,7 @@ task MosDepthPerInterval {
 
         # Create file for coverage stats summary of all intervals
         touch ~{prefix}.cov_stat_summary_all.json
+        cat "[\n" > ~{prefix}.cov_stat_summary_all.json
 
         # Create a temporary directory for intermediate files
         tmp_dir=$(mktemp -d)
@@ -276,9 +277,15 @@ task MosDepthPerInterval {
             ~{cov_file_to_summarize}
 
             # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
-            cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
+            if [[ -s ~{prefix}.cov_stat_summary.json ]]; then
+                cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
+                echo "," >> ~{prefix}.cov_stat_summary_all.json
+            fi
 
         done
+        # remove the last comma and close the json array
+        sed -i '$ s/,$//' ~{prefix}.cov_stat_summary_all.json
+        echo "]" >> ~{prefix}.cov_stat_summary_all.json
 
 
     >>>

From 39e4c80499e147a0c9508938a06b14faacb13ccf Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 10:50:48 -0400
Subject: [PATCH 44/62] "Addition: fix for first line json list for cover per
 interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 780e4a2a5..3bd5e32b9 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -249,7 +249,7 @@ task MosDepthPerInterval {
 
         # Create file for coverage stats summary of all intervals
         touch ~{prefix}.cov_stat_summary_all.json
-        cat "[\n" > ~{prefix}.cov_stat_summary_all.json
+        echo "[" > ~{prefix}.cov_stat_summary_all.json
 
         # Create a temporary directory for intermediate files
         tmp_dir=$(mktemp -d)

From a763344f9f2a854283e3c87dda6d036d72d74548 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 11:05:38 -0400
Subject: [PATCH 45/62] "Addition: fix output to be list of dict for cover per
 interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 3bd5e32b9..29dd68392 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -101,7 +101,7 @@ workflow MosdepthCoverageStats {
         File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
         Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
         File? stats_per_interval_cov_stat_summary_files      = MosDepthPerInterval.cov_stat_summary_all_file
-        Map[String, Float]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
+        Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
     }
 
 }
@@ -293,7 +293,7 @@ task MosDepthPerInterval {
     output {
         # coverage_stats.py output
         File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json"
-        Map[String, Float] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
+        Array[Map[String, Float]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
 
     }
 

From a3d06e2bf63f39168e90751375dbda25895f6f05 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 11:29:58 -0400
Subject: [PATCH 46/62] "Addition: fix output to be object value for cover per
 interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 29dd68392..29c7303ae 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -185,7 +185,7 @@ task MosDepthOverBed {
         File regions_csi      = "~{prefix}.regions.bed.gz.csi"
         # coverage_stats.py output
         File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json"
-        Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
+        Map[String, Object] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
 
     }
 
@@ -293,7 +293,7 @@ task MosDepthPerInterval {
     output {
         # coverage_stats.py output
         File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json"
-        Array[Map[String, Float]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
+        Array[Map[String, Object]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
 
     }
 

From e5daa8af91b708c40bab4df79ec1d53b1e60942d Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 11:35:22 -0400
Subject: [PATCH 47/62] "Addition: fix output to be object value for cover per
 interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 29c7303ae..4c27f8d72 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -101,7 +101,7 @@ workflow MosdepthCoverageStats {
         File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
         Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
         File? stats_per_interval_cov_stat_summary_files      = MosDepthPerInterval.cov_stat_summary_all_file
-        Array[Map[String, Float]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
+        Array[Map[String, Object]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
     }
 
 }
@@ -185,7 +185,7 @@ task MosDepthOverBed {
         File regions_csi      = "~{prefix}.regions.bed.gz.csi"
         # coverage_stats.py output
         File cov_stat_summary_file = "~{prefix}.cov_stat_summary.json"
-        Map[String, Object] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
+        Map[String, Float] cov_stat_summary = read_json("~{prefix}.cov_stat_summary.json")
 
     }
 

From 13b035033c2bd0ee972d08c23de69ffad5674b28 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 12:32:03 -0400
Subject: [PATCH 48/62] "Addition: fix output to be object value for cover per
 interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 4c27f8d72..d75a45892 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -101,7 +101,7 @@ workflow MosdepthCoverageStats {
         File cov_stat_summary_file      = select_first([MosDepthOverBed.cov_stat_summary_file, MosDepthNoBed.cov_stat_summary_file])
         Map[String, Float] cov_stat_summary = select_first([MosDepthOverBed.cov_stat_summary, MosDepthNoBed.cov_stat_summary])
         File? stats_per_interval_cov_stat_summary_files      = MosDepthPerInterval.cov_stat_summary_all_file
-        Array[Map[String, Object]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
+#        Array[Map[String, Object]]? stats_per_interval_cov_stat_summaries = MosDepthPerInterval.cov_stat_summary_all
     }
 
 }
@@ -293,7 +293,7 @@ task MosDepthPerInterval {
     output {
         # coverage_stats.py output
         File cov_stat_summary_all_file = "~{prefix}.cov_stat_summary_all.json"
-        Array[Map[String, Object]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
+#        Array[Map[String, Object]] cov_stat_summary_all = read_json("~{prefix}.cov_stat_summary_all.json")
 
     }
 

From 1e8a13f005e9cb283e5bf33fb01429d169165826 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 13:23:23 -0400
Subject: [PATCH 49/62] "Addition: made max bed interval value a parameter for
 cover per interval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index d75a45892..33b794d81 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -12,6 +12,7 @@ workflow MosdepthCoverageStats {
         bin_length: "Length of bins to use for coverage calculation."
         preemptible_tries: "Number of times to retry a preempted task."
         stats_per_interval: "If true, calculate coverage statistics per interval. Must provide a bed file. Workflow will fail if bed file contains more than 200 intervals to avoid accidently over spending."
+        max_bed_intervals: "Maximum number of intervals allowed in the bed file for MosdpethPerInterval. Default is 200."
     }
 
     input {
@@ -22,6 +23,7 @@ workflow MosdepthCoverageStats {
         Int? bin_length
 
         Boolean stats_per_interval = false
+        Int max_bed_intervals = 200
 
         # Runtime parameters
         Int preemptible_tries = 3
@@ -44,7 +46,7 @@ workflow MosdepthCoverageStats {
                     message = "stats_per_interval is set to true, but no bed file is provided."
             }
         }
-        if (length(read_lines(select_first([bed_file]))) > 200) {
+        if (length(read_lines(select_first([bed_file]))) > max_bed_intervals) {
             call FailWorkflow as TooManyIntervals {
                 input:
                     message = "stats_per_interval is set to true, but the provided bed file contains more than 200 intervals."

From b58a0ed28c26fcf2374bbe06600d83c0b3517809 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 14:31:20 -0400
Subject: [PATCH 50/62] "Addition: added extra disk parameter"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 33b794d81..c2fa9442d 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -28,6 +28,7 @@ workflow MosdepthCoverageStats {
         # Runtime parameters
         Int preemptible_tries = 3
         Int mosdepth_mem = 8
+        Int mosdepth_extra_disk = 0
     }
 
     if (defined(bed_file)) {
@@ -59,6 +60,7 @@ workflow MosdepthCoverageStats {
                     bed = bed_file,
                     preemptible_tries = preemptible_tries,
                     mem = mosdepth_mem,
+                    extra_disk = mosdepth_extra_disk
             }
     }
 
@@ -82,6 +84,7 @@ workflow MosdepthCoverageStats {
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
                 mem = mosdepth_mem,
+                extra_disk = mosdepth_extra_disk,
                 summarize_regions = true,
         }
     }
@@ -95,6 +98,7 @@ workflow MosdepthCoverageStats {
                 bin_length = bin_length,
                 preemptible_tries = preemptible_tries,
                 mem = mosdepth_mem,
+                extra_disk = mosdepth_extra_disk,
         }
     }
 
@@ -121,6 +125,7 @@ task MosDepthOverBed {
         bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored."
         chrom: "Chromosome to calculate coverage for."
         preemptible_tries: "Number of times to retry a preempted task."
+        extra_disk: "Extra disk space to allocate for intermediate files."
     }
 
     input {
@@ -135,6 +140,7 @@ task MosDepthOverBed {
         # Runtime parameters
         Int mem = 8
         Int preemptible_tries = 3
+        Int extra_disk = 0
     }
     # mosdepth parameters
     Int threads = 4
@@ -147,7 +153,7 @@ task MosDepthOverBed {
     Int round = 2
 
     # Calculate disk size
-    Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
+    Int disk_size = (2 * ceil(size(bam, "GB") + size(bai, "GB"))) + 100 + extra_disk
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
 
@@ -212,6 +218,7 @@ task MosDepthPerInterval {
         bed: "BED file containing regions of interest."
         bin_length: "Length of bins to use for coverage calculation. default is 1000. If bed file is provided, this parameter is ignored."
         preemptible_tries: "Number of times to retry a preempted task."
+        extra_disk: "Extra disk space to allocate for intermediate files."
     }
 
     input {
@@ -224,6 +231,7 @@ task MosDepthPerInterval {
         # Runtime parameters
         Int mem = 8
         Int preemptible_tries = 3
+        Int extra_disk = 0
     }
     # mosdepth parameters
     Int threads = 4
@@ -236,7 +244,7 @@ task MosDepthPerInterval {
     Int round = 2
 
     # Calculate disk size
-    Int disk_size = 2 * ceil(size(bam, "GB") + size(bai, "GB"))
+    Int disk_size = (2 * ceil(size(bam, "GB") + size(bai, "GB"))) + 100 + extra_disk
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
 

From 3ba18d485ad47e8fd61fdd07e9f8d4ddaa15e68b Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Wed, 7 Aug 2024 14:59:45 -0400
Subject: [PATCH 51/62] "Addition: removed cpu details from runtime to avoid
 expensive custom vms, this should automatically be scaled by mem"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index c2fa9442d..17d6e3968 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -198,7 +198,6 @@ task MosDepthOverBed {
     }
 
     runtime {
-        cpu:                    4
         memory:                 mem + " GiB"
         disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries
@@ -308,7 +307,6 @@ task MosDepthPerInterval {
     }
 
     runtime {
-        cpu:                    4
         memory:                 mem + " GiB"
         disks: "local-disk " +  disk_size + " HDD"
         preemptible:            preemptible_tries

From 8a8db154d27e96295f6e819e59ea7133c3c67149 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 15 Aug 2024 13:38:11 -0400
Subject: [PATCH 52/62] "Addition: added interval to per interval json

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 17d6e3968..7e3da8c8d 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -285,6 +285,10 @@ task MosDepthPerInterval {
             --output_prefix ~{prefix} \
             ~{cov_file_to_summarize}
 
+            # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
+            sed -i '' 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json
+
+
             # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
             if [[ -s ~{prefix}.cov_stat_summary.json ]]; then
                 cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json

From 2cb6d449ccf80575b3d5122d8209415a89f5c2cd Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 15 Aug 2024 14:37:29 -0400
Subject: [PATCH 53/62] "Addition: fix sed command

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 7e3da8c8d..117793977 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -286,7 +286,7 @@ task MosDepthPerInterval {
             ~{cov_file_to_summarize}
 
             # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
-            sed -i '' 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json
+            sed -i 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json
 
 
             # Append the coverage stats summary of the current interval to the file containing the summary of all intervals

From a2622476c63e740aeacb711221be448f8fa6811a Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 15 Aug 2024 14:50:56 -0400
Subject: [PATCH 54/62] "Addition: fix sed command with backslash for double
 quotes

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 117793977..19fe08f42 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -286,7 +286,7 @@ task MosDepthPerInterval {
             ~{cov_file_to_summarize}
 
             # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
-            sed -i 's/{/\{"interval": "$line", /' ~{prefix}.cov_stat_summary.json
+            sed -i 's/{/\{"interval": \"$line\", /' ~{prefix}.cov_stat_summary.json
 
 
             # Append the coverage stats summary of the current interval to the file containing the summary of all intervals

From 2cf2d12a485ccaea76e1d09aaf31acae3108c689 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 19 Aug 2024 11:06:19 -0400
Subject: [PATCH 55/62] "Fix: updated quotation for sed in mosdepthperinterval"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 19fe08f42..92567553e 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -286,7 +286,7 @@ task MosDepthPerInterval {
             ~{cov_file_to_summarize}
 
             # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
-            sed -i 's/{/\{"interval": \"$line\", /' ~{prefix}.cov_stat_summary.json
+            sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json
 
 
             # Append the coverage stats summary of the current interval to the file containing the summary of all intervals

From 8d7f19713d2dc6e14546ac76bb3cdc4fd4c51849 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 19 Aug 2024 14:18:55 -0400
Subject: [PATCH 56/62] "Fix: updated mosdepthperinterval to skip interval if
 mosdepth failed"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 39 +++++++++++--------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 92567553e..92cc24b39 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -266,7 +266,7 @@ task MosDepthPerInterval {
 
         mapfile -t lines < "~{bed}"
 
-        for line in "${lines[@]}"; do
+        for line in "${lines[@]}"; then
             bed_file="$tmp_dir/bed_line.bed"
             echo $line | tr ' ' '\t' > $bed_file
 
@@ -278,24 +278,29 @@ task MosDepthPerInterval {
             ~{"-Q " + mapq} \
             ~{prefix} ./~{basename}.bam
 
-            # Run coverage_stats.py
-            python3 /coverage_stats.py \
-            --cov_col ~{cov_col} \
-            --round ~{round} \
-            --output_prefix ~{prefix} \
-            ~{cov_file_to_summarize}
-
-            # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
-            sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json
-
-
-            # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
-            if [[ -s ~{prefix}.cov_stat_summary.json ]]; then
-                cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
-                echo "," >> ~{prefix}.cov_stat_summary_all.json
+            # if mosdepth fails, skip the current interval
+            if [[ $? -ne 0 ]]; then
+                # Run coverage_stats.py
+                python3 /coverage_stats.py \
+                --cov_col ~{cov_col} \
+                --round ~{round} \
+                --output_prefix ~{prefix} \
+                ~{cov_file_to_summarize}
+
+                # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
+                sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json
+
+
+                # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
+                if [[ -s ~{prefix}.cov_stat_summary.json ]]; then
+                    cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
+                    echo "," >> ~{prefix}.cov_stat_summary_all.json
+                fi
+            else
+                echo "Mosdepth failed for interval: $line. Coverage stats script was not run."
             fi
-
         done
+
         # remove the last comma and close the json array
         sed -i '$ s/,$//' ~{prefix}.cov_stat_summary_all.json
         echo "]" >> ~{prefix}.cov_stat_summary_all.json

From 8a13b1098bccef05b5100ec6eae5bea5e0d6fcca Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 19 Aug 2024 16:02:05 -0400
Subject: [PATCH 57/62] "Fix: updated mosdepthperinterval fail loud if mosdepth
 fails"

---
 .../TechAgnostic/Utility/CoverageStats.wdl    | 43 ++++++++++---------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 92cc24b39..7212a47cf 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -266,7 +266,7 @@ task MosDepthPerInterval {
 
         mapfile -t lines < "~{bed}"
 
-        for line in "${lines[@]}"; then
+        for line in "${lines[@]}"; do
             bed_file="$tmp_dir/bed_line.bed"
             echo $line | tr ' ' '\t' > $bed_file
 
@@ -278,29 +278,30 @@ task MosDepthPerInterval {
             ~{"-Q " + mapq} \
             ~{prefix} ./~{basename}.bam
 
-            # if mosdepth fails, skip the current interval
+            # if mosdepth fails, exit with failure
             if [[ $? -ne 0 ]]; then
-                # Run coverage_stats.py
-                python3 /coverage_stats.py \
-                --cov_col ~{cov_col} \
-                --round ~{round} \
-                --output_prefix ~{prefix} \
-                ~{cov_file_to_summarize}
-
-                # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
-                sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json
-
-
-                # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
-                if [[ -s ~{prefix}.cov_stat_summary.json ]]; then
-                    cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
-                    echo "," >> ~{prefix}.cov_stat_summary_all.json
-                fi
-            else
-                echo "Mosdepth failed for interval: $line. Coverage stats script was not run."
+                echo "Mosdepth failed for interval $line"
+                exit 1
             fi
-        done
 
+            # Run coverage_stats.py
+            python3 /coverage_stats.py \
+            --cov_col ~{cov_col} \
+            --round ~{round} \
+            --output_prefix ~{prefix} \
+            ~{cov_file_to_summarize}
+
+            # In the coverage stats summary replace the open bracket with 'interval' name and the line as the value
+            sed -i "s/{/\{\"interval\": \"$line\", /" ~{prefix}.cov_stat_summary.json
+
+
+            # Append the coverage stats summary of the current interval to the file containing the summary of all intervals
+            if [[ -s ~{prefix}.cov_stat_summary.json ]]; then
+                cat ~{prefix}.cov_stat_summary.json >> ~{prefix}.cov_stat_summary_all.json
+                echo "," >> ~{prefix}.cov_stat_summary_all.json
+            fi
+
+        done
         # remove the last comma and close the json array
         sed -i '$ s/,$//' ~{prefix}.cov_stat_summary_all.json
         echo "]" >> ~{prefix}.cov_stat_summary_all.json

From e0cf7d3a45dd7b47a4ed7c54e2feb2077fef828b Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Thu, 22 Aug 2024 11:44:01 -0400
Subject: [PATCH 58/62] "Fix: updated mosdepthperinterval to disable per base
 reads "

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 7212a47cf..4e8d3de64 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -234,7 +234,7 @@ task MosDepthPerInterval {
     }
     # mosdepth parameters
     Int threads = 4
-    Boolean no_per_base = false
+    Boolean no_per_base = true
     Boolean fast_mode = false
     Int mapq = 1
 

From 35ea1d393e23105e62d35191e2b2f469b27a5fca Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 26 Aug 2024 10:45:32 -0400
Subject: [PATCH 59/62] "Fix: updated mosdepthperinterval to use samtools to
 create new bam interval to be processed my mosdepth per-base"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 4e8d3de64..4153e4166 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -234,7 +234,7 @@ task MosDepthPerInterval {
     }
     # mosdepth parameters
     Int threads = 4
-    Boolean no_per_base = true
+    Boolean no_per_base = false
     Boolean fast_mode = false
     Int mapq = 1
 
@@ -247,7 +247,7 @@ task MosDepthPerInterval {
     String basename = basename(bam, ".bam")
     String prefix = "~{basename}.coverage_over_bed"
 
-    String cov_file_to_summarize = "~{prefix}.regions.bed.gz"
+    String cov_file_to_summarize = "~{prefix}.per-base.bed.gz"
 
     command <<<
         set -euxo pipefail
@@ -270,13 +270,15 @@ task MosDepthPerInterval {
             bed_file="$tmp_dir/bed_line.bed"
             echo $line | tr ' ' '\t' > $bed_file
 
+            # Use samtools to extract reads overlapping the interval
+            samtools view -b -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam
+
             mosdepth \
             ~{true="-n" false="" no_per_base} \
             ~{true="-x" false="" fast_mode} \
             -t ~{threads} \
-            -b $bed_file \
             ~{"-Q " + mapq} \
-            ~{prefix} ./~{basename}.bam
+            ~{prefix} $tmp_dir/interval.bam
 
             # if mosdepth fails, exit with failure
             if [[ $? -ne 0 ]]; then
@@ -286,6 +288,7 @@ task MosDepthPerInterval {
 
             # Run coverage_stats.py
             python3 /coverage_stats.py \
+            --debug \
             --cov_col ~{cov_col} \
             --round ~{round} \
             --output_prefix ~{prefix} \

From e6c2f8be2914dc23f75b13908a9fd025372e8a40 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 26 Aug 2024 14:34:51 -0400
Subject: [PATCH 60/62] "Fix: updated mosdepthperinterval to use samtools to
 create new bam index"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 4153e4166..046653ab5 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -272,6 +272,7 @@ task MosDepthPerInterval {
 
             # Use samtools to extract reads overlapping the interval
             samtools view -b -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam
+            samtools index $tmp_dir/interval.bam
 
             mosdepth \
             ~{true="-n" false="" no_per_base} \

From 9cff0552cf95abcbf34decbefdf90fb301c42a95 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 26 Aug 2024 18:39:05 -0400
Subject: [PATCH 61/62] "Fix: updated mosdepthperinterval to use samtools
 --region-file"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 046653ab5..1c6c167c0 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -271,7 +271,7 @@ task MosDepthPerInterval {
             echo $line | tr ' ' '\t' > $bed_file
 
             # Use samtools to extract reads overlapping the interval
-            samtools view -b -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam
+            samtools view -b --region-file $bed_file ~{basename}.bam > $tmp_dir/interval.bam
             samtools index $tmp_dir/interval.bam
 
             mosdepth \

From f177effddf297bf6780ba67437ed62bac6905124 Mon Sep 17 00:00:00 2001
From: bshifaw <bshifaw@broadinstitute.com>
Date: Mon, 26 Aug 2024 19:56:11 -0400
Subject: [PATCH 62/62] "Fix: updated mosdepthperinterval to use samtools -M
 and -L"

---
 wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
index 1c6c167c0..cd496f532 100644
--- a/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
+++ b/wdl/pipelines/TechAgnostic/Utility/CoverageStats.wdl
@@ -271,7 +271,7 @@ task MosDepthPerInterval {
             echo $line | tr ' ' '\t' > $bed_file
 
             # Use samtools to extract reads overlapping the interval
-            samtools view -b --region-file $bed_file ~{basename}.bam > $tmp_dir/interval.bam
+            samtools view -bM -L $bed_file ~{basename}.bam > $tmp_dir/interval.bam
             samtools index $tmp_dir/interval.bam
 
             mosdepth \