@@ -716,53 +716,76 @@ task GetDuplicateReadnamesInQnameSortedBam {
716
716
}
717
717
parameter_meta {
718
718
qns_bam : {
719
+ description : "Query name sorted BAM to be de-duplicated" ,
719
720
localization_optional : true
720
721
}
722
+ trial_idx : "the n-th time this is being tried (counting from 1); if this value is >= trial_max, the BAM will be localized and the task will use a persistent SSD instead of a persistent HDD."
723
+ trial_max : "the maximum number of attempts to perform the task by streaming in the BAM; this design, together with trial_idx, keeps call-caching from preventing retries."
721
724
}
722
725
input {
723
726
File qns_bam
727
+ Int trial_idx = 1
728
+ Int trial_max = 3
724
729
}
725
730
726
731
output {
727
732
File dup_names_txt = "dup_read_names.txt"
728
733
Boolean result_may_be_corrupted = read_boolean ("samtools.failed.txt" )
729
734
}
730
735
736
+ Boolean localize_bam = trial_idx >= trial_max
737
+
731
738
command <<<
732
- # the way this works is the following:
733
- # 0) relying on the re-auth.sh script to export the credentials
734
- # 1) perform the remote sam-view subsetting in the background
735
- # 2) listen to the PID of the background process, while re-auth every 1200 seconds
739
+
736
740
source /opt/re-auth.sh
737
741
set -euxo pipefail
738
742
739
743
# assumption
740
744
sort_order = $(samtools view -H ~{qns_bam } | grep "^@HD" | tr '\t' '\n' | grep "^SO:" | awk -F ':' '{print $2}' )
741
745
# abort unless the BAM header declares queryname sort order; the message reports the sort order actually found
if [[ "queryname" != "${sort_order} " ]]; then echo -e "Sort order ${sort_order} isn't the expected 'queryname'." && exit 1 ; fi
742
746
743
- # remote grab read names
744
- echo "false" > samtools.failed.txt
745
- samtools view ~{qns_bam } \
746
- | awk -F '\t' '{print $1}' \
747
- | uniq -d \
748
- > "dup_read_names.txt" \
749
- || { echo "true" > samtools.failed.txt ; exit 77 ; } &
750
- pid = $!
747
+ if ~{localize_bam }; then
748
+ time \
749
+ gcloud storage cp ~{qns_bam } name_does_not_matter.bam
751
750
752
- set +e
753
- count = 1
754
- while true ; do
755
- sleep 1200 && date && source /opt/re-auth.sh
756
- if [[ ${count } -gt 2 ]]; then exit 0 ; fi
757
- if ! pgrep -x -P $pid ; then exit 0 ; fi
758
- count = $(( count + 1 ))
759
- done
751
+ samtools view name_does_not_matter.bam \
752
+ | awk -F '\t' '{print $1}' \
753
+ | uniq -d \
754
+ > "dup_read_names.txt"
755
+
756
+ echo "false" > samtools.failed.txt
757
+ else
758
+ # the way this works is the following:
759
+ # 0) relying on the re-auth.sh script to export the credentials
760
+ # 1) perform the remote sam-view operation in the background
761
+ # 2) listen to the PID of the background process, while re-auth every 1200 seconds
762
+
763
+ # remote grab read names
764
+ echo "false" > samtools.failed.txt
765
+ samtools view ~{qns_bam } \
766
+ | awk -F '\t' '{print $1}' \
767
+ | uniq -d \
768
+ > "dup_read_names.txt" \
769
+ || { echo "true" > samtools.failed.txt ; exit 77 ; } &
770
+ pid = $!
771
+
772
+ set +e
773
+ count = 1
774
+ while true ; do
775
+ sleep 1200 && date && source /opt/re-auth.sh
776
+ if [[ ${count } -gt 2 ]]; then exit 0 ; fi
777
+ if ! pgrep -x -P $pid ; then exit 0 ; fi
778
+ count = $(( count + 1 ))
779
+ done
780
+ fi
760
781
>>>
761
782
783
+ # 5 GiB of base scratch space; room for the whole BAM is added only when it will be copied onto the local disk
+ Int disk_size = 5 + (if (localize_bam ) then ceil (size (qns_bam , "GiB" )) else 0 )
784
+ String disk_type = if (localize_bam ) then "SSD" else "HDD"
762
785
runtime {
763
786
cpu : 1
764
787
memory : "4 GiB"
765
- disks : "local-disk 10 HDD "
788
+ disks : "local-disk ~{ disk_size } ~{ disk_type } "
766
789
preemptible : 2
767
790
maxRetries : 1
768
791
docker : "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
@@ -1736,31 +1759,38 @@ task SplitByRG {
1736
1759
}
1737
1760
1738
1761
parameter_meta {
1739
- bam : "BAM to be split"
1762
+ bam : {
1763
+ description : "BAM to be split" ,
1764
+ localization_optional : true
1765
+ }
1740
1766
out_prefix : "prefix for output bam and bai file names"
1767
+ retain_rgless_records : "flag to save the reads that have no RG tag"
1741
1768
sort_and_index : "if the user wants to (pos-)sort and index the resulting BAMs; this indicates the input BAM is mapped"
1742
1769
1743
1770
split_bam : "the resulting BAMs, each having reads only in a single read group"
1744
1771
split_bai : "the accompanying BAIs, if possible and explicitly requested"
1745
1772
}
1746
1773
1747
- Int disk_size = if defined (num_ssds ) then 375 *select_first ([num_ssds ]) else 1 +3 *ceil (size ([bam ], "GB" ))
1748
-
1749
1774
Array [String ] extra_args = if (retain_rgless_records ) then ["-u" , "~{out_prefix }_noRG.bam" ] else ["" ]
1775
+
1776
+ String local_bam = basename (bam )
1750
1777
command <<<
1751
1778
set -eux
1779
+ time \
1780
+ gcloud storage cp ~{bam } ~{local_bam }
1752
1781
1753
- samtools view -H ~{bam } | grep "^@RG" > "read_groups_header.txt"
1782
+ samtools view -H ~{local_bam } | grep "^@RG" > "read_groups_header.txt"
1754
1783
cat "read_groups_header.txt" | tr '\t' '\n' | grep "^ID:" | awk -F ':' '{print $2}' > "RG_ids.txt"
1755
1784
1756
1785
samtools split -@3 \
1757
1786
-f "~{out_prefix}_%#.bam" \
1758
1787
~{sep =" " extra_args } \
1759
- ~{bam }
1788
+ ~{local_bam }
1789
+ rm ~{local_bam }
1790
+
1760
1791
if ~{sort_and_index } ;
1761
1792
then
1762
1793
# cleanup space for the sorting
1763
- rm ~{bam }
1764
1794
for split_bam in "~{out_prefix}_" *.bam ;
1765
1795
do
1766
1796
mv "${split_bam} " temp.bam
@@ -1780,6 +1810,9 @@ task SplitByRG {
1780
1810
}
1781
1811
1782
1812
#########################
1813
+ Int disk_size = if defined (num_ssds ) then 375 *select_first ([num_ssds ]) else 1 +3 *ceil (size ([bam ], "GB" ))
1814
+ String disk_type = if defined (num_ssds ) then "LOCAL" else "SSD" # IO-bound operation, no HDD please
1815
+
1783
1816
RuntimeAttr default_attr = object {
1784
1817
cpu_cores : 4 ,
1785
1818
mem_gb : 16 ,
@@ -1792,7 +1825,7 @@ task SplitByRG {
1792
1825
runtime {
1793
1826
cpu : select_first ([runtime_attr .cpu_cores , default_attr .cpu_cores ])
1794
1827
memory : select_first ([runtime_attr .mem_gb , default_attr .mem_gb ]) + " GiB"
1795
- disks : "local-disk " + select_first ([runtime_attr .disk_gb , default_attr .disk_gb ]) + " LOCAL "
1828
+ disks : "local-disk " + select_first ([runtime_attr .disk_gb , default_attr .disk_gb ]) + " ~{ disk_type } "
1796
1829
preemptible : select_first ([runtime_attr .preemptible_tries , default_attr .preemptible_tries ])
1797
1830
maxRetries : select_first ([runtime_attr .max_retries , default_attr .max_retries ])
1798
1831
docker : select_first ([runtime_attr .docker , default_attr .docker ])
0 commit comments