Skip to content

Commit ce90a22

Browse files
committed
Changing utility WDLs:
* make BAM split by readgroup more efficient * make the ONT duplicated reads verification step resilient to HTTP transient errors * new task to verify primrose was run on PacBio BAM * new task to get basecall model from ONT BAM
1 parent 1ac2a31 commit ce90a22

File tree

3 files changed

+147
-28
lines changed

3 files changed

+147
-28
lines changed

wdl/tasks/Utility/BAMutils.wdl

Lines changed: 61 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -716,53 +716,76 @@ task GetDuplicateReadnamesInQnameSortedBam {
716716
}
717717
parameter_meta {
718718
qns_bam: {
719+
description: "Query name sorted BAM to be de-duplicated",
719720
localization_optional: true
720721
}
722+
trial_idx: "the n-th time this is being tried for (start from 1), if this value is >= trial_max, the BAM will be localized and the task will use a persistent SSD instead of persistent HDD."
723+
trial_max: "the max number of attempts to perform the duty by streaming in the BAM; this design together with trial_idx is to prevent call-caching preventing retries."
721724
}
722725
input {
723726
File qns_bam
727+
Int trial_idx = 1
728+
Int trial_max = 3
724729
}
725730

726731
output {
727732
File dup_names_txt = "dup_read_names.txt"
728733
Boolean result_may_be_corrupted = read_boolean("samtools.failed.txt")
729734
}
730735

736+
Boolean localize_bam = trial_idx >= trial_max
737+
731738
command <<<
732-
# the way this works is the following:
733-
# 0) relying on the re-auth.sh script to export the credentials
734-
# 1) perform the remote sam-view subsetting in the background
735-
# 2) listen to the PID of the background process, while re-auth every 1200 seconds
739+
736740
source /opt/re-auth.sh
737741
set -euxo pipefail
738742
739743
# assumption
740744
sort_order=$(samtools view -H ~{qns_bam} | grep "^@HD" | tr '\t' '\n' | grep "^SO:" | awk -F ':' '{print $2}')
741745
if [[ "queryname" != "${sort_order}" ]]; then echo -e "Sort order ${sort_order} isn't the expected 'queryname'." && exit 1; fi
742746
743-
# remote grab read names
744-
echo "false" > samtools.failed.txt
745-
samtools view ~{qns_bam} \
746-
| awk -F '\t' '{print $1}' \
747-
| uniq -d \
748-
> "dup_read_names.txt" \
749-
|| { echo "true" > samtools.failed.txt; exit 77; } &
750-
pid=$!
747+
if ~{localize_bam}; then
748+
time \
749+
gcloud storage cp ~{qns_bam} name_does_not_matter.bam
751750
752-
set +e
753-
count=1
754-
while true; do
755-
sleep 1200 && date && source /opt/re-auth.sh
756-
if [[ ${count} -gt 2 ]]; then exit 0; fi
757-
if ! pgrep -x -P $pid; then exit 0; fi
758-
count=$(( count+1 ))
759-
done
751+
samtools view name_does_not_matter.bam \
752+
| awk -F '\t' '{print $1}' \
753+
| uniq -d \
754+
> "dup_read_names.txt"
755+
756+
echo "false" > samtools.failed.txt
757+
else
758+
# the way this works is the following:
759+
# 0) relying on the re-auth.sh script to export the credentials
760+
# 1) perform the remote sam-view operation in the background
761+
# 2) listen to the PID of the background process, while re-auth every 1200 seconds
762+
763+
# remote grab read names
764+
echo "false" > samtools.failed.txt
765+
samtools view ~{qns_bam} \
766+
| awk -F '\t' '{print $1}' \
767+
| uniq -d \
768+
> "dup_read_names.txt" \
769+
|| { echo "true" > samtools.failed.txt; exit 77; } &
770+
pid=$!
771+
772+
set +e
773+
count=1
774+
while true; do
775+
sleep 1200 && date && source /opt/re-auth.sh
776+
if [[ ${count} -gt 2 ]]; then exit 0; fi
777+
if ! pgrep -x -P $pid; then exit 0; fi
778+
count=$(( count+1 ))
779+
done
780+
fi
760781
>>>
761782

783+
Int disk_size = 5 + (if (localize_bam) then ceil(size(qns_bam, "Gib")) else 0)
784+
String disk_type = if (localize_bam) then "SSD" else "HDD"
762785
runtime {
763786
cpu: 1
764787
memory: "4 GiB"
765-
disks: "local-disk 10 HDD"
788+
disks: "local-disk ~{disk_size} ~{disk_type}"
766789
preemptible: 2
767790
maxRetries: 1
768791
docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
@@ -1736,31 +1759,38 @@ task SplitByRG {
17361759
}
17371760

17381761
parameter_meta {
1739-
bam: "BAM to be split"
1762+
bam: {
1763+
description: "BAM to be split",
1764+
localization_optional: true
1765+
}
17401766
out_prefix: "prefix for output bam and bai file names"
1767+
retain_rgless_records: "flag to save the reads that have no RG tag"
17411768
sort_and_index: "if the user wants to (pos-)sort and index the resulting BAMs; this indicates the input BAM is mapped"
17421769

17431770
split_bam: "the resulting BAMs, each having reads only in a single read group"
17441771
split_bai: "the accompanying BAIs, if possible and explicit requested"
17451772
}
17461773

1747-
Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+3*ceil(size([bam], "GB"))
1748-
17491774
Array[String] extra_args = if (retain_rgless_records) then ["-u", "~{out_prefix}_noRG.bam"] else [""]
1775+
1776+
String local_bam = basename(bam)
17501777
command <<<
17511778
set -eux
1779+
time \
1780+
gcloud storage cp ~{bam} ~{local_bam}
17521781
1753-
samtools view -H ~{bam} | grep "^@RG" > "read_groups_header.txt"
1782+
samtools view -H ~{local_bam} | grep "^@RG" > "read_groups_header.txt"
17541783
cat "read_groups_header.txt" | tr '\t' '\n' | grep "^ID:" | awk -F ':' '{print $2}' > "RG_ids.txt"
17551784
17561785
samtools split -@3 \
17571786
-f "~{out_prefix}_%#.bam" \
17581787
~{sep=" " extra_args} \
1759-
~{bam}
1788+
~{local_bam}
1789+
rm ~{local_bam}
1790+
17601791
if ~{sort_and_index} ;
17611792
then
17621793
# cleanup space for the sorting
1763-
rm ~{bam}
17641794
for split_bam in "~{out_prefix}_"*.bam;
17651795
do
17661796
mv "${split_bam}" temp.bam
@@ -1780,6 +1810,9 @@ task SplitByRG {
17801810
}
17811811

17821812
#########################
1813+
Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+3*ceil(size([bam], "GB"))
1814+
String disk_type = if defined(num_ssds) then "LOCAL" else "SSD" # IO-bound operation, no HDD please
1815+
17831816
RuntimeAttr default_attr = object {
17841817
cpu_cores: 4,
17851818
mem_gb: 16,
@@ -1792,7 +1825,7 @@ task SplitByRG {
17921825
runtime {
17931826
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
17941827
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
1795-
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
1828+
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}"
17961829
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
17971830
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
17981831
docker: select_first([runtime_attr.docker, default_attr.docker])

wdl/tasks/Utility/ONTUtils.wdl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,46 @@ task DeduplicateBam {
321321
docker: select_first([runtime_attr.docker, default_attr.docker])
322322
}
323323
}
324+
325+
task GetBasecallModel {
    meta {
        # fixed typo: 'desciption' -> 'description' so WDL tooling picks up the text
        description: "Getting the basecall model string of an ONT BAM"
    }
    parameter_meta {
        bam: {
            description: "BAM to operate on",
            localization_optional: true
        }
        runid_2_model: "The basecall model for each run."
    }
    input {
        File bam
    }
    output {
        # two-column TSV: run id -> basecall model string
        Map[String, String] runid_2_model = read_map("results.tsv")
    }

    command <<<
        set -eux

        # split declaration from assignment so a gcloud failure isn't masked
        # by the export's own exit status (ShellCheck SC2155)
        GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        export GCS_OAUTH_TOKEN

        # one @RG header line per row; grep exits non-zero (failing the task)
        # if the BAM has no read groups, which is the desired behavior here
        samtools view -H ~{bam} | grep "^@RG" > one_rg_per_line.txt

        # ensure the declared output file exists even before the first append
        touch results.tsv
        while IFS= read -r line
        do
            # unpack the DS field of one @RG line into key=value pairs, one per line
            echo "$line" | tr '\t' '\n' | grep "^DS:" | sed "s/^DS://" | tr ' ' '\n' > tmp.txt
            runid=$(grep "^runid=" tmp.txt | awk -F '=' '{print $2}')
            model=$(grep "^basecall_model=" tmp.txt | awk -F '=' '{print $2}')
            echo -e "${runid}\t${model}" >> results.tsv
        done < one_rg_per_line.txt
    >>>

    runtime {
        cpu: 1
        memory: "4 GiB"
        disks: "local-disk 10 HDD"
        preemptible: 2
        maxRetries: 1
        docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

wdl/tasks/Utility/PBUtils.wdl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,3 +1287,46 @@ task SummarizePBI {
12871287
docker: select_first([runtime_attr.docker, default_attr.docker])
12881288
}
12891289
}
1290+
1291+
# todo: primrose is rebranded as jasmine, take care of that later
1292+
# todo: primrose is rebranded as jasmine, take care of that later
task VerifyPacBioBamHasAppropriatePrimroseRuns {
    meta {
        # fixed typo: 'desciption' -> 'description' so WDL tooling picks up the text
        description: "Verify that a PacBio's BAM has primrose run on all its read groups"
    }
    parameter_meta {
        bam: "GCS path to the BAM; streamed remotely via samtools, never localized"
        readgroups_missing_primrose: "movies (read-group PU values) with no matching primrose @PG record; empty when all read groups were processed"
    }
    input {
        String bam
    }

    output {
        Array[String] readgroups_missing_primrose = read_lines("movies_without_primrose.txt")
    }

    command <<<
        set -eux

        # split declaration from assignment so a gcloud failure isn't masked
        # by the export's own exit status (ShellCheck SC2155)
        GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        export GCS_OAUTH_TOKEN
        samtools view -H ~{bam} > header.txt

        # get read groups' movies (the PU field of each @RG line)
        grep "^@RG" header.txt | tr '\t' '\n' | grep "^PU:" | awk -F ':' '{print $2}' | sort > readgroup.movies.txt
        cat readgroup.movies.txt

        # get primrose PG command lines; tolerate a BAM where primrose was never
        # run at all -- an empty file then flags every movie below, rather than
        # crashing the task on the failing grep pipeline under 'set -e'
        grep "^@PG" header.txt | grep -F 'ID:primrose' | tr '\t' '\n' | grep '^CL:' > primrose.pg.lines.txt || true
        tr ' ' '\n' < primrose.pg.lines.txt

        touch movies_without_primrose.txt
        while IFS= read -r readgroup; do
            # -F --: movie names are literal strings, not regexes
            if ! grep -qF -- "${readgroup}" primrose.pg.lines.txt; then echo "${readgroup}" >> movies_without_primrose.txt; fi
        done < readgroup.movies.txt
    >>>

    runtime {
        cpu: 1
        memory: "4 GiB"
        disks: "local-disk 10 HDD"
        preemptible: 2
        maxRetries: 1
        docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

0 commit comments

Comments
 (0)