Skip to content

Commit 29c570b

Browse files
committed
Two new workflows:
handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
1 parent b487248 commit 29c570b

File tree

3 files changed

+380
-0
lines changed

3 files changed

+380
-0
lines changed

.dockstore.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,12 @@ workflows:
8181
- name: PBMASIsoSeqDemultiplex
8282
subclass: wdl
8383
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
84+
- name: ProcessOnInstrumentDemuxedChunkRefFree
85+
subclass: wdl
86+
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
87+
- name: ProcessOnInstrumentDemuxedChunk
88+
subclass: wdl
89+
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
8490

8591
###################################################
8692
# TechAgnostic - *mics data processing
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
version 1.0
2+
3+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
4+
5+
import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN
6+
7+
import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics
8+
9+
import "../../../tasks/Utility/Finalize.wdl" as FF
10+
11+
workflow ProcessOnInstrumentDemuxedChunk {
12+
13+
meta {
14+
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics."
15+
}
16+
17+
parameter_meta {
18+
19+
#########
20+
# inputs
21+
readgroup_id:
22+
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed"
23+
24+
bam_SM_field:
25+
"value to place in the SM field of the resulting BAM header's @RG line"
26+
27+
platform:
28+
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]"
29+
30+
gcs_out_root_dir:
31+
"output files will be copied over there"
32+
33+
qc_metrics_config_json:
34+
"A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'"
35+
36+
fingerprint_sample_id:
37+
"For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?"
38+
39+
expected_sex_type:
40+
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]"
41+
42+
#########
43+
# outputs
44+
wgs_cov:
45+
"whole genome mean coverage"
46+
47+
nanoplot_summ:
48+
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)"
49+
50+
sam_flag_stats:
51+
"SAM flag stats"
52+
53+
fingerprint_check:
54+
"Summary on (human) fingerprint checking results"
55+
56+
contamination_est:
57+
"cross-(human)individual contamination estimation by VerifyBAMID2"
58+
59+
inferred_sex_info:
60+
"Inferred sex concordance information if expected sex type is provided"
61+
62+
methyl_tag_simple_stats:
63+
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."
64+
65+
aBAM_metrics_files:
66+
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
67+
}
68+
69+
input {
70+
String gcs_out_root_dir
71+
72+
File uBAM
73+
File? uPBI
74+
75+
String readgroup_id
76+
String bam_SM_field
77+
78+
String platform
79+
80+
# args for optional QC subworkflows
81+
File? qc_metrics_config_json
82+
String? fingerprint_sample_id
83+
String? expected_sex_type
84+
85+
File ref_map_file
86+
String disk_type = "SSD"
87+
}
88+
89+
output {
90+
String last_processing_date = today.yyyy_mm_dd
91+
92+
File aligned_bam = FinalizeAlignedBam.gcs_path
93+
File aligned_bai = FinalizeAlignedBai.gcs_path
94+
File aligned_pbi = FinalizeAlignedPbi.gcs_path
95+
96+
String movie = AlignHiFiUBAM.movie
97+
98+
# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs)
99+
Float wgs_cov = QCandMetrics.wgs_cov
100+
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ
101+
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats
102+
103+
# fingerprint
104+
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check
105+
106+
# contam
107+
Float? contamination_est = QCandMetrics.contamination_est
108+
109+
# sex concordance
110+
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info
111+
112+
# methyl
113+
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats
114+
115+
# file-based QC/metrics outputs all packed into a finalization map
116+
Map[String, String] aBAM_metrics_files = QCandMetrics.aBAM_metrics_files
117+
}
118+
119+
###################################################################################
120+
# prep work
121+
122+
# where to store final results
123+
String workflow_name = "ProcessOnInstrumentDemuxedChunk"
124+
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name
125+
126+
# String bc_specific_out = outdir + '/' + readgroup_id
127+
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id
128+
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id
129+
130+
###################################################################################
131+
# align
132+
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input:
133+
uBAM = uBAM,
134+
uPBI = uPBI,
135+
bam_sample_name = bam_SM_field,
136+
ref_map_file = ref_map_file,
137+
application = 'HIFI',
138+
disk_type = disk_type
139+
}
140+
141+
File aBAM = AlignHiFiUBAM.aligned_bam
142+
File aBAI = AlignHiFiUBAM.aligned_bai
143+
File aPBI = AlignHiFiUBAM.aligned_pbi
144+
145+
# save
146+
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' }
147+
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' }
148+
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' }
149+
150+
###################################################################################
151+
# QC
152+
AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json]))
153+
call QCMetrics.Work as QCandMetrics { input:
154+
bam = aBAM,
155+
bai = aBAI,
156+
157+
tech = platform,
158+
159+
cov_bed = qcm_config.cov_bed,
160+
cov_bed_descriptor = qcm_config.cov_bed_descriptor,
161+
162+
fingerprint_vcf_store = qcm_config.fingerprint_vcf_store,
163+
fingerprint_sample_id = fingerprint_sample_id,
164+
165+
expected_sex_type = expected_sex_type,
166+
167+
vbid2_config_json = qcm_config.vbid2_config_json,
168+
169+
methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor,
170+
save_methyl_uncalled_reads = qcm_config.save_methyl_uncalled_reads,
171+
172+
ref_map_file = ref_map_file,
173+
disk_type = disk_type,
174+
175+
output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell
176+
gcs_out_root_dir = bc_specific_metrics_out,
177+
}
178+
179+
###################################################################################
180+
181+
call GU.GetTodayDate as today {}
182+
}
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
version 1.0
2+
3+
import "../../../tasks/Utility/Utils.wdl"
4+
import "../../../tasks/Utility/BAMutils.wdl" as BU
5+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
6+
7+
import "../../../tasks/Visualization/NanoPlot.wdl" as QC0
8+
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1
9+
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2
10+
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3
11+
12+
import "../../../tasks/Utility/Finalize.wdl" as FF
13+
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE
14+
15+
workflow ProcessOnInstrumentDemuxedChunkRefFree {
16+
17+
meta {
18+
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics"
19+
}
20+
21+
parameter_meta {
22+
input_hifi_fq:
23+
"The preexisting Hifi FASTQ; if provided the metrics calculation is faster; if not, you most likely want to set convert_to_fq to true"
24+
25+
readgroup_id:
26+
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>"
27+
28+
short_reads_threshold:
29+
"a length threshold below which reads are classified as short"
30+
31+
bam_descriptor:
32+
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"
33+
34+
run_nanoplot:
35+
"if true, will kick off Nanoplot to collect metrics on the uBAM; this isn't necessary if you also run the alignment version to process the uBAM as Nanoplot is run there automatically and produces a superset of metrics"
36+
37+
convert_to_fq:
38+
"if true, input HiFi uBAM will be converted to FASTQ"
39+
hifi_fq:
40+
"available only if convert_to_fq is true."
41+
42+
nanoplot_u_summ:
43+
"A few metrics output by Nanoplot on the uBAM"
44+
seqkit_stats:
45+
"A few metrics output by seqkit stats"
46+
47+
read_len_summaries:
48+
"A few metrics summarizing the read length distribution"
49+
read_len_peaks:
50+
"Peaks of the read length distribution (heruistic)"
51+
read_len_deciles:
52+
"Deciles of the read length distribution"
53+
54+
methyl_tag_simple_stats:
55+
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."
56+
57+
uBAM_metrics_files:
58+
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
59+
}
60+
61+
input {
62+
File uBAM
63+
String readgroup_id
64+
File? input_hifi_fq
65+
66+
String bam_descriptor
67+
Int short_reads_threshold
68+
69+
Boolean run_nanoplot = false
70+
Boolean convert_to_fq = false
71+
Boolean i_donot_want_fq = false
72+
73+
String disk_type = "SSD"
74+
75+
String gcs_out_root_dir
76+
}
77+
78+
output {
79+
String? movie = movie_name
80+
81+
File? converted_hifi_fq = FinalizeFQ.gcs_path
82+
83+
String last_processing_date = today.yyyy_mm_dd
84+
85+
# todo merge these two together
86+
Map[String, Float] seqkit_stats = select_first([SeqKitOnBAM.stats, SeqKitOnFASTQ.stats])
87+
Map[String, Float]? nanoplot_u_summ = NanoPlotFromUBam.stats_map
88+
89+
Map[String, String] read_len_summaries = DystPeaker.read_len_summaries
90+
Array[Int] read_len_peaks = DystPeaker.read_len_peaks
91+
Array[Int] read_len_deciles = DystPeaker.read_len_deciles
92+
93+
# methylation call rate stats
94+
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats
95+
96+
# file-based outputs all packed into a finalization map
97+
Map[String, String] uBAM_metrics_files = files_out
98+
}
99+
100+
###################################################################################
101+
# prep work
102+
103+
# arg validation
104+
if (defined(input_hifi_fq) == convert_to_fq) {
105+
if (!i_donot_want_fq) {
106+
call Utils.StopWorkflow as MutExProvided { input:
107+
reason = "You most likely want to either convert to FASTQ here, or have a pre-existing FASTQ, but not (miss) both."
108+
}
109+
}
110+
}
111+
112+
# where to store final results
113+
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree"
114+
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name
115+
116+
String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23)
117+
String bc_specific_out = outdir_ref_free + '/' + readgroup_id
118+
String bc_specific_metrics_out = bc_specific_out + "/metrics"
119+
120+
###################################################################################
121+
# convert to FASTQ if there isn't already one
122+
if (!defined(input_hifi_fq) && !i_donot_want_fq) {
123+
call BU.GetReadGroupInfo as RG { input: bam = uBAM, keys = ['PU']}
124+
String movie_name = RG.read_group_info['PU']
125+
126+
###################################################################################
127+
call BU.BamToFastq { input: bam = uBAM, prefix = "does_not_matter"}
128+
call FF.FinalizeToFile as FinalizeFQ { input:
129+
outdir = bc_specific_out,
130+
file = BamToFastq.reads_fq,
131+
name = readgroup_id + '.hifi.fq.gz'
132+
}
133+
}
134+
135+
###################################################################################
136+
# more QCs and metrics
137+
138+
##########
139+
if (run_nanoplot) {
140+
call QC0.NanoPlotFromUBam { input: uBAM = uBAM }
141+
FinalizationManifestLine a = object
142+
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]),
143+
is_singleton_file: false,
144+
destination: bc_specific_metrics_out + "/nanoplot",
145+
output_attribute_name: "nanoplot"}
146+
}
147+
##########
148+
call QC1.CountTheBeans as NoMissingBeans { input:
149+
bam=uBAM,
150+
bam_descriptor=bam_descriptor,
151+
gcs_out_root_dir=bc_specific_metrics_out,
152+
use_local_ssd=disk_type=='LOCAL'
153+
}
154+
Map[String, String] methyl_out = {"missing_methyl_tag_reads":
155+
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']}
156+
##########
157+
call QC2.DystPeaker { input:
158+
input_file=uBAM, input_is_bam=true,
159+
id=readgroup_id, short_reads_threshold=short_reads_threshold,
160+
gcs_out_root_dir=bc_specific_metrics_out
161+
}
162+
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist,
163+
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']}
164+
165+
##########
166+
# seqkit stats, detail depends on if FASTQ is available/desired or not
167+
if (i_donot_want_fq) {
168+
call QC3.FASTQstats as SeqKitOnBAM { input: reads = uBAM, file_type = "BAM" }
169+
}
170+
if (!i_donot_want_fq) {
171+
File use_this_hifi_fq = select_first([input_hifi_fq, BamToFastq.reads_fq])
172+
call QC3.FASTQstats as SeqKitOnFASTQ { input: reads=use_this_hifi_fq, file_type='FASTQ' }
173+
}
174+
175+
###################################################################################
176+
if (run_nanoplot) {
177+
call SAVE.SaveFilestoDestination as SaveRest { input:
178+
instructions = select_all([a]),
179+
already_finalized = select_all([methyl_out,
180+
read_len_out])
181+
}
182+
}
183+
if (!run_nanoplot) {
184+
call GU.MergeMaps { input:
185+
one = methyl_out,
186+
two = read_len_out,
187+
}
188+
}
189+
Map[String, String] files_out = select_first([SaveRest.result, MergeMaps.merged])
190+
191+
call GU.GetTodayDate as today {}
192+
}

0 commit comments

Comments
 (0)