modkitopt/main.nf at main · comprna/modkitopt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/usr/bin/env nextflow

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    modkitopt
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Pipeline to optimise the modkit pileup parameters --filter-threshold and
    --mod-threshold to maximise the sensitivity and precision of nanopore RNA
    modification calls.

    Steps:
     1. Filter & sort modBAM with samtools
     2. Run modkit pileup across modkit parameter combinations
     3. For each modkit parameter combination, compute recall and precision of
        predicted sites using ground truth sites
     4. Determine modkit parameters that give highest recall and precision

    Github : https://github.com/comprna/modkitopt
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

nextflow.enable.dsl = 2

include { INSTALL_ENV      } from './modules/install_env.nf'
include { SAMTOOLS_FILTER  } from './modules/samtools_filter.nf'
include { SAMTOOLS_SORT    } from './modules/samtools_sort.nf'
include { SAMTOOLS_INDEX   } from './modules/samtools_index.nf'
include { MODKIT_PILEUP    } from './modules/modkit_pileup.nf'
include { T2G              } from './modules/t2g.nf'
include { PRECISION_RECALL } from './modules/precision_recall.nf'
include { COMPARE_PARAMS   } from './modules/compare_params.nf'

def helpMessage() {
  log.info """
        Usage:
        The typical command structure for running the pipeline is as follows:
        nextflow run main.nf --modbam sample.bam
                             --mod_type <m6A|pseU|m5C|inosine>
                             --modkit /path/to/modkit
                             --fasta /path/to/transcriptome.fa
                             --annotation /path/to/annotation.gff3
                             -profile <local|pbs|pbspro|slurm>

        Mandatory arguments:
         --modbam             .bam file containing per-read modification calls
         --mod_type           Modification type (options: m6A, pseU, m5C, inosine)
         --modkit             Path to modkit executable
         --fasta              Path to reference transcriptome
         --annotation         Path to corresponding reference annotation (.gtf or .gff3)
         -profile             Execution environment (options: local, pbs, pbspro, slurm)

        Mandatory arguments if running on an HPC system (-profile is pbs, pbspro or slurm):
         --hpc_queue          Name of the queue that Nextflow can schedule jobs to (e.g., 'normal')
         --hpc_project        HPC project code that Nextflow can schedule jobs to (e.g., 'ab12')
         --hpc_storage        HPC storage location that outputs can be written to (e.g., 'gdata/ab12')
         --help               This usage statement

        Optional arguments:
         --truth_sites        .tsv file containing known modification sites (genomic coordinates, expected columns 1 and 2: [chr, pos], mandatory if mod_type is m5C or inosine)
         --top_outdir         Path to preferred output directory (default: 'results')
        """
}

workflow {
    /*
     * =========================================================================
     *  Help message
     * =========================================================================
     */

    if (params.help) {
      helpMessage()
      return
    }

    /*
     * =========================================================================
     *  Install mode
     * =========================================================================
     */

    if (params.install) {
        INSTALL_ENV(true)
        return
    }

    /*
     * =========================================================================
     *  Demo mode
     * =========================================================================
     */

    // Don't override params
    if (params.demo) {
        modbam = "./resources/demo/demo_m6A.bam"
        mod_type = "m6A"
        fasta = "./resources/demo/gencode.v45.transcripts.subset.fa"
        annotation = "./resources/demo/gencode.v45.annotation.subset.gff3"
    } else {
        modbam = params.modbam
        mod_type = params.mod_type
        fasta = params.fasta
        annotation = params.annotation
    }

    /*
     * =========================================================================
     *  Input validation
     * =========================================================================
     */

    if (!params.demo) {

        // Required arguments
        if (!params.modbam) {
            exit 1, "Please provide --modbam (.bam file output by modification caller)"
        }
        if (!params.mod_type) {
            exit 1, "Please provide --mod_type (options: m6A, pseU, m5C, inosine)"
        }
        if (!params.fasta) {
            exit 1, "Please provide the path to the reference transriptome via --fasta"
        }
        if (!params.annotation) {
            exit 1, "Please provide the path to the reference annotation (.gtf or .gff3) via --annotation"
        }
    }

    if (!params.modkit) {
        exit 1, "Please provide the path to modkit via --modkit"
    }

    // Validate modification type
    assert mod_type in ['m6A','pseU','m5C','inosine'], \
        "Modification type not recognised, choose from: m6A, pseU, m5C, inosine"

    // Validate ground truth
    def truth_sites
    if (!params.truth_sites) {

        // If no ground truth sites provided for m6A, then use supplied default
        if (mod_type == "m6A") {
            truth_sites = "./resources/m6A_validated.tsv"
            log.info "No --truth_sites supplied for --mod_type m6A; using default '${truth_sites}'"

        // If no ground truth sites provided for pseU, then use supplied default
        } else if (mod_type == "pseU") {
            truth_sites = "./resources/pseU_validated.tsv"
            log.info "No --truth_sites supplied for --mod_type pseU; using default '${truth_sites}'"

        // Ground truth sites must be provided for mods other than m6A or pseU
        } else {
            exit 1, "ERROR: --truth_sites is required when mod_type is not m6A or pseU"
        }

    } else {
        truth_sites = params.truth_sites
    }

    // HPC parameters must be specified if using pbs, pbspro or slurm
    def hpcProfiles = ['pbs','pbspro','slurm']
    if (workflow.profile in hpcProfiles) {
        if (!params.hpc_project) {
            exit 1, "--hpc_project must be specified when using profile '${workflow.profile}'."
        }
        if (!params.hpc_storage) {
            exit 1, "--hpc_storage must be specified when using profile '${workflow.profile}'."
        }
        if (!params.hpc_queue) {
            exit 1, "--hpc_queue must be specified when using profile '${workflow.profile}'."
        }
    }

    /*
     * =========================================================================
     *  Filter, sort and index BAM
     * =========================================================================
     */

    ch_modbam = channel.fromPath(modbam, checkIfExists: true)
    ch_filtered_bam = SAMTOOLS_FILTER(ch_modbam)
    ch_sorted_bam = SAMTOOLS_SORT(ch_filtered_bam)
    ch_indexed_bam = SAMTOOLS_INDEX(ch_sorted_bam)

    /*
     * =========================================================================
     *  Run modkit across parameter combinations
     * =========================================================================
     */

    // Create a channel of parameter combinations
    ch_modkit_params = channel.from(params.filter_thresholds)
                              .combine(channel.from(params.mod_thresholds))
                              .map { ft, mt -> tuple(ft, mt) }
                              .mix(Channel.value(["default", "default"]))

    // Create a channel for the reference fasta file
    ch_fasta = channel.fromPath(fasta, checkIfExists: true)

    // Combine the BAM and BAM index into a single channel
    ch_bam_index = ch_indexed_bam.indexed_bam.combine(ch_indexed_bam.index)

    // Combine also with the fasta channel
    ch_bam_fasta = ch_bam_index.combine(ch_fasta)

    // Combine also with the modkit parameters channel
    ch_bam_fasta_params = ch_bam_fasta.combine(ch_modkit_params)

    // Combine also with the modification type
    ch_mod_type = Channel.value(mod_type)
    ch_modkit_input = ch_bam_fasta_params.combine(ch_mod_type)

    // Run modkit for each parameter combination
    ch_modkit_output = MODKIT_PILEUP(ch_modkit_input)

    // Merge the modkit output bed file with the modkit parameters
    ch_bed_params = ch_modkit_output.modkit_bed
                                    .merge(ch_modkit_output.filter_threshold)
                                    .merge(ch_modkit_output.mod_threshold)

    /*
     * =========================================================================
     *  Convert transcriptomic to genomic coordinates for comparison with
     *  validated sites
     * =========================================================================
     */

    // Create a channel for the reference annotation
    ch_annotation = channel.fromPath(annotation, checkIfExists: true)

    // Combine the annotation with the bed file channel
    ch_t2g_input = ch_bed_params.combine(ch_annotation)

    // Convert transcriptomic to genomic coordinates
    ch_modkit_genomic = T2G(ch_t2g_input)

    // Merge the genomic bed file with the modkit parameters
    ch_genomic_params = ch_modkit_genomic.bed_genomic
                                         .merge(ch_modkit_genomic.filter_threshold)
                                         .merge(ch_modkit_genomic.mod_threshold)

    /*
     * =========================================================================
     *  Compute recall and precision of called sites, compared against ground
     *  truth sites, across stoichiometry thresholds
     * =========================================================================
     */

    // Create a channel for the ground truth sites
    ch_truth = channel.fromPath(truth_sites, checkIfExists: true)

    // Combine the channels for ground truth sites and modkit sites
    ch_eval_params_input = ch_genomic_params.combine(ch_truth)

    // Evaluate precision and recall
    ch_prec_recall = PRECISION_RECALL(ch_eval_params_input)

    /*
     * =========================================================================
     *  Compare performance of modkit parameters
     * =========================================================================
     */

    // Collect the results across all parameters
    ch_prec_recall_collected = ch_prec_recall.precision_recall.collect()

    ch_best_params = COMPARE_PARAMS(ch_prec_recall_collected)
    ch_best_params.out_string.view()

}