diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 1031ab7..921f575 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,6 +1,6 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config/samples.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in # units.tsv dataset for testing this workflow with single end reads: @@ -20,9 +20,9 @@ resources: release: 101 # Genome build build: GRCh38 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -45,7 +45,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -53,6 +53,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv index 3bb0b81..d01ea7d 100644 --- a/.test/config/samples.tsv +++ b/.test/config/samples.tsv @@ -6,36 +6,35 @@ D E2 batch2 AJ ERa E TNFa batch1 AK ERa F TNFa batch2 AL ERa G E2_TNFa batch1 AM ERa -H E2_TNFa batch2 AN ERa -I Veh batch1 AG p65 -J Veh batch2 AH p65 -K E2 batch1 AI p65 -L E2 batch2 AJ p65 -M TNFa batch1 AK p65 -N TNFa batch2 AL p65 -O E2_TNFa batch1 AM p65 -P E2_TNFa batch2 AN p65 -Q Veh batch1 AG FoxA1 -R Veh batch2 AH FoxA1 -S E2 batch1 AI FoxA1 -T E2 batch2 AJ FoxA1 -U TNFa batch1 AK FoxA1 -V TNFa batch2 AL FoxA1 -W E2_TNFa batch1 AM FoxA1 -X E2_TNFa batch2 AM FoxA1 +H Veh batch1 AG p65 +I Veh batch2 AH p65 +J E2 batch1 AI p65 +K E2 batch2 AJ p65 +L TNFa batch1 AK p65 +M TNFa batch2 AL p65 +N E2_TNFa batch1 AM p65 +O E2_TNFa batch2 AN p65 +P Veh batch1 AG FoxA1 +Q Veh batch2 AH FoxA1 +R E2 batch1 AI FoxA1 +S E2 batch2 AJ FoxA1 +T TNFa batch1 AK FoxA1 +U TNFa batch2 AL FoxA1 +V E2_TNFa batch1 AM FoxA1 +W E2_TNFa batch2 AM FoxA1 +X E2_TNFa batch1 AM ERa Y E2_TNFa batch1 AM ERa Z E2_TNFa batch1 AM ERa AA E2_TNFa batch1 AM ERa -AB E2_TNFa batch1 AM ERa +AB E2_TNFa batch2 AN ERa AC E2_TNFa batch2 AN ERa AD E2_TNFa batch2 AN ERa AE E2_TNFa batch2 AN ERa -AF E2_TNFa batch2 AN ERa -AG Veh batch1 -AH Veh batch2 -AI E2 batch1 -AJ E2 batch2 -AK TNFa batch1 -AL TNFa batch2 -AM E2_TNFa batch1 -AN E2_TNFa batch2 +AF Veh batch1 +AG Veh batch2 +AH E2 batch1 +AI E2 batch2 +AJ TNFa batch1 +AK TNFa batch2 +AL E2_TNFa batch1 +AM E2_TNFa batch2 diff --git a/.test/config/units.tsv b/.test/config/units.tsv index 857731c..ede8e79 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -1,41 +1,41 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 SRR1635443 ILLUMINA -B 1 SRR1635444 ILLUMINA -C 1 300 14 SRR1635445 ILLUMINA -D 1 SRR1635446 ILLUMINA -E 1 SRR1635447 ILLUMINA -F 1 SRR1635448 ILLUMINA -G 1 SRR1635449 ILLUMINA -H 2 SRR1635450 ILLUMINA -I 1 SRR1635451 ILLUMINA -J 2 SRR1635452 ILLUMINA -K 1 SRR1635453 ILLUMINA -L 2 SRR1635454 ILLUMINA -M 1 SRR1635455 ILLUMINA -N 2 SRR1635456 ILLUMINA -O 1 SRR1635457 ILLUMINA -P 2 SRR1635458 ILLUMINA -Q 1 SRR1635459 ILLUMINA -R 2 SRR1635460 ILLUMINA -S 1 SRR1635461 ILLUMINA -T 2 SRR1635462 ILLUMINA -U 1 SRR1635463 ILLUMINA -V 2 SRR1635464 ILLUMINA -W 1 SRR1635465 ILLUMINA -X 2 SRR1635466 ILLUMINA -Y 1 SRR1635467 ILLUMINA -Z 2 SRR1635468 ILLUMINA -AA 1 SRR1635469 ILLUMINA -AB 2 SRR1635470 ILLUMINA -AC 1 SRR1635471 ILLUMINA -AD 2 SRR1635472 ILLUMINA -AE 1 SRR1635473 ILLUMINA -AF 2 SRR1635474 ILLUMINA -AG 1 SRR1635435 ILLUMINA -AH 2 SRR1635436 ILLUMINA -AI 1 SRR1635437 ILLUMINA -AJ 2 SRR1635438 ILLUMINA -AK 1 SRR1635439 ILLUMINA -AL 2 SRR1635440 ILLUMINA -AM 1 SRR1635441 ILLUMINA -AN 2 SRR1635442 ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 SRR1635443 ILLUMINA +B 1 SRR1635444 ILLUMINA +C 1 SRR1635445 ILLUMINA +D 1 SRR1635446 ILLUMINA +E 1 SRR1635447 ILLUMINA +F 1 SRR1635448 ILLUMINA +G 1 SRR1635449 ILLUMINA +G 2 SRR1635450 ILLUMINA +H 1 SRR1635451 ILLUMINA +I 1 SRR1635452 ILLUMINA +J 1 SRR1635453 ILLUMINA +K 1 SRR1635454 ILLUMINA +L 1 SRR1635455 ILLUMINA +M 1 SRR1635456 ILLUMINA +N 1 SRR1635457 ILLUMINA +O 1 SRR1635458 ILLUMINA +P 1 SRR1635459 ILLUMINA +Q 1 SRR1635460 ILLUMINA +R 1 SRR1635461 ILLUMINA +S 1 SRR1635462 ILLUMINA +T 1 SRR1635463 ILLUMINA +U 1 SRR1635464 ILLUMINA +V 1 SRR1635465 ILLUMINA +W 1 SRR1635466 ILLUMINA +X 1 SRR1635467 ILLUMINA +Y 1 SRR1635468 ILLUMINA +Z 1 SRR1635469 ILLUMINA +AA 1 SRR1635470 ILLUMINA +AB 1 SRR1635471 ILLUMINA +AC 1 SRR1635472 ILLUMINA +AD 1 SRR1635473 ILLUMINA +AE 1 SRR1635474 ILLUMINA +AF 1 SRR1635435 ILLUMINA +AG 1 SRR1635436 ILLUMINA +AH 1 SRR1635437 ILLUMINA +AI 1 SRR1635438 ILLUMINA +AJ 1 SRR1635439 ILLUMINA +AK 1 SRR1635440 ILLUMINA +AL 1 SRR1635441 ILLUMINA +AM 1 SRR1635442 ILLUMINA diff --git a/.test/config_paired_end/config.yaml b/.test/config_paired_end/config.yaml index 280c1cb..7170824 100644 --- a/.test/config_paired_end/config.yaml +++ b/.test/config_paired_end/config.yaml @@ -1,6 +1,6 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_paired_end_reduced/samples.tsv" units: "config_paired_end_reduced/units.tsv" single_end: False @@ -17,9 +17,9 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data only chromosome 21 is selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -42,7 +42,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -50,6 +50,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_paired_end/units.tsv b/.test/config_paired_end/units.tsv index 2b3de2d..f36af94 100644 --- a/.test/config_paired_end/units.tsv +++ b/.test/config_paired_end/units.tsv @@ -1,7 +1,7 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 data/atacseq/test-datasets/testdata/SRR1822153_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822153_2.fastq.gz ILLUMINA -B 1 data/atacseq/test-datasets/testdata/SRR1822154_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822154_2.fastq.gz ILLUMINA -C 1 300 14 data/atacseq/test-datasets/testdata/SRR1822157_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822157_2.fastq.gz ILLUMINA -D 1 data/atacseq/test-datasets/testdata/SRR1822158_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822158_2.fastq.gz ILLUMINA -E 1 data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA -F 1 data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 data/atacseq/test-datasets/testdata/SRR1822153_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822153_2.fastq.gz ILLUMINA +B 1 data/atacseq/test-datasets/testdata/SRR1822154_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822154_2.fastq.gz ILLUMINA +C 1 data/atacseq/test-datasets/testdata/SRR1822157_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822157_2.fastq.gz ILLUMINA +D 1 data/atacseq/test-datasets/testdata/SRR1822158_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822158_2.fastq.gz ILLUMINA +E 1 data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA +F 1 data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA diff --git a/.test/config_paired_end_reduced/config.yaml b/.test/config_paired_end_reduced/config.yaml index 6663e58..e8dde9a 100644 --- a/.test/config_paired_end_reduced/config.yaml +++ b/.test/config_paired_end_reduced/config.yaml @@ -1,6 +1,6 @@ # This file should contain everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_paired_end_reduced/samples.tsv" units: "config_paired_end_reduced/units.tsv" single_end: False @@ -17,9 +17,9 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: VII - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -42,7 +42,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: False peak-annotation-analysis: activate: True @@ -50,6 +50,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_paired_end_reduced/units.tsv b/.test/config_paired_end_reduced/units.tsv index 53324be..87687ea 100644 --- a/.test/config_paired_end_reduced/units.tsv +++ b/.test/config_paired_end_reduced/units.tsv @@ -1,6 +1,6 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 data/paired_end_test_data/A-1_vii_1.fastq.gz data/paired_end_test_data/A-1_vii_2.fastq.gz ILLUMINA -B 1 data/paired_end_test_data/B-1_vii_1.fastq.gz data/paired_end_test_data/B-1_vii_2.fastq.gz ILLUMINA -C 1 300 14 data/paired_end_test_data/C-1_vii_1.fastq.gz data/paired_end_test_data/C-1_vii_2.fastq.gz ILLUMINA -D 1 data/paired_end_test_data/D-1_vii_1.fastq.gz data/paired_end_test_data/D-1_vii_2.fastq.gz ILLUMINA -E 1 data/paired_end_test_data/E-1_vii_1.fastq.gz data/paired_end_test_data/E-1_vii_2.fastq.gz ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 data/paired_end_test_data/A-1_vii_1.fastq.gz data/paired_end_test_data/A-1_vii_2.fastq.gz ILLUMINA +B 1 data/paired_end_test_data/B-1_vii_1.fastq.gz data/paired_end_test_data/B-1_vii_2.fastq.gz ILLUMINA +C 1 data/paired_end_test_data/C-1_vii_1.fastq.gz data/paired_end_test_data/C-1_vii_2.fastq.gz ILLUMINA +D 1 data/paired_end_test_data/D-1_vii_1.fastq.gz data/paired_end_test_data/D-1_vii_2.fastq.gz ILLUMINA +E 1 data/paired_end_test_data/E-1_vii_1.fastq.gz data/paired_end_test_data/E-1_vii_2.fastq.gz ILLUMINA diff --git a/.test/config_single_end/config.yaml b/.test/config_single_end/config.yaml index 947b992..60d4e03 100644 --- a/.test/config_single_end/config.yaml +++ b/.test/config_single_end/config.yaml @@ -1,6 +1,6 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_single_end/samples.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in # units.tsv dataset for testing this workflow with single end reads: @@ -20,9 +20,9 @@ resources: release: 101 # Genome build build: GRCh38 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -45,7 +45,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -53,6 +53,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_single_end/units.tsv b/.test/config_single_end/units.tsv index bbd8e31..f18818d 100644 --- a/.test/config_single_end/units.tsv +++ b/.test/config_single_end/units.tsv @@ -1,7 +1,7 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 SRR1635455 ILLUMINA -B 1 SRR1635456 ILLUMINA -C 1 SRR1635457 ILLUMINA -C 2 SRR1635458 ILLUMINA -D 1 SRR1635439 ILLUMINA -E 1 SRR1635441 ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 SRR1635455 ILLUMINA +B 1 SRR1635456 ILLUMINA +C 1 SRR1635457 ILLUMINA +C 2 SRR1635458 ILLUMINA +D 1 SRR1635439 ILLUMINA +E 1 SRR1635441 ILLUMINA diff --git a/.test/config_single_end_reduced/config.yaml b/.test/config_single_end_reduced/config.yaml index 15630e3..18c9380 100644 --- a/.test/config_single_end_reduced/config.yaml +++ b/.test/config_single_end_reduced/config.yaml @@ -1,6 +1,6 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_single_end_reduced/samples.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in # units.tsv dataset for testing this workflow with single end reads: @@ -20,9 +20,9 @@ resources: release: 101 # Genome build build: GRCh38 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: 21 - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -45,7 +45,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -53,6 +53,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_single_end_reduced/units.tsv b/.test/config_single_end_reduced/units.tsv index a652722..0ddd560 100644 --- a/.test/config_single_end_reduced/units.tsv +++ b/.test/config_single_end_reduced/units.tsv @@ -1,7 +1,7 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 data/single_end_test_data/A-1_chr21.fastq.gz ILLUMINA -B 1 data/single_end_test_data/B-1_chr21.fastq.gz ILLUMINA -C 1 data/single_end_test_data/C-1_chr21.fastq.gz ILLUMINA -C 2 data/single_end_test_data/C-2_chr21.fastq.gz ILLUMINA -D 1 data/single_end_test_data/D-1_chr21.fastq.gz ILLUMINA -E 1 data/single_end_test_data/E-1_chr21.fastq.gz ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 data/single_end_test_data/A-1_chr21.fastq.gz ILLUMINA +B 1 data/single_end_test_data/B-1_chr21.fastq.gz ILLUMINA +C 1 data/single_end_test_data/C-1_chr21.fastq.gz ILLUMINA +C 2 data/single_end_test_data/C-2_chr21.fastq.gz ILLUMINA +D 1 data/single_end_test_data/D-1_chr21.fastq.gz ILLUMINA +E 1 data/single_end_test_data/E-1_chr21.fastq.gz ILLUMINA diff --git a/README.md b/README.md index 40495a0..39b49bf 100644 --- a/README.md +++ b/README.md @@ -1,104 +1,12 @@ # Snakemake workflow: chipseq -[![Snakemake](https://img.shields.io/badge/snakemake-≥5.14.0-brightgreen.svg)](https://snakemake.bitbucket.io) -[![Build Status](https://travis-ci.org/snakemake-workflows/chipseq.svg?branch=master)](https://travis-ci.org/snakemake-workflows/chipseq) +[![Snakemake](https://img.shields.io/badge/snakemake-≥6.4.0-brightgreen.svg)](https://snakemake.github.io) +[![GitHub actions status](https://github.com/snakemake-workflows/chipseq/workflows/Tests/badge.svg?branch=master)](https://github.com/snakemake-workflows/chipseq/actions?query=branch%3Amaster+workflow%3ATests) -This is the template for a new Snakemake workflow. Replace this text with a comprehensive description covering the purpose and domain. -Insert your code into the respective folders, i.e. `scripts`, `rules`, and `envs`. Define the entry point of the workflow in the `Snakefile` and the main configuration in the `config.yaml` file. - -## Authors - -* Antonie Vietor (@AntonieV) +This workflow is a Snakemake port of the [nextflow chipseq pipeline](https://nf-co.re/chipseq) and performs ChIP-seq peak-calling, QC and differential analysis. ## Usage -If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above). - -### Step 1: Obtain a copy of this workflow - -1. Create a new github repository using this workflow [as a template](https://help.github.com/en/articles/creating-a-repository-from-a-template). -2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the newly created repository to your local system, into the place where you want to perform the data analysis. - -### Step 2: Configure workflow - -Configure the workflow according to your needs via editing the files in the `config/` folder. Adjust `config.yaml` to configure the workflow execution, and `samples.tsv` to specify your sample setup. - -### Step 3: Install Snakemake - -Install Snakemake using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html): - - conda create -c bioconda -c conda-forge -n snakemake snakemake - -For installation details, see the [instructions in the Snakemake documentation](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html). - -### Step 4: Execute workflow - -Activate the conda environment: - - conda activate snakemake - -Test your configuration by performing a dry-run via - - snakemake --use-conda -n - -Execute the workflow locally via - - snakemake --use-conda --cores $N - -using `$N` cores or run it in a cluster environment via - - snakemake --use-conda --cluster qsub --jobs 100 - -or - - snakemake --use-conda --drmaa --jobs 100 - -If you not only want to fix the software stack but also the underlying OS, use - - snakemake --use-conda --use-singularity - -in combination with any of the modes above. -See the [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/executable.html) for further details. - -### Step 5: Investigate results - -After successful execution, you can create a self-contained interactive HTML report with all results via: - - snakemake --report report.html - -This report can, e.g., be forwarded to your collaborators. -An example (using some trivial test data) can be seen [here](https://cdn.rawgit.com/snakemake-workflows/rna-seq-kallisto-sleuth/master/.test/report.html). - -### Step 6: Commit changes - -Whenever you change something, don't forget to commit the changes back to your github copy of the repository: - - git commit -a - git push - -### Step 7: Obtain updates from upstream - -Whenever you want to synchronize your workflow copy with new developments from upstream, do the following. - -1. Once, register the upstream repository in your local copy: `git remote add -f upstream git@github.com:snakemake-workflows/chipseq.git` or `git remote add -f upstream https://github.com/snakemake-workflows/chipseq.git` if you do not have setup ssh keys. -2. Update the upstream version: `git fetch upstream`. -3. Create a diff with the current version: `git diff HEAD upstream/master workflow > upstream-changes.diff`. -4. Investigate the changes: `vim upstream-changes.diff`. -5. Apply the modified diff via: `git apply upstream-changes.diff`. -6. Carefully check whether you need to update the config files: `git diff HEAD upstream/master config`. If so, do it manually, and only where necessary, since you would otherwise likely overwrite your settings and samples. - - -### Step 8: Contribute back - -In case you have also changed or added steps, please consider contributing them back to the original repository: - -1. [Fork](https://help.github.com/en/articles/fork-a-repo) the original repo to a personal or lab account. -2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the fork to your local system, to a different place than where you ran your analysis. -3. Copy the modified files from your analysis to the clone of your fork, e.g., `cp -r workflow path/to/fork`. Make sure to **not** accidentally copy config file contents or sample sheets. Instead, manually update the example config files if necessary. -4. Commit and push your changes to your fork. -5. Create a [pull request](https://help.github.com/en/articles/creating-a-pull-request) against the original repository. - -## Testing - -Test cases are in the subfolder `.test`. They are automatically executed via continuous integration with [Github Actions](https://github.com/features/actions). +The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows/chipseq). +If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). diff --git a/config/README.md b/config/README.md new file mode 100644 index 0000000..ed07b51 --- /dev/null +++ b/config/README.md @@ -0,0 +1,60 @@ + +# General settings +To configure this workflow, modify ``config/config.yaml`` according to your needs, following the explanations provided in the file. + +# Sample sheet + +Add samples to `config/samples.tsv`. For each sample, the columns `sample`, `group`, `control`, and `antibody` have to be defined. +* Samples / IP (immunoprecipitations) within the same `group` represent replicates and must have the same antibody and the same control. +* Controls / Input are listed like samples, but they do not have entries in the columns for `control` and `antibody`. +* The identifiers of each control has to be noted in the column `sample`. +* For all samples, the identifiers of the corresponding controls have to be given in the `control` column (see example below). + +**Sample sheet example**: +* Samples / IP: A, B and C +* Controls / Input: D and E + +| sample | group | batch_effect | control | antibody | +|--------|--------|--------------|---------|----------| +| A | TNFa | batch1 | D | p65 | +| B | TNFa | batch2 | D | p65 | +| C | E2TNFa | batch1 | E | p65 | +| D | TNFa | batch1 | | | +| E | E2TNFa | batch1 | | | + +# Unit sheet + +For each sample, add one or more sequencing units (runs or lanes) to the unit sheet `config/units.tsv`. For each unit, the columns `sample`, `unit`, `platform` and either `fq1` (single-end reads) or `fq1` and `fq2` (paired-end reads) or `sra_accession` have to be defined. +* Each unit has a name specified in column `unit`, which can be e.g. a running number, or an actual run, lane or replicate id. +* Each unit has a `sample` name, which associates it with the biological sample it comes from. +* For single-end reads define for each unit either a path to FASTQ file (column `fq1`) or define an SRA (sequence read archive) accession (starting with e.g. ERR or SRR) by using the column `sra_accession`. +* For paired-end reads define for each unit either two paths to FASTQ files (columns `fq1`, `fq2`) or define an SRA accession (column `sra_accession`). +* In case SRA accession numbers are used, the pipeline will automatically download the corresponding reads from SRA. If both local files and SRA accession are available, the local files will be preferred. +* The platform column needs to contain the used sequencing platform (one of 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'ONT', 'PACBIO’). + +**Unit sheet example for single-end reads:** + +| sample | unit | fq1 | fq2 | sra_accession | platform | +|--------|------|----------------------|-----|---------------|----------| +| A | 1 | data/A-run1.fastq.gz | | | ILLUMINA | +| B | 1 | data/B-run1.fastq.gz | | | ILLUMINA | +| B | 2 | data/B-run2.fastq.gz | | | ILLUMINA | +| C | 1 | data/C-run1.fastq.gz | | | ILLUMINA | + +**Unit sheet example for paired-end reads:** + +| sample | unit | fq1 | fq2 | sra_accession | platform | +|--------|------|------------------------|------------------------|---------------|----------| +| A | 1 | data/A-run1_1.fastq.gz | data/A-run1_2.fastq.gz | | ILLUMINA | +| B | 1 | data/B-run1_1.fastq.gz | data/B-run1_2.fastq.gz | | ILLUMINA | +| B | 2 | data/B-run2_1.fastq.gz | data/B-run1_2.fastq.gz | | ILLUMINA | +| C | 1 | data/C-run1_1.fastq.gz | data/C-run1_2.fastq.gz | | ILLUMINA | + +**Unit sheet example with SRA download (single-end or paired-end reads):** + +| sample | unit | fq1 | fq2 | sra_accession | platform | +|--------|------|-----|-----|---------------|----------| +| A | 1 | | | SRR1635456 | ILLUMINA | +| B | 1 | | | SRR1635457 | ILLUMINA | +| B | 2 | | | SRR1635458 | ILLUMINA | +| C | 1 | | | SRR1635439 | ILLUMINA | diff --git a/config/config.yaml b/config/config.yaml index 4f90f2b..b4a0681 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,8 +1,9 @@ -# This file should contain everything to configure the workflow on a global scale. -# In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# This file contains everything to configure the workflow on a global scale. +# The sample based data must be complemented by a samples.tsv file that contains +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config/samples.tsv" -units: "config/units.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in units.tsv +# The source of fastq files for every sequencing unit of all samples has to be provided in the units.tsv file. +units: "config/units.tsv" single_end: False resources: @@ -16,9 +17,9 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data only chromosome 21 is selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -41,7 +42,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: False peak-annotation-analysis: activate: True @@ -49,13 +50,21 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), - # the -L option is automatically activated if a path to a blacklist of the given genome exists in "config/igenomes.yaml" or has been entered there + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/config/samples.tsv b/config/samples.tsv index 42fab3a..d9e2b76 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,5 +1 @@ sample group batch_effect control antibody -A treated batch1 D BCATENIN -B untreated batch2 D BCATENIN -C treated batch1 D TCF4 -D untreated batch2 diff --git a/config/units.tsv b/config/units.tsv index 7da20e5..2907c2d 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1,6 +1 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 resources/reads/a.chr21.1.fq resources/reads/a.chr21.2.fq ILLUMINA -B 1 resources/reads/b.chr21.1.fq resources/reads/b.chr21.2.fq ILLUMINA -B 2 300 14 resources/reads/b.chr21.1.fq ILLUMINA -C 1 resources/reads/a.chr21.1.fq resources/reads/a.chr21.2.fq ILLUMINA -D 1 resources/reads/b.chr21.1.fq resources/reads/b.chr21.2.fq ILLUMINA +sample unit fq1 fq2 sra_accession platform diff --git a/workflow/Snakefile b/workflow/Snakefile index fa54668..a434099 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -22,6 +22,7 @@ include: "rules/utils.smk" include: "rules/post-analysis.smk" include: "rules/peak_analysis.smk" include: "rules/consensus_peak_analysis.smk" +include: "rules/igv_session.smk" rule all: input: all_input diff --git a/workflow/envs/zip.yaml b/workflow/envs/zip.yaml new file mode 100644 index 0000000..9baef16 --- /dev/null +++ b/workflow/envs/zip.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - defaults +dependencies: + - zip ==3.0 diff --git a/workflow/report/igv_session.rst b/workflow/report/igv_session.rst new file mode 100644 index 0000000..d95a3ff --- /dev/null +++ b/workflow/report/igv_session.rst @@ -0,0 +1,7 @@ +The IGV session can be downloaded here. It contains all files necessary for the session. +Unzip the directory 'report_igv_session.zip'. +`Install IGV `_ and open session by selecting +'report_igv_session.xml'. + +Alternatively you can also start the IGV session directly from the results of the workflow. +The session file is located in 'results/IGV/igv_session.xml'. diff --git a/workflow/report/multiqc_report.rst b/workflow/report/multiqc_report.rst new file mode 100644 index 0000000..a409cbb --- /dev/null +++ b/workflow/report/multiqc_report.rst @@ -0,0 +1,2 @@ +The **MultiQC** report is a collection of multiple plots, stats and metrics from phantompeakqualtools (Chip-seq processing), Preseq, Picard, Samtools and FastQC. +For detailed descriptions of the individual plots and statistics, load the MultiQC report by clicking on it. diff --git a/workflow/report/plot_annotatepeaks_homer.rst b/workflow/report/plot_annotatepeaks_homer.rst index e69de29..251eb30 100644 --- a/workflow/report/plot_annotatepeaks_homer.rst +++ b/workflow/report/plot_annotatepeaks_homer.rst @@ -0,0 +1,2 @@ +`Homer annotatePeaks `_ assigns known genomic features to peaks. +For each sample-control pair, plots show the peak locations relative to annotated features, the percentage of unique genes to closest peak and the peak distribution relative to TSS (Transcription Start Site). diff --git a/workflow/report/plot_annotatepeaks_summary_homer.rst b/workflow/report/plot_annotatepeaks_summary_homer.rst index 47826cc..d417353 100644 --- a/workflow/report/plot_annotatepeaks_summary_homer.rst +++ b/workflow/report/plot_annotatepeaks_summary_homer.rst @@ -1 +1,3 @@ -**HOMER** peak annotation summary plot is generated by calculating the proportion of {{snakemake.config["params"]["peak-analysis"]}} peaks assigned to genomic features by `HOMER annotatePeaks.pl `_. +**HOMER** peak annotation summary plot is generated by calculating the proportion of +{{snakemake.config["params"]["peak-analysis"]}} peaks assigned to genomic features by +`HOMER annotatePeaks.pl `_. diff --git a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst index 8d1c8b6..1dbc07d 100644 --- a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst +++ b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst @@ -1 +1,8 @@ - +`Plot of base distribution per sequencing cycle +`_. +This **Picard** tool shows the nucleotide distribution per sequencing cycle of the bam files after filtering, sorting, merging and removing orphans. +For any sequencing cycle within reads, the relative proportions of nucleotides should reflect the AT:CG content. +For all nucleotides, flat lines would be expected and any spikes suggest a systematic sequencing error. +For more information about `collected Picard metrics +`_ please +see the `documentation `_. diff --git a/workflow/report/plot_consensus_peak_intersect.rst b/workflow/report/plot_consensus_peak_intersect.rst index 652e3f1..3fdb52b 100644 --- a/workflow/report/plot_consensus_peak_intersect.rst +++ b/workflow/report/plot_consensus_peak_intersect.rst @@ -1 +1,2 @@ -**MACS2 and bedtools** merged consensus {{ snakemake.wildcards.peak }} peaks plot is generated by calculating the proportion of intersection size assigned to {{ snakemake.wildcards.samples }} for {{ snakemake.wildcards.antibody }}. +**MACS2** consensus peaks plot for {{ snakemake.wildcards.peak }} peaks is an `UpSet plot `_ that shows +which sample subsets for the antibody {{ snakemake.wildcards.antibody }} share how many peaks. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst index 8d1c8b6..6271191 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst @@ -1 +1,9 @@ - +The `MA plot `_ **(FDR 0.01)** +displays the effect size (log2 fold changes) of differences in feature counts +between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.01``. +For more information about DESeq2 please see the +`documentation `_. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst index 8d1c8b6..c0f7e4d 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst @@ -1 +1,10 @@ +The **Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the effect size (log2 fold changes) +differences in feature counts between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.01``. +For more information about DESeq2 please see the +`documentation `_. + diff --git a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst index 8d1c8b6..6a85b50 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst @@ -1 +1,10 @@ - +The `MA plot `_ **(FDR 0.05)** +displays the effect size (log2 fold changes) of differences in feature counts +between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.05``. +For more information about DESeq2 please see the +`documentation `_. + diff --git a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst index 8d1c8b6..4329bfa 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst @@ -1 +1,8 @@ - +The **Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the effect size (log2 fold changes) +differences in feature counts between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.05``. +For more information about DESeq2 please see the +`documentation `_. diff --git a/workflow/report/plot_deseq2_heatmap.rst b/workflow/report/plot_deseq2_heatmap.rst index 8d1c8b6..732e018 100644 --- a/workflow/report/plot_deseq2_heatmap.rst +++ b/workflow/report/plot_deseq2_heatmap.rst @@ -1 +1,7 @@ - +`Heatmap `_ of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts from +DESeq2 for all groups treated with the {{snakemake.wildcards["antibody"]}} antibody. +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the +`documentation `_. + diff --git a/workflow/report/plot_deseq2_pca.rst b/workflow/report/plot_deseq2_pca.rst index 8d1c8b6..e75d5d4 100644 --- a/workflow/report/plot_deseq2_pca.rst +++ b/workflow/report/plot_deseq2_pca.rst @@ -1 +1,7 @@ - +`PCA plot `_ +**(Principal Component Analysis)** show the variance of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts +from DESeq2 with regard to the used {{snakemake.wildcards["antibody"]}} antibody. +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the +`documentation `_. diff --git a/workflow/report/plot_deseq2_sample_corr_heatmap.rst b/workflow/report/plot_deseq2_sample_corr_heatmap.rst index 8d1c8b6..9d0c283 100644 --- a/workflow/report/plot_deseq2_sample_corr_heatmap.rst +++ b/workflow/report/plot_deseq2_sample_corr_heatmap.rst @@ -1 +1,8 @@ - +`Correlation heatmap `_ of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts +from DESeq2, comparing the {{snakemake.wildcards["group_1"]}} group with +the {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody). +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the +`documentation `_. diff --git a/workflow/report/plot_deseq2_scatter.rst b/workflow/report/plot_deseq2_scatter.rst index 8d1c8b6..e244fef 100644 --- a/workflow/report/plot_deseq2_scatter.rst +++ b/workflow/report/plot_deseq2_scatter.rst @@ -1 +1,9 @@ - +This **scatter plot** uses the matrix of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts +from DESeq2, comparing the {{snakemake.wildcards["group_1"]}} group with +the {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody). +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the +`documentation `_. + diff --git a/workflow/report/plot_fingerprint_deeptools.rst b/workflow/report/plot_fingerprint_deeptools.rst index 8d1c8b6..fecf21e 100644 --- a/workflow/report/plot_fingerprint_deeptools.rst +++ b/workflow/report/plot_fingerprint_deeptools.rst @@ -1 +1,9 @@ - +**deepTools** `plotFingerprint `_ is a +quality control tool, which determines how well the signal in the ChIP-seq sample can be differentiated from the +background distribution of reads in the control sample. plotFingerprint randomly samples genome regions of a specified +length and sums the per-base coverage that overlap with those regions. These values are then sorted according to their +rank and the cumulative sum of read counts is plotted. With a perfect uniform distribution of reads along the genome and +infinite sequencing coverage a straight diagonal line is shown in the plot. For more information on the interpretation of +the plots please see `documentation `_. +For more information about plotFingerprint metrics see +`here `_. diff --git a/workflow/report/plot_frip_score_macs2_bedtools.rst b/workflow/report/plot_frip_score_macs2_bedtools.rst index ecf3919..655c187 100644 --- a/workflow/report/plot_frip_score_macs2_bedtools.rst +++ b/workflow/report/plot_frip_score_macs2_bedtools.rst @@ -1,2 +1,3 @@ -**MACS2 FRiP score** is generated by calculating the fraction of all mapped reads that fall into the `MACS2 `_ called {{snakemake.config["params"]["peak-analysis"]}} peak regions. +**MACS2 FRiP score** is generated by calculating the fraction of all mapped reads that fall into the +`MACS2 `_ called {{snakemake.config["params"]["peak-analysis"]}} peak regions. A read must overlap a peak by at least 20% to be counted to `FRiP score `_. diff --git a/workflow/report/plot_heatmap_deeptools.rst b/workflow/report/plot_heatmap_deeptools.rst index 8d1c8b6..b52cb52 100644 --- a/workflow/report/plot_heatmap_deeptools.rst +++ b/workflow/report/plot_heatmap_deeptools.rst @@ -1 +1,2 @@ - +`**deepTools heatmap** `_ +of read coverage over sets of genomic regions, a visualisation for the genome-wide enrichment of the samples. diff --git a/workflow/report/plot_insert_size_histogram_picard_mm.rst b/workflow/report/plot_insert_size_histogram_picard_mm.rst index 8d1c8b6..05edb0d 100644 --- a/workflow/report/plot_insert_size_histogram_picard_mm.rst +++ b/workflow/report/plot_insert_size_histogram_picard_mm.rst @@ -1 +1,6 @@ - +`Insert size histogram +`_ **(Picard)** is used +for validating paired-end library construction. It shows the insert size distribution versus fractions of read pairs in +each of the three orientations (FR, RF, and TANDEM) as a histogram. For more information about `collected Picard metrics +`_ please +see `documentation `_. diff --git a/workflow/report/plot_macs2_qc.rst b/workflow/report/plot_macs2_qc.rst index 8d1c8b6..7ae5895 100644 --- a/workflow/report/plot_macs2_qc.rst +++ b/workflow/report/plot_macs2_qc.rst @@ -1 +1,9 @@ - +These `MACS2 `_ +**callpeak quality control plots** show the following metrics for all samples and their associated controls: +the number of peaks and their distributions of peak length, +the `fold-change `_ +of the enrichment for their peak summit against random Poisson distribution, +FDR and p-value from the results of the +`MACS callpeak analysis `_. +For more information about Model-based Analysis of ChIP-Seq (MACS) please see the +`MACS article `_. diff --git a/workflow/report/plot_peaks_count_macs2.rst b/workflow/report/plot_peaks_count_macs2.rst index 160b8c6..d83da3d 100644 --- a/workflow/report/plot_peaks_count_macs2.rst +++ b/workflow/report/plot_peaks_count_macs2.rst @@ -1 +1,2 @@ -**MACS2 peak count** is calculated from total number of {{snakemake.config["params"]["peak-analysis"]}} peaks called by `MACS2 `_. +**MACS2 peak count** is calculated from total number of {{snakemake.config["params"]["peak-analysis"]}} peaks called by +`MACS2 `_. diff --git a/workflow/report/plot_phantompeak_phantompeakqualtools.rst b/workflow/report/plot_phantompeak_phantompeakqualtools.rst index 8d1c8b6..85da65e 100644 --- a/workflow/report/plot_phantompeak_phantompeakqualtools.rst +++ b/workflow/report/plot_phantompeak_phantompeakqualtools.rst @@ -1 +1,10 @@ - +`Phantompeakqualtools plot `_ shows strand-shift versus +cross-correlation and computes informative enrichment and quality measures for ChIP-seq data. It also calculates the +relative (RSC) and the normalized strand cross-correlation coefficient (NSC). Datasets with NSC values much less than +1.1 tend to have low signal to noise or few peaks. This may indicate poor quality or only few binding sites. RSC values +significantly lower than 1 (< 0.8) tend to have low signal to noise. The low scores can be due to several factors and +are often due to failed and poor quality ChIP or low read sequence quality. In section "ChIP-seq processing pipeline" +of MultiQC report are additional plots across all samples for RSC, NSC and strand-shift cross-correlation. +For more information about Phantompeakqualtools please +see `documentation `_. For more information +about interpretation see published `article `_. diff --git a/workflow/report/plot_profile_deeptools.rst b/workflow/report/plot_profile_deeptools.rst index 8d1c8b6..7bc5fa1 100644 --- a/workflow/report/plot_profile_deeptools.rst +++ b/workflow/report/plot_profile_deeptools.rst @@ -1 +1,3 @@ - +**deepTools** `profile plots `_ +show the read coverage over sets of genomic regions and give a visualisation for the genome-wide +enrichment of the samples. diff --git a/workflow/report/plot_quality_by_cycle_picard_mm.rst b/workflow/report/plot_quality_by_cycle_picard_mm.rst index 8d1c8b6..63bb520 100644 --- a/workflow/report/plot_quality_by_cycle_picard_mm.rst +++ b/workflow/report/plot_quality_by_cycle_picard_mm.rst @@ -1 +1,7 @@ - +`Mean quality by cycle plot +`_ **(Picard)** is used as +quality control for sequencing machine performance and collects the mean quality by cycle of the bam files after +filtering, sorting, merging and removing orphans. Clear drops in quality within reads may indicate technical problems +during sequencing. For more information about `collected Picard metrics +`_ please +see `documentation `_. diff --git a/workflow/report/plot_quality_distribution_picard_mm.rst b/workflow/report/plot_quality_distribution_picard_mm.rst index 8d1c8b6..bd181ef 100644 --- a/workflow/report/plot_quality_distribution_picard_mm.rst +++ b/workflow/report/plot_quality_distribution_picard_mm.rst @@ -1 +1,7 @@ - +`Quality score distribution plot +`_ **(Picard)** is used +as overall quality control for a library in a given run. It shows for the range of quality scores the corresponding +total numbers of bases. For more information about `collected Picard metrics +`_ please +see `documentation `_. + diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst index 189810d..0b6ef61 100644 --- a/workflow/report/workflow.rst +++ b/workflow/report/workflow.rst @@ -1 +1,2 @@ -**ChIP-seq** peak-calling, QC and differential analysis pipeline (Snakemake port of the `nextflow pipeline `_). +**ChIP-seq** peak-calling, QC and differential analysis pipeline (Snakemake port of the +`nextflow pipeline `_). diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b70336c..4c19821 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -161,6 +161,63 @@ def get_read_group(wildcards): unit=wildcards.unit, platform=units.loc[(wildcards.sample, wildcards.unit), "platform"]) +def get_unique_antibodies(): + return set([antibody for antibody in samples["antibody"] if not pd.isnull(antibody)]) + +def get_igv_input(): + igv_input = [] + igv_input.extend([ + "results/IGV/big_wig/merged_library.bigWig.igv.txt" + ]) + igv_input.extend( + expand( + [ + "results/IGV/macs2_callpeak-{peak}/merged_library.{sample_control_peak}_peaks.igv.txt", + + ], + sample_control_peak=get_sample_control_peak_combinations_list(), + peak=config["params"]["peak-analysis"] + ) + ) + + for antibody in get_unique_antibodies(): + igv_input.extend( + expand( + [ + "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt", + "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt" + ], + antibody=antibody, + peak=config["params"]["peak-analysis"] + ) + ) + return igv_input + +def get_files_for_igv(): + igv_files = [] + for sample in samples.index: + igv_files.extend( + expand( + [ + "results/big_wig/{sample}.bigWig" + ], + sample=sample + ) + ) + + igv_files.extend( + expand( + [ + "results/macs2_callpeak/{sample_control_peak}_peaks.{peak}Peak", + "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed" + ], + sample_control_peak=get_sample_control_peak_combinations_list(), + peak=config["params"]["peak-analysis"], + antibody=get_unique_antibodies() + ) + ) + return igv_files + def get_multiqc_input(wildcards): multiqc_input = [] for (sample, unit) in units.index: @@ -277,9 +334,11 @@ def all_input(wildcards): wanted_input = [] - # QC with fastQC and multiQC + # QC with fastQC and multiQC, igv session wanted_input.extend([ - "results/qc/multiqc/multiqc.html" + "results/qc/multiqc/multiqc.html", + "results/IGV/igv_session.xml", + "results/IGV/report_igv_session.zip" ]) # trimming reads @@ -312,7 +371,6 @@ def all_input(wildcards): wanted_input.extend( expand ( [ - "results/IGV/big_wig/merged_library.bigWig.igv.txt", "results/phantompeakqualtools/{sample}.phantompeak.pdf" ], sample = sample @@ -363,8 +421,7 @@ def all_input(wildcards): expand( [ "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.saf", - "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", - "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt" + "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf" ], peak = config["params"]["peak-analysis"], antibody = antibody @@ -387,21 +444,22 @@ def all_input(wildcards): "results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.RData", "results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.sizeFactor.txt", "results/deseq2/results/{antibody}.consensus_{peak}-peaks.deseq2_results.txt", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.txt", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.txt", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.bed", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.bed", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_MA_plot.pdf", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_MA_plot.pdf", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_volcano_plot.pdf", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_volcano_plot.pdf", - "results/deseq2/plots/{antibody}.consensus_{peak}-peaks_sample_corr_heatmap.pdf", - "results/deseq2/plots/{antibody}.consensus_{peak}-peaks_scatter_plots.pdf" + "results/deseq2/FDR/results/FDR_0.01_{antibody}.consensus_{peak}-peaks", + "results/deseq2/FDR/results/FDR_0.05_{antibody}.consensus_{peak}-peaks", + "results/deseq2/FDR/bed_files/FDR_0.01_{antibody}.consensus_{peak}-peaks", + "results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks", + "results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/MA_plots/FDR_0.05_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/volcano_plots/FDR_0.01_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/volcano_plots/FDR_0.05_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/correlation_heatmaps_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/scatter_plots_{antibody}consensus_{peak}-peaks" ], peak = config["params"]["peak-analysis"], antibody = antibody ) ) + wanted_input.extend( expand( [ @@ -409,7 +467,6 @@ def all_input(wildcards): "results/macs2_callpeak/{sample}-{control}.{peak}_treat_pileup.bdg", "results/macs2_callpeak/{sample}-{control}.{peak}_control_lambda.bdg", "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.{peak}Peak", - "results/IGV/macs2_callpeak-{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt", "results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", "results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", "results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf" diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index 8357ace..e1de340 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -69,7 +69,10 @@ rule plot_peak_intersect: input: "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.intersect.txt" output: - report("results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", caption="../report/plot_consensus_peak_intersect.rst", category="ConsensusPeak") + report( + "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", + caption="../report/plot_consensus_peak_intersect.rst", + category="ConsensusPeak") conda: "../envs/consensus_plot.yaml" log: @@ -82,10 +85,12 @@ rule create_consensus_igv: "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed" output: "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt" + params: + lambda w, input: "\n".join(["{}\t0,0,0".format(path) for path in input]) log: "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.log" shell: - "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec echo -e 'results/IGV/consensus/{wildcards.antibody}/\"{{}}\"\t0,0,0' \; > {output} 2> {log}" + "echo -e '{params}' > {output} 2> {log}" rule homer_consensus_annotatepeaks: input: @@ -133,9 +138,8 @@ rule merge_bool_and_annotatepeaks: rule feature_counts: input: - sam=lambda wc: expand(["results/filtered/{sample}.sorted.bam", "results/filtered/{control}.sorted.bam"], - sample=get_samples_of_antibody(wc.antibody), - control=get_controls_of_antibody(wc.antibody)), + sam=lambda wc: expand("results/filtered/{sample}.sorted.bam", + sample=get_samples_of_antibody(wc.antibody)), annotation="results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.saf" output: multiext("results/feature_counts/{antibody}.consensus_{peak}-peaks", @@ -170,31 +174,51 @@ rule featurecounts_deseq2: "results/feature_counts/{antibody}.consensus_{peak}-peaks_modified.featureCounts" output: dds="results/deseq2/dss_rld/{antibody}.consensus_{peak}-peaks.dds.rld.RData", - plot_pca=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.pca_plot.pdf", #ToDo: add description to report caption + plot_pca=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.pca_plot.pdf", caption = "../report/plot_deseq2_pca.rst", category = "DESeq2"), - plot_heatmap=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.heatmap_plot.pdf", #ToDo: add description to report caption + plot_heatmap=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.heatmap_plot.pdf", caption = "../report/plot_deseq2_heatmap.rst", category = "DESeq2"), pca_data="results/deseq2/pca_vals/{antibody}.consensus_{peak}-peaks.pca.vals.txt", dist_data="results/deseq2/dists/{antibody}.consensus_{peak}-peaks.sample.dists.txt", size_factors_rdata="results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.RData", size_factors_res="results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.sizeFactor.txt", results="results/deseq2/results/{antibody}.consensus_{peak}-peaks.deseq2_results.txt", - FDR_1_perc_res="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.txt", - FDR_5_perc_res="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.txt", - FDR_1_perc_bed="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.bed", - FDR_5_perc_bed="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.bed", - plot_FDR_1_perc_MA=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_MA_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_1_perc_MA.rst", category = "DESeq2-FDR"), - plot_FDR_5_perc_MA=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_MA_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_5_perc_MA.rst", category = "DESeq2-FDR"), - plot_FDR_1_perc_volcano=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_volcano_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_1_perc_volcano.rst", category = "DESeq2-FDR"), - plot_FDR_5_perc_volcano=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_volcano_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_5_perc_volcano.rst", category = "DESeq2-FDR"), - plot_sample_corr_heatmap=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks_sample_corr_heatmap.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_sample_corr_heatmap.rst", category = "DESeq2"), - plot_scatter=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks_scatter_plots.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_scatter.rst", category = "DESeq2") + # pairwise comparisons of samples across the groups from a particular antibody + FDR_1_perc_res=directory("results/deseq2/FDR/results/FDR_0.01_{antibody}.consensus_{peak}-peaks"), + FDR_5_perc_res=directory("results/deseq2/FDR/results/FDR_0.05_{antibody}.consensus_{peak}-peaks"), + FDR_1_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.01_{antibody}.consensus_{peak}-peaks"), + FDR_5_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks"), + igv_FDR_5_bed="results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt", + plot_FDR_1_perc_MA=report( + directory("results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.MA-plot_FDR_0.01.pdf"], + caption = "../report/plot_deseq2_FDR_1_perc_MA.rst", + category = "DESeq2"), + plot_FDR_5_perc_MA=report( + directory("results/deseq2/comparison_plots/MA_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.MA-plot_FDR_0.05.pdf"], + caption = "../report/plot_deseq2_FDR_5_perc_MA.rst", + category = "DESeq2"), + plot_FDR_1_perc_volcano=report( + directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.volcano_FDR_0.01.pdf"], + caption = "../report/plot_deseq2_FDR_1_perc_volcano.rst", + category = "DESeq2"), + plot_FDR_5_perc_volcano=report( + directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.volcano_FDR_0.05.pdf"], + caption = "../report/plot_deseq2_FDR_5_perc_volcano.rst", + category = "DESeq2"), + plot_sample_corr_heatmap=report( + directory("results/deseq2/comparison_plots/correlation_heatmaps_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.correlation_heatmap.pdf"], + caption = "../report/plot_deseq2_sample_corr_heatmap.rst", + category = "DESeq2"), + plot_scatter=report( + directory("results/deseq2/comparison_plots/scatter_plots_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.scatter_plots.pdf"], + caption = "../report/plot_deseq2_scatter.rst", + category = "DESeq2") threads: 2 params: @@ -205,13 +229,3 @@ rule featurecounts_deseq2: "../envs/featurecounts_deseq2.yaml" script: "../scripts/featurecounts_deseq2.R" - -rule create_deseq2_igv: - input: - "results/deseq2/results/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.bed" - output: - "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt" - log: - "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.log" - shell: - "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.deseq2.FDR_0.05.results.bed' -exec echo -e 'results/IGV/consensus/{wildcards.antibody}/deseq2/\"{{}}\"\t255,0,0' \; > {output} 2> {log}" diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index f3ec86d..a634c31 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -4,14 +4,8 @@ rule samtools_view_filter: output: temp("results/sam-view/{sample}.bam") params: - # if duplicates should be removed in this filtering, add "-F 0x0400" to the params - # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params - # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), - # the -L option is automatically activated if a path to a blacklist of the given genome exists in the - # downloaded "resources/ref/igenomes.yaml" or has been provided via the "config/config.yaml" - # parameter "config['resources']['ref']['blacklist']" - lambda wc, input: "-b -F 0x004 {pe_params} {blacklist}".format( - pe_params="" if config["single_end"] else "-G 0x009 -f 0x001", + lambda wc, input: "{flags} {blacklist}".format( + flags=config["params"]["samtools-view-se"] if config["single_end"] else config["params"]["samtools-view-pe"], blacklist="" if len(input) == 1 else "-L {}".format(list(input)[1]) ) log: diff --git a/workflow/rules/igv_session.smk b/workflow/rules/igv_session.smk new file mode 100644 index 0000000..6670ac2 --- /dev/null +++ b/workflow/rules/igv_session.smk @@ -0,0 +1,76 @@ +rule create_igv_input_file: + input: + get_igv_input() + output: + temp("results/IGV/igv_files.txt") + log: + "logs/igv/merged_igv_files.log" + shell: + "cat {input} > {output} 2> {log}" + +# igv session that can be started directly from the generated files of workflow +rule igv_files_to_session: + input: + igv="results/IGV/igv_files.txt", + fasta="resources/ref/genome.fasta" + output: + "results/IGV/igv_session.xml" + params: + "--path_prefix '../../'" + log: + "logs/igv/igv_session_to_file.log" + shell: + " ../workflow/scripts/igv_files_to_session.py {output} {input.igv} ../../{input.fasta} {params} 2> {log}" + +# remove paths for igv session to download from report.zip +rule igv_report_cleanup: + input: + "results/IGV/igv_files.txt" + output: + temp("results/IGV/report_igv_files.txt") + log: + "logs/igv/igv_files.log" + script: + "../scripts/igv_report_cleanup.py" + +rule igv_files_to_report: + input: + igv="results/IGV/report_igv_files.txt", + fasta="resources/ref/genome.fasta" + output: + temp("results/IGV/report_igv_session.xml") + params: + "" + log: + "logs/igv/igv_files_to_report.log" + shell: + " ../workflow/scripts/igv_files_to_session.py {output} {input.igv} $(basename {input.fasta}) {params} 2> {log}" + +rule collect_igv_report_session_files: + input: + igv_data=get_files_for_igv(), + deseq2_files=directory(expand("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks", + antibody=get_unique_antibodies(), peak=config["params"]["peak-analysis"])), + fasta="resources/ref/genome.fasta", + igv_session="results/IGV/report_igv_session.xml" + output: + temp(directory("results/IGV/report_igv_session")) + log: + "logs/igv/collect_igv_report_session_files.log" + shell: + "mkdir -p {output}; cp {input.igv_data} {output}/; " + "cp -R {input.deseq2_files}/* {output}/; cp {input.fasta} {output}/; " + "cp {input.igv_session} {output}/" + +# igv session that can be downloaded from generated report +rule zip_igv_report_session: + input: + rules.collect_igv_report_session_files.output + output: + report("results/IGV/report_igv_session.zip", caption = "../report/igv_session.rst", category="IGV session") + log: + "logs/igv/collect_igv_report_session_files.log" + conda: + "../envs/zip.yaml" + shell: + "cd $(dirname {input}); zip $(basename {output}) $(basename {input})/*" diff --git a/workflow/rules/peak_analysis.smk b/workflow/rules/peak_analysis.smk index a02e899..bf81907 100644 --- a/workflow/rules/peak_analysis.smk +++ b/workflow/rules/peak_analysis.smk @@ -6,9 +6,12 @@ rule plot_fingerprint: stats=expand("results/{step}/{{sample}}.sorted.{step}.stats.txt", step="bamtools_filtered" if config["single_end"] else "orph_rm_pe") - output: #ToDo: add description to report caption + output: # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/deeptools/plotfingerprint.html. - fingerprint=report("results/deeptools/{sample}-{control}.plot_fingerprint.pdf", caption="../report/plot_fingerprint_deeptools.rst", category="QC"), + fingerprint=report( + "results/deeptools/{sample}-{control}.plot_fingerprint.pdf", + caption="../report/plot_fingerprint_deeptools.rst", + category="other QC"), counts="results/deeptools/{sample}-{control}.fingerprint_counts.txt", qc_metrics="results/deeptools/{sample}-{control}.fingerprint_qcmetrics.txt" log: @@ -16,7 +19,7 @@ rule plot_fingerprint: params: "--labels {sample} {control}", "--skipZeros ", - "--numberOfSamples 500000 ", # ToDo: to config? + "--numberOfSamples {} ".format(config["params"]["plotfingerprint"]["number-of-samples"]), lambda w, input: "{se_option}{fragment_size}".format( se_option="--extendReads " if config["single_end"] else "", @@ -51,8 +54,7 @@ rule macs2_callpeak_broad: # these output extensions internally set the --broad option: "_peaks.broadPeak", "_peaks.gappedPeak" - ), - + ) log: "logs/macs2/callpeak.{sample}-{control}.broad.log" params: @@ -112,7 +114,10 @@ rule sm_report_peaks_count_plot: input: get_peaks_count_plot_input() output: - report("results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", caption="../report/plot_peaks_count_macs2.rst", category="CallPeaks") + report( + "results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", + caption="../report/plot_peaks_count_macs2.rst", + category="CallPeaks") log: "logs/macs2_callpeak/plot_{peak}_peaks_count.log" conda: @@ -136,8 +141,9 @@ rule bedtools_intersect: rule frip_score: input: intersect="results/bedtools_intersect/{sample}-{control}.{peak}.intersected.bed", - flagstats=expand("results/{step}/{{sample}}.sorted.{step}.flagstat", step= "bamtools_filtered" if config["single_end"] - else "orph_rm_pe") + flagstats=expand( + "results/{step}/{{sample}}.sorted.{step}.flagstat", + step= "bamtools_filtered" if config["single_end"] else "orph_rm_pe") output: "results/bedtools_intersect/{sample}-{control}.{peak}.peaks_frip.tsv" log: @@ -155,7 +161,10 @@ rule sm_rep_frip_score: input: get_frip_score_input() output: - report("results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", caption="../report/plot_frip_score_macs2_bedtools.rst", category="CallPeaks") + report( + "results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", + caption="../report/plot_frip_score_macs2_bedtools.rst", + category="CallPeaks") log: "logs/bedtools/intersect/plot_{peak}_peaks_frip_score.log" conda: @@ -168,10 +177,12 @@ rule create_igv_peaks: "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.{peak}Peak" output: "results/IGV/macs2_callpeak-{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt" + params: + lambda w, input: "\n".join(["{}\t0,0,178".format(path) for path in input]) log: "logs/igv/create_igv_peaks/merged_library.{sample}-{control}.{peak}_peaks.log" shell: - " find {input} -type f -name '*_peaks.{wildcards.peak}Peak' -exec echo -e 'results/IGV/macs2_callpeak/{wildcards.peak}/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" + "echo -e '{params}' > {output} 2> {log}" rule homer_annotatepeaks: input: @@ -193,9 +204,12 @@ rule homer_annotatepeaks: rule plot_macs_qc: input: get_macs2_peaks() - output: #ToDo: add description to report caption + output: summmary="results/macs2_callpeak/plots/plot_{peak}_peaks_macs2_summary.txt", - plot=report("results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf", caption="../report/plot_macs2_qc.rst", category="CallPeaks") + plot=report( + "results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf", + caption="../report/plot_macs2_qc.rst", + category="CallPeaks") params: input = lambda wc, input: ','.join(input), sample_control_combinations = ','.join(get_sample_control_peak_combinations_list()) @@ -204,14 +218,18 @@ rule plot_macs_qc: conda: "../envs/plot_macs_annot.yaml" shell: - "Rscript ../workflow/scripts/plot_macs_qc.R -i {params.input} -s {params.sample_control_combinations} -o {output.plot} -p {output.summmary} 2> {log}" + "Rscript ../workflow/scripts/plot_macs_qc.R -i {params.input} -s {params.sample_control_combinations} " + "-o {output.plot} -p {output.summmary} 2> {log}" rule plot_homer_annotatepeaks: input: get_plot_homer_annotatepeaks_input() output: #ToDo: add description to report caption summmary="results/homer/plots/plot_{peak}_annotatepeaks_summary.txt", - plot=report("results/homer/plots/plot_{peak}_annotatepeaks.pdf", caption="../report/plot_annotatepeaks_homer.rst", category="CallPeaks") + plot=report( + "results/homer/plots/plot_{peak}_annotatepeaks.pdf", + caption="../report/plot_annotatepeaks_homer.rst", + category="CallPeaks") params: input = lambda wc, input: ','.join(input), sample_control_combinations = ','.join(get_sample_control_peak_combinations_list()) @@ -220,13 +238,17 @@ rule plot_homer_annotatepeaks: conda: "../envs/plot_macs_annot.yaml" shell: - "Rscript ../workflow/scripts/plot_homer_annotatepeaks.R -i {params.input} -s {params.sample_control_combinations} -o {output.plot} -p {output.summmary} 2> {log}" + "Rscript ../workflow/scripts/plot_homer_annotatepeaks.R -i {params.input} " + "-s {params.sample_control_combinations} -o {output.plot} -p {output.summmary} 2> {log}" rule plot_sum_annotatepeaks: input: "results/homer/plots/plot_{peak}_annotatepeaks_summary.txt" output: - report("results/homer/plots/plot_{peak}_annotatepeaks_summary.pdf", caption="../report/plot_annotatepeaks_summary_homer.rst", category="CallPeaks") + report( + "results/homer/plots/plot_{peak}_annotatepeaks_summary.pdf", + caption="../report/plot_annotatepeaks_summary_homer.rst", + category="CallPeaks") log: "logs/homer/plot_{peak}_annotatepeaks_summary.log" conda: diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index 36f6a6e..ff5d6cb 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -14,7 +14,7 @@ rule collect_multiple_metrics: input: bam="results/filtered/{sample}.sorted.bam", ref="resources/ref/genome.fasta" - output: #ToDo: add descriptions to report captions + output: # Through the output file extensions the different tools for the metrics can be selected # so that it is not necessary to specify them under params with the "PROGRAM" option. # Usable extensions (and which tools they implicitly call) are listed here: @@ -33,9 +33,18 @@ rule collect_multiple_metrics: ".quality_by_cycle_metrics", ".quality_distribution_metrics", ), - report("{path}{sample}.base_distribution_by_cycle.pdf", caption="../report/plot_base_distribution_by_cycle_picard_mm.rst", category="Multiple Metrics (picard)"), - report("{path}{sample}.quality_by_cycle.pdf", caption="../report/plot_quality_by_cycle_picard_mm.rst", category="Multiple Metrics (picard)"), - report("{path}{sample}.quality_distribution.pdf", caption="../report/plot_quality_distribution_picard_mm.rst", category="Multiple Metrics (picard)") + report( + "{path}{sample}.base_distribution_by_cycle.pdf", + caption="../report/plot_base_distribution_by_cycle_picard_mm.rst", + category="Multiple Metrics (picard)"), + report( + "{path}{sample}.quality_by_cycle.pdf", + caption="../report/plot_quality_by_cycle_picard_mm.rst", + category="Multiple Metrics (picard)"), + report( + "{path}{sample}.quality_distribution.pdf", + caption="../report/plot_quality_distribution_picard_mm.rst", + category="Multiple Metrics (picard)") resources: # This parameter (default 3 GB) can be used to limit the total resources a pipeline is allowed to use, see: # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#resources @@ -43,8 +52,8 @@ rule collect_multiple_metrics: log: "logs/picard/{path}{sample}.log" params: - # optional parameters, TODO: move to config.yaml and load from there - "VALIDATION_STRINGENCY=LENIENT " + # optional parameters + "{} ".format(config["params"]["collect-multiple-metrics"]) wrapper: "0.64.0/bio/picard/collectmultiplemetrics" @@ -103,14 +112,15 @@ rule bedGraphToBigWig: rule create_igv_bigwig: input: - "resources/ref/genome.bed", - expand("results/big_wig/{sample}.bigWig", sample=samples.index) + bigwig=expand("results/big_wig/{sample}.bigWig", sample=samples.index) output: "results/IGV/big_wig/merged_library.bigWig.igv.txt" + params: + lambda w, input: "\n".join(["{}\t0,0,178".format(path) for path in input.bigwig]) log: "logs/igv/create_igv_bigwig.log" shell: - "find {input} -type f -name '*.bigWig' -exec echo -e 'results/IGV/big_wig/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" + "echo -e '{params}' > {output} 2> {log}" rule compute_matrix: input: @@ -139,10 +149,13 @@ rule compute_matrix: rule plot_profile: input: "results/deeptools/matrix_files/matrix.gz" - output: #ToDo: add description to report caption + output: # Usable output variables, their extensions and which option they implicitly call are listed here: # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/deeptools/plotprofile.html. - plot_img=report("results/deeptools/plot_profile.pdf", caption="../report/plot_profile_deeptools.rst", category="GenomicRegions"), + plot_img=report( + "results/deeptools/plot_profile.pdf", + caption="../report/plot_profile_deeptools.rst", + category="GenomicRegions"), data="results/deeptools/plot_profile_data.tab" log: "logs/deeptools/plot_profile.log" @@ -154,10 +167,13 @@ rule plot_profile: rule plot_heatmap: input: "results/deeptools/matrix_files/matrix.gz" - output: #ToDo: add description to report caption + output: # Usable output variables, their extensions and which option they implicitly call are listed here: # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/deeptools/plotheatmap.html. - heatmap_img=report("results/deeptools/heatmap.pdf", caption="../report/plot_heatmap_deeptools.rst", category="Heatmaps"), + heatmap_img=report( + "results/deeptools/heatmap.pdf", + caption="../report/plot_heatmap_deeptools.rst", + category="Heatmaps"), heatmap_matrix="results/deeptools/heatmap_matrix.tab" log: "logs/deeptools/heatmap.log" @@ -169,10 +185,13 @@ rule plot_heatmap: rule phantompeakqualtools: input: "results/filtered/{sample}.sorted.bam" - output: #ToDo: add description to report caption + output: res_phantom="results/phantompeakqualtools/{sample}.phantompeak.spp.out", r_data="results/phantompeakqualtools/{sample}.phantompeak.Rdata", - plot=report("results/phantompeakqualtools/{sample}.phantompeak.pdf", caption="../report/plot_phantompeak_phantompeakqualtools.rst", category="Phantompeak") + plot=report( + "results/phantompeakqualtools/{sample}.phantompeak.pdf", + caption="../report/plot_phantompeak_phantompeakqualtools.rst", + category="Phantompeak") threads: 8 log: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index bd9a7df..d0b3480 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -16,7 +16,7 @@ rule multiqc: input: get_multiqc_input output: - "results/qc/multiqc/multiqc.html" + report("results/qc/multiqc/multiqc.html", caption="../report/multiqc_report.rst", category="MultiQC") log: "logs/multiqc.log" wrapper: diff --git a/workflow/scripts/col_mod_featurecounts.py b/workflow/scripts/col_mod_featurecounts.py index 2e6a8b8..8256e66 100644 --- a/workflow/scripts/col_mod_featurecounts.py +++ b/workflow/scripts/col_mod_featurecounts.py @@ -11,10 +11,11 @@ def get_group_control_combination(bam_path): sample = os.path.basename(bam_path.split(".sorted.bam")[0]) sample_row = samples.loc[samples["sample"] == sample] group = sample_row["group"].iloc[0] + antibody = sample_row["antibody"].iloc[0] control = sample_row["control"].iloc[0] if pd.isnull(control): control = sample - return "{}_{}_{}".format(group, control, sample) + return "{}_{}_{}_{}".format(group, antibody, control, sample) def modify_header(old_header): diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 0b178d7..dd4e55d 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -285,16 +285,65 @@ if (file.exists(ResultsFile) == FALSE) { ## WRITE RESULTS FILE if (MIN_FDR == 0.01) { - CompResultsFile <- snakemake@output[["FDR_1_perc_res"]] - CompBEDFile <- snakemake@output[["FDR_1_perc_bed"]] - MAplotFile <- snakemake@output[["plot_FDR_1_perc_MA"]] # AVI: added to create separate pdf files - VolcanoPlotFile <- snakemake@output[["plot_FDR_1_perc_volcano"]] # AVI: added to create separate pdf files + # AVI: dynamically generated file name extensions added + dirs <- c( + snakemake@output[["FDR_1_perc_res"]], + snakemake@output[["FDR_1_perc_bed"]], + snakemake@output[["plot_FDR_1_perc_MA"]], + snakemake@output[["plot_FDR_1_perc_volcano"]], + snakemake@output[["FDR_5_perc_res"]], + snakemake@output[["FDR_5_perc_bed"]], + snakemake@output[["plot_FDR_5_perc_MA"]], + snakemake@output[["plot_FDR_5_perc_volcano"]], + snakemake@output[["plot_sample_corr_heatmap"]], + snakemake@output[["plot_scatter"]] + ) + for (dir in dirs) { + if (file.exists(dir) == FALSE) { + dir.create(dir,recursive=TRUE) + } + } + + CompResultsFile <- file.path( + snakemake@output[["FDR_1_perc_res"]], + paste(snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="") + ) + CompBEDFile <- file.path( + snakemake@output[["FDR_1_perc_bed"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="") + ) + MAplotFile <- file.path( + snakemake@output[["plot_FDR_1_perc_MA"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="") + ) # AVI: added to create separate pdf files + VolcanoPlotFile <- file.path( + snakemake@output[["plot_FDR_1_perc_volcano"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="") + ) # AVI: added to create separate pdf files } if (MIN_FDR == 0.05) { - CompResultsFile <- snakemake@output[["FDR_5_perc_res"]] - CompBEDFile <- snakemake@output[["FDR_5_perc_bed"]] - MAplotFile <- snakemake@output[["plot_FDR_5_perc_MA"]] # AVI: added to create separate pdf files - VolcanoPlotFile <- snakemake@output[["plot_FDR_5_perc_volcano"]] # AVI: added to create separate pdf files + # AVI: dynamically create file names + CompResultsFile <- file.path( + snakemake@output[["FDR_5_perc_res"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="") + ) + CompBEDFile <- file.path( + snakemake@output[["FDR_5_perc_bed"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="") + ) + # AVI: write paths of FDR 0.05 bed files to igv file + cat( + paste0(CompBEDFile, "\t255,0,0\n"), + file=snakemake@output[["igv_FDR_5_bed"]], + append=TRUE + ) + MAplotFile <- file.path( + snakemake@output[["plot_FDR_5_perc_MA"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="") + ) # AVI: added to create separate pdf files + VolcanoPlotFile <- file.path( + snakemake@output[["plot_FDR_5_perc_volcano"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="") + ) # AVI: added to create separate pdf files } write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) @@ -317,11 +366,15 @@ if (file.exists(ResultsFile) == FALSE) { cat("\n",file=LogFile,append=TRUE,sep="") # AVI: creates required output files with message } else { - stop("More than 2 samples treated with the same antibody are needed to calculate the FDR & LOGFC.") + warning("More than 2 samples treated with the same antibody are needed to calculate the FDR & LOGFC.") } ## SAMPLE CORRELATION HEATMAP - pdf(file=snakemake@output[["plot_sample_corr_heatmap"]],width=10,height=8) # AVI: added to create separate pdf files + CorrHeatmapFile <- file.path( + snakemake@output[["plot_sample_corr_heatmap"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="") + ) # AVI: dynamically create file names + pdf(file=CorrHeatmapFile,width=10,height=8) # AVI: added to create separate pdf files rld.subset <- assay(rld)[,comp.samples] sampleDists <- dist(t(rld.subset)) sampleDistMatrix <- as.matrix(sampleDists) @@ -330,7 +383,11 @@ if (file.exists(ResultsFile) == FALSE) { dev.off() # AVI: added to create separate pdf files ## SCATTER PLOT FOR RLOG COUNTS - pdf(file=snakemake@output[["plot_scatter"]],width=10,height=8) # AVI: added to create separate pdf files + ScatterPlotFile <- file.path( + snakemake@output[["plot_scatter"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="") + ) # AVI: dynamically create file names + pdf(file=ScatterPlotFile,width=10,height=8) # AVI: added to create separate pdf files combs <- combn(comp.samples,2,simplify=FALSE) clabels <- sapply(combs,function(x){paste(x,collapse=' & ')}) plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset))) diff --git a/workflow/scripts/igv_files_to_session.py b/workflow/scripts/igv_files_to_session.py new file mode 100755 index 0000000..54c118a --- /dev/null +++ b/workflow/scripts/igv_files_to_session.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python + +### source: https://github.com/nf-core/chipseq/blob/master/bin/igv_files_to_session.py +### own changes and adjustments for snakemake-workflow chipseq are marked with "# AVI: " in comment + +#MIT License +# +#Copyright (c) Philip Ewels +# +#Permission is hereby granted, free of charge, to any person obtaining a copy +#of this software and associated documentation files (the "Software"), to deal +#in the Software without restriction, including without limitation the rights +#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +#copies of the Software, and to permit persons to whom the Software is +#furnished to do so, subject to the following conditions: +# +#The above copyright notice and this permission notice shall be included in all +#copies or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +#SOFTWARE. + + + +####################################################################### +####################################################################### +## Created on July 4th 2018 to create IGV session file from file list +####################################################################### +####################################################################### + +import os +import errno +import argparse + +############################################ +############################################ +## PARSE ARGUMENTS +############################################ +############################################ + +Description = 'Create IGV session file from a list of files and associated colours - ".bed", ".bw", ".bigwig", ".tdf", ".gtf" files currently supported.' +Epilog = """Example usage: python igv_files_to_session.py """ + +argParser = argparse.ArgumentParser(description=Description, epilog=Epilog) + +## REQUIRED PARAMETERS +argParser.add_argument('XML_OUT', help="XML output file.") +argParser.add_argument('LIST_FILE', help="Tab-delimited file containing two columns i.e. file_name\tcolour. Header isnt required.") +argParser.add_argument('GENOME', help="Full path to genome fasta file or shorthand for genome available in IGV e.g. hg19.") + +## OPTIONAL PARAMETERS +argParser.add_argument('-pp', '--path_prefix', type=str, dest="PATH_PREFIX", default='', help="Path prefix to be added at beginning of all files in input list file.") +args = argParser.parse_args() + +############################################ +############################################ +## HELPER FUNCTIONS +############################################ +############################################ + +def makedir(path): + + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +def igv_files_to_session(XMLOut,ListFile,Genome,PathPrefix=''): + + makedir(os.path.dirname(XMLOut)) + + fileList = [] + fin = open(ListFile,'r') + while True: + line = fin.readline() + if line: + ifile,colour = line.strip().split('\t') + if len(colour.strip()) == 0: + colour = '0,0,178' + fileList.append((PathPrefix.strip()+ifile,colour)) + else: + break + fout.close() + + ## ADD RESOURCES SECTION + XMLStr = '\n' + XMLStr += '\n' % (Genome) + XMLStr += '\t\n' + for ifile,colour in fileList: + XMLStr += '\t\t\n' % (ifile) + XMLStr += '\t\n' + + ## ADD PANEL SECTION + XMLStr += '\t\n' + for ifile,colour in fileList: + extension = os.path.splitext(ifile)[1].lower() + if extension in ['.bed','.broadpeak','.narrowpeak']: + XMLStr += '\t\t