diff --git a/.gitignore b/.gitignore index 2719ee1..8919ea6 100755 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ build/ *.swp *.DS_Store #.+ +hg19_GRCh37_1000genomes diff --git a/ACEseqWorkflow.jar b/ACEseqWorkflow.jar old mode 100755 new mode 100644 index 6e68149..ba1a8f9 Binary files a/ACEseqWorkflow.jar and b/ACEseqWorkflow.jar differ diff --git a/ACEseqWorkflow_5.0.1.iml b/ACEseqWorkflow_6.0.0.iml similarity index 100% rename from ACEseqWorkflow_5.0.1.iml rename to ACEseqWorkflow_6.0.0.iml diff --git a/README.ACEseqWorkflow.txt b/README.ACEseqWorkflow.txt index feea37e..9e6ae62 100755 --- a/README.ACEseqWorkflow.txt +++ b/README.ACEseqWorkflow.txt @@ -15,8 +15,26 @@ isNoControlWorkflow false Run analysis with matching control and estim == Changelist +* Version update 6.0.0 +- major: Changed the phasing routine. The program "impute2" was replaced by "Beagle". Files and tools were renamed accordingly. +-- Generally renamed all tools and files from "imputeGenotype" to "phaseGenotype" (and so on) as the subroutine does not actually perform imputation but rather phasing. +- major: htslib,samtools and bcftoosl are updated to 1.9 +- minor: `ngs_share` paths are updated to `/omics/odcf/reference_data/legacy/ngs_share` +- major: Added the hg38 support +-- new mappability with `-m2 -e2` option +-- new exclusion list from the ICGC medulloblastoma samples +-- new dbSNP version 151 + +* Version update to 5.1.0 +- added contributing segments file for LST +- implemented gene annotation in TCN chromosome plots +- fixed minor bugs + * Version update to 5.0.1 - fixed density(NA) bug and index bug for frequencies (as.character) in clustering step +- fixed empty line bug (newline) in smoothData script +- switched to COWorkflowsBasePlugin:1.2.1 (sample name handling) +- star plot: green (NCTN) and yellow (DH>1) line instead of common red line * Version update to 5.0.0 - introduced CNA.type 'AMP' (TCN>=2*ploidy + 1) diff --git a/buildinfo.txt b/buildinfo.txt index 1ed86d9..1750c45 100755 --- a/buildinfo.txt +++ b/buildinfo.txt @@ -1,4 +1,4 @@ -dependson=COWorkflowsBasePlugin:1.2.1 +dependson=COWorkflowsBasePlugin:1.4.2 RoddyAPIVersion=3.2 GroovyVersion=2.4 JDKVersion=1.8 diff --git a/buildversion.txt b/buildversion.txt index e595df4..3542499 100755 --- a/buildversion.txt +++ b/buildversion.txt @@ -1,2 +1,2 @@ -5.0 +6.0 0 diff --git a/documentation/source/methods.rst b/documentation/source/methods.rst index 786751e..c3c5d7b 100644 --- a/documentation/source/methods.rst +++ b/documentation/source/methods.rst @@ -2,7 +2,7 @@ Methods - Theory ================ -| ACEseq can be used to estimate copy-numbers from WGS data using a tumor vs. control approach. Thus a pre-requesite is WGS data from healthy tissue and tumor tissue of the same patient with at least 30x coverage. Samtools [] mpileup is used to determine the coverage for tumor and control sample - position specific for each single nucleotide polymorphism (SNP) position recorded in dbSNP and per 1 kb window. To get chromosome specific allele frequencies, the genotypes of SNP positions are phased with Impute2 [] and A and B allele are assigned accordingly. Haploblocks are defined as regions with consecutively phased SNPs. Subsequently, B-allele frequencies (BAFs) are estimated for all SNP positions in tumor and control with sufficient coverage in the control: +| ACEseq can be used to estimate copy-numbers from WGS data using a tumor vs. control approach. Thus a pre-requesite is WGS data from healthy tissue and tumor tissue of the same patient with at least 30x coverage. Samtools [] mpileup is used to determine the coverage for tumor and control sample - position specific for each single nucleotide polymorphism (SNP) position recorded in dbSNP and per 1 kb window. To get chromosome specific allele frequencies, the genotypes of SNP positions are phased with Impute2 (up to v5) or Beagle (since v6) [] and A and B allele are assigned accordingly. Haploblocks are defined as regions with consecutively phased SNPs. Subsequently, B-allele frequencies (BAFs) are estimated for all SNP positions in tumor and control with sufficient coverage in the control: .. math:: \label{eq:BAF} diff --git a/documentation/source/parameters.rst b/documentation/source/parameters.rst index 58f79cd..d57d2bb 100644 --- a/documentation/source/parameters.rst +++ b/documentation/source/parameters.rst @@ -11,7 +11,7 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang svOutputDirectory,${outputAnalysisBaseDirectory}/SV_calls,path, crestOutputDirectory,${outputAnalysisBaseDirectory}/crest,path, cnvSnpOutputDirectory,${aceseqOutputDirectory}/cnv_snp,path, - imputeOutputDirectory,${aceseqOutputDirectory}/phasing,path, + phasingOutputDirectory,${aceseqOutputDirectory}/phasing,path, plotOutputDirectory,${aceseqOutputDirectory}/plots,path, runWithoutControl,false,boolean,use control for analysis (false|true); up to version 2 isNoControlWorkflow,false,boolean,use control for analysis (false|true); since version 3 @@ -42,7 +42,7 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang min_distance,0.05,float,min_distance haplogroupFilePrefix,haploblocks_chr,string,prefix for file with haplogroups per chromosome haplogroupFileSuffix,txt,string,suffix for file with haplogroups per chromosome - haplogroupFilePath,${imputeOutputDirectory}/${haplogroupFilePrefix},path, + haplogroupFilePath,${phasingOutputDirectory}/${haplogroupFilePrefix},path, min_length_purity,1000000,integer,minimum length of segments to be considered for tumor cell content and ploidy estimation min_hetSNPs_purity,500,integer,minimum number of control heterozygous SNPs in segments to be considered for tumor cell content and ploidy estimation dh_stop,max,string, @@ -73,18 +73,9 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang PATIENTSEX,male,string,patient sex used in case of no control workflow (male|female|klinefelter) CNV_ANNO_SUFFIX,cnv.anno.tab.gz,string,suffix for mappability annotated chromosome-wise 1kb coverage files CNV_SUFFIX,cnv.tab.gz,string,suffix chromosome-wise 1kb coverage files - FILE_UNPHASED_PRE,${imputeOutputDirectory}/${unphasedGenotypesFilePrefix},path, - FILE_UNPHASED_GENOTYPE,${imputeOutputDirectory}/unphased_genotype_chr,path, - FILE_PHASED_PRE,${imputeOutputDirectory}/${phasedGenotypesFilePrefix},path, - FILE_PHASED_GENOTYPE,${imputeOutputDirectory}/phased_genotype_chr,path, - FILE_INFO,info,string, - FILE_INFO_SAMPLE,info_by_sample,string, - FILE_HAPS,haps,string, - FILE_HAPS_CONF,haps_confidence,string, - FILE_SUMMARY,summary,string, - FILE_WARNINGS,warnings,string, - FILE_PART,part,string, - FILE_SAMPLE_G,${imputeOutputDirectory}/sample_g.txt,path,sample_g file used by imputation on X chromosome for females + FILE_UNPHASED_PRE,${phasingOutputDirectory}/${unphasedGenotypesFilePrefix},path, + FILE_PHASED_GENOTYPE,${phasingOutputDirectory}/phased_genotype_chr,path, + FILE_SAMPLE_G,${phasingOutputDirectory}/sample_g.txt,path,sample_g file used by imputation on X chromosome for females MALE_FAKE_CONTROL_PRE,${pathToACEseqResults}/cnv_snp/${pid}.chr,path,path and prefix to chromosome-wise 1kb coverage file used for fake control workflow for male patients FEMALE_FAKE_CONTROL_PRE,${pathToACEseqResults}/cnv_snp/${pid}.chr,path,path and prefix to chromosome-wise 1kb coverage file used for fake control workflow for female patients PLOT_PRE,${aceseqOutputDirectory}/${pid}_plot,path, @@ -105,16 +96,13 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang CHROMOSOME_LENGTH_FILE,${path}/chrlengths.txt,path, REPLICATION_TIME_FILE,${path}/ReplicationTime_10cellines_mean_10KB.Rda,path,"replication timing file" GC_CONTENT_FILE,${path}/hg19_GRch37_100genomes_gc_content_10kb.txt,path, - GENETIC_MAP_FILE,${path}/genetic_map_chr${CHR_NAME}_combined_b37.txt,path,"impute files" - KNOWN_HAPLOTYPES_FILE,${path}/ALL.chr${CHR_NAME}.integrated_phase1_v3. 20101123.snps_indels_svs.genotypes.nomono.haplotypes.gz,path,"impute files" - KNOWN_HAPLOTYPES_LEGEND_FILE,${path}ALL.chr${CHR_NAME}.integrated_phase1_v3. 20101123.snps_indels_svs.genotypes.nomono.legend.gz,path,"impute files" - GENETIC_MAP_FILE_X,${path}/genetic_map_chrX_nonPAR_combined_b37.txt,path,"impute files" - KNOWN_HAPLOTYPES_FILE_X,${path}/ALL_1000G_phase1integrated_v3_chrX_nonPAR_impute.hap.gz,path,"impute files" - KNOWN_HAPLOTYPES_LEGEND_FILE_X,${path}/ALL_1000G_phase1integrated_v3_chrX_nonPAR_impute.legend.gz,path,"impute files" outputExecutionDirectory,${path}/exec_${executionTimeString},,"path to log files" - imputeBaseDirectory,${path}/,path,"directory for impute files" mergedBamSuffix,merged.mdup.bam,string,"A list of all known suffixes for merged bam files. I.e. merged.dupmark.bam, merged.mdup.bam..." mergedBamSuffixList,${mergedBamSuffix},string,"A list of all known suffixes for merged bam files. I.e. merged.dupmark.bam, merged.mdup.bam..." defaultMergedBamSuffix,${mergedBamSuffix},string,The default suffix for merged bam files when they are created by Roddy. libloc_PSCBS,,string,path to PSCBS library in R libloc_flexclust,,string,path to felxclust library in R + BEAGLE_REFERENCE_FILE,${baseDirectoryReference}/tools_data/Beagle/chr${CHR_NAME}.1kg.phase3.v5a.b37.bref3,path + BEAGLE_REFERENCE_FILE_X,${baseDirectoryReference}/tools_data/Beagle/chrX.1kg.phase3.v5a.b37.bref3,path + BEAGLE_GENETIC_MAP,${baseDirectoryReference}/tools_data/genetic_maps/plink.chr${CHR_NAME}.GRCh37.map,path + BEAGLE_GENETIC_MAP_X,${baseDirectoryReference}/tools_data/genetic_maps/plink.chrX.GRCh37.map,path diff --git a/installation/GRCh38_related_files/README_downloadReferences_GRCh38.md b/installation/GRCh38_related_files/README_downloadReferences_GRCh38.md new file mode 100644 index 0000000..547f029 --- /dev/null +++ b/installation/GRCh38_related_files/README_downloadReferences_GRCh38.md @@ -0,0 +1,107 @@ +### Reference files for GRCh38 + +Information on downloading and parsing files needed for GRCh38 reference genome + + +#### Reference genome + +We are using the GRCh38 version hosted at `http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa`, + +The above reference genome (GRCh38_decoy_ebv_phiX_alt_hla) can be downloaded and processed using the repositiry at `https://github.com/DKFZ-ODCF/setup-reference-data`. + +List of files needed from the above process +- GRCh38_decoy_ebv_phiX_alt_hla_chr.fa +- GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLenOnlyACGT_realChromosomes.tsv +- GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLength.tsv + + +#### dbSNP version 135 + +Downloaded from ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz + +Post-processing for usage in ACEseq: +``` +DBSNP_VERSION=135 +zcat 00-All.vcf.gz | + awk '/^#/{print} /VC=SNV/{ v=$8; sub(/.*dbSNPBuildID=/, "", v); sub(/;.*/, "", v); if (v~/^[0-9]+$/ && int(v)<='$DBSNP_VERSION') print }' | + bgzip > 00-All.SNV.vcf.gz +tabix -p vcf 00-All.SNV.vcf.gz +``` + +#### Generating mappability track +The mappability file (`GRCh38_Mappability_Align_100mer_m2e2.ALT_HLA.bedGraph.gz`) is created using the `create_mappability.sh` bash script. + +The tools used have to be in the `tools` folder. The tools `gem-2-wig`, `gem-indexer`, `gem-mappability` are from the package *gem-tools* from the Vlaams Instituut voor Biotechnologie (VIB). +Information about the tools can be found here: https://wiki.bits.vib.be/index.php/Create_a_mappability_track#Install_and_run_the_GEM_library_tools. The tools were downloaded from https://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/. The corresponding paper should be this one: https://www.researchgate.net/publication/221776385_Fast_Computation_and_Applications_of_Genome_Mappability + +The comptatible gemtools version (1.7.1-i3) was downloaded from "http://barnaserver.com/gemtools/releases/GEMTools-static-i3-1.7.1.tar.gz". +For hg38 `gem-mappability` an option of `-m2 -e2` were added to keep the score compatabiltble with the hg19 versions downloaded from UCSC. + +The two tools `wigToBigWig` and `BigWigToBedGraph` are downloaded from http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/ + +Finally, `tabix` from htslib is used (see http://www.htslib.org/doc/tabix.html) + +Run the script `sh create_mappability.sh` + + +#### Replication time +Replication time from individual cell lines from GRCh37 ENCODE data were lifted over to GRCh38, and averages were re-calculated. +The new R-object is uploaded to the `$repo_root/installation/GRCh38_related_files/time_mean_10KB.Rda` + + +#### Gaps and centromeres +- `gap.txt.gz` downloaded from `http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/gap.txt.gz` +- `centromeres.txt.gz` downloaded from `http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/centromeres.txt.gz` +- `centromeres_merged.txt.gz` created with the following command +``` +zcat centromeres.txt.gz | sort -k 2 -V | awk 'BEGIN {printf "#bin\tchrom\tchromStart\tchromEnd\n"; chr=""} $2!=chr { if (end != ""){printf $1"\t" chr "\t" start "\t" end "\n"}; chr=$2; start=$3} {end=$4} END {printf $1"\t" chr "\t" start "\t" end "\n"}' | gzip > centromeres_merged.txt.gz +``` +- `gap_with_centromeres.txt.gz` file created with the following command +``` +zcat centromeres_merged.txt.gz | tail -n +2 | awk ' BEGIN {OFS="\t"} {print $0, "1", "N", $4-$3, "centromere", "no"}' | gzip > gap_with_centromeres.txt.gz +``` + + +#### GC content +The GC-content (`gc_content_hg38.txt`) is calculated directly from the reference genome with the script `calc_gc_content.py`. +``` +python3 calc_gc_content -v -i ${reference_path}/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa -o gc_content_hg38.txt +``` +The -v flag increases verbosity. Note that you have to use python3! + + +#### Beagle reference files +The variants were downloaded for all the chromosomes mapped to hg38 reference genome +Info: https://www.internationalgenome.org/announcements/Variant-calls-from-1000-Genomes-Project-data-on-the-GRCh38-reference-assemlby/ +FTP site: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ + +Added 'chr' prefix and converted the VCF files to BREF format using Beagle. Refer to the script `beagle_vcf_to_bref.sh` + +This step will generate the following files, +- `ALL.chr${CHR_NAME}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref3` +- `ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref3` + + +#### Beagle genetic map files +Downloaded from `http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip` + +Add `chr` prefix + +``` +for chr in `seq 1 22` X ; do cat plink.chr${chr}.GRCh38.map | sed 's/^/chr/' > plink.chr${chr}.GRCh38.CHR.map; done +``` + + +#### Local controls for no-control workflow (optional) +These are lift-over files from the hg19 workflow. New hg38 native files will be generated for the next versions. + + +#### Exclusion list or blacklist files +The exclusion list contains the 'homodel' region present in almost all of the 216 samples in the ICGC medulloblastoma project. +`artifact.homoDels.potentialArtifacts.hg38.txt` + +#### Hg38 cytoband +Cytoband file was copied from ANNOVAR database files. Only the coordinates from Chr1-22, X, and Y were kept. +``` +cat ANNOVAR/annovar_April2018/humandb/hg38_cytoBand.txt | grep -v "_" | grep -v "^chrM" > hg38_cytoBand.txt +``` \ No newline at end of file diff --git a/installation/GRCh38_related_files/beagle_vcf_to_bref.sh b/installation/GRCh38_related_files/beagle_vcf_to_bref.sh new file mode 100644 index 0000000..ba75666 --- /dev/null +++ b/installation/GRCh38_related_files/beagle_vcf_to_bref.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Convert BEAGLE VCF to BREF format +# Script takes in the VCF files with header and adds 'chr' prefix to the variants +module load java/1.8.0_131 + +for chr in `seq 1 22` X Y +do + echo $chr + (zcat ALL.chr${chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz \ + | head -n 300 \ + | grep "#" ; zcat ALL.chr${chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz \ + | grep -v "#" | sed 's/^/chr/') \ + | java -jar bref3.18May20.d20.jar > ALL.chr${chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref +done diff --git a/installation/GRCh38_related_files/calc_gc_content_bed.py b/installation/GRCh38_related_files/calc_gc_content_bed.py new file mode 100644 index 0000000..718fcd6 --- /dev/null +++ b/installation/GRCh38_related_files/calc_gc_content_bed.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +import sys +import argparse +import re + +assert sys.version_info.major >= 3, "Python 3 is required for string operations" + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", help="Input sequence - fasta file format", required=True) +parser.add_argument("-o", "--output", help="Output file name", default='gc_content.txt') +parser.add_argument("-t", "--threshold", help="Threshold", default=0.0) +parser.add_argument("--stepsize", help="Threshold (regions with gc lower than threshold are discarded)", default=10000) +parser.add_argument("-v", "--verbose", action = 'store_true', help="Add verbosity") +args = parser.parse_args() + +threshold = float(args.threshold) +stepsize = int(args.stepsize) + +output_data = ['\t'.join(['chromosome', 'start', 'end', 'gc_content'])] + +if args.verbose: print('Load data') +with open(args.input, "r") as f: + data = f.read().splitlines() + +if args.verbose: print('Calculate chromosome stats') +chr_names, chr_borders = [], [] + +for i in range(len(data)): + if '>' in data[i]: + chr_names.append(data[i].split('>')[1].split(' ')[0]) + chr_borders.append(i) +chr_borders = chr_borders + [len(data)] + +for i in range(len(chr_names)): + + if not re.match("^(chr)?[0-9,X,Y]{1,2}$", chr_names[i]): + continue + + if args.verbose: print('Calculating Chromosome {} - length = {} bp'.format(chr_names[i], 61*(chr_borders[i+1] - chr_borders[i]))) + current_chr = data[chr_borders[i]:chr_borders[i+1]][1:] + + current_chr = ''.join(current_chr) + + for j in range(0, len(current_chr)//stepsize): + current_data = current_chr[(stepsize*j):(stepsize*(j+1))] + + gc = len(re.findall('[gcGC]', current_data)) + gcta = len(re.findall('[gctaGCTA]', current_data)) + if gcta == 0: gcta = 1. + gc_content = gc/gcta + + if gc_content > threshold: + output_data.append('\t'.join([chr_names[i], str(stepsize*j), str(stepsize*(j+1)), '{:1.6f}'.format(gc_content)])) +if args.verbose: print('Finished with calculation. Writing to file {}'.format(args.output)) + +with open(args.output, 'w') as f: + for line in output_data: + f.write("%s\n" % line) +if args.verbose: print('GC-content calculated succesfully') diff --git a/installation/GRCh38_related_files/create_mappability.sh b/installation/GRCh38_related_files/create_mappability.sh new file mode 100755 index 0000000..ece4011 --- /dev/null +++ b/installation/GRCh38_related_files/create_mappability.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -ue -o pipefail + +nr_cores=4 +kmer=100 +reference_genome="${reference_path}/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" + +mkdir -p ref out +# copy the reference but only chr1-22, X and Y +echo "copy reference genome" +awk '/^>chrM/ {exit} /^>/ {print $1} !/^>/ {print}' ${reference_genome} > ref/reference.fa +echo "gem-indexer" +./gemtools-1.7.1-i3/bin/gem-indexer -m force-slow-algorithm -T ${nr_cores} -c dna -i ref/reference.fa -o out/index +echo "gem-mappability" +./gemtools-1.7.1-i3/bin/gem-mappability -m2 -e2 -T ${nr_cores} -I out/index.gem -l ${kmer} -o out/outfiles +echo "gem-2-wig" +./gemtools-1.7.1-i3/bin/gem-2-wig -I out/index.gem -i out/outfiles.mappability -o out/outfiles +echo "wigToBigWig" + +./tools/wigToBigWig out/outfiles.wig out/outfiles.sizes out/outfiles.bigWig +echo "bigWigToBedGraph" +./tools/bigWigToBedGraph out/outfiles.bigWig out/outfiles.bedGraph + +echo "create FAKE scores for HLA and ALT contigs" +echo "Remove primary chrs and phix contigs" +grep ">" GRCh38_decoy_ebv_phiX_alt_hla_chr.fa > contigs +cat contigs | grep -v HLA | cut -f1,7 -d " " | sed -e 's/ /\t1\t/g' -e 's/LN://' -e 's/>//' -e 's/$/\t1.0/' > contigs_fake_mappability.bedGraph +cat contigs | grep HLA | cut -f1,2 -d " " | sed -e 's/\t/ /' | cut -f1,3 -d " " | sed -e 's/ /\t1\t/g' -e 's/>//' -e 's/$/\t1.0/' >> contigs_fake_mappability.bedGraph + +echo "Filter lines and compress" +(awk '$4 > 0.0 {print $0}' out/outfiles.bedGraph ; cat contigs_fake_mappability.bedGraph ) | ./tools/bgzip > GRCh38_Mappability_Align_100mer_m2e2_ALT_HLA.bedGraph.gz + +echo "Create Index with Tabix" +./tools/tabix -p bed GRCh38_Mappability_Align_100mer_m2e2_ALT_HLA.bedGraph.gz + +echo "cleaning" +rm -r ref out diff --git a/installation/GRCh38_related_files/time_mean_10KB.Rda b/installation/GRCh38_related_files/time_mean_10KB.Rda new file mode 100644 index 0000000..54550e7 Binary files /dev/null and b/installation/GRCh38_related_files/time_mean_10KB.Rda differ diff --git a/installation/downloadReferences.sh b/installation/downloadReferences.sh index e533a83..d0de96e 100755 --- a/installation/downloadReferences.sh +++ b/installation/downloadReferences.sh @@ -8,16 +8,16 @@ trap 'echo "Download incomplete. Please restart script."' ERR # convenience function to create a directory and change into it mkdir_cd() { - mkdir -p "$1" - cd "$1" + mkdir -p "$1" + cd "$1" } # compute an MD5 sum over all files found recursively in the current directory # and check it against the MD5 sum given in the variable EXPECTED_MD5SUM check_md5sum() { - local FILES=$(find -type f | sort) - local MD5SUM=$([ -n "$FILES" ] && cat $FILES | md5sum | cut -f1 -d' ') - [ "$EXPECTED_MD5SUM" = "$MD5SUM" ] + local FILES=$(find -type f | sort) + local MD5SUM=$([ -n "$FILES" ] && cat $FILES | md5sum | cut -f1 -d' ') + [ "$EXPECTED_MD5SUM" = "$MD5SUM" ] } @@ -31,32 +31,31 @@ mkdir_cd hg19_GRCh37_1000genomes # download reference genome ############################################################################### ( - mkdir_cd sequence/1KGRef + mkdir_cd sequence/1KGRef - EXPECTED_MD5SUM=12a0bed94078e2d9e8c00da793bbc84e - check_md5sum && exit 0 || echo downloading reference genome.... + EXPECTED_MD5SUM=12a0bed94078e2d9e8c00da793bbc84e + check_md5sum && exit 0 || echo downloading reference genome.... - wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz - gunzip hs37d5.fa.gz + wget -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz + gunzip hs37d5.fa.gz - check_md5sum + check_md5sum ) - ############################################################################### # download dbSNP database ############################################################################### ( - DBSNP_VERSION=135 - mkdir_cd databases/dbSNP/dbSNP_$DBSNP_VERSION + DBSNP_VERSION=135 + mkdir_cd databases/dbSNP/dbSNP_$DBSNP_VERSION EXPECTED_MD5SUM=4a93e8130b24b9c8ec6411b76fd2b76a check_md5sum && exit 0 || echo downloading dbSNP file.... - # CITATION - # As a NCBI Resource: "Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001 Jan 1;29(1):308-11." - # As a whole for a specific build (use this!) : "Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. (dbSNP Build ID: 141 ). Available from: http://www.ncbi.nlm.nih.gov/SNP/" - # A single or a range of Submitted SNP (ss) or Reference SNP (rs) entries: "Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. dbSNP accession:{ss1 or ss1 – ss100}, (dbSNP Build ID: 141). Available from: http://www.ncbi.nlm.nih.gov/SNP/" + # CITATION + # As a NCBI Resource: "Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001 Jan 1;29(1):308-11." + # As a whole for a specific build (use this!) : "Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. (dbSNP Build ID: 141 ). Available from: http://www.ncbi.nlm.nih.gov/SNP/" + # A single or a range of Submitted SNP (ss) or Reference SNP (rs) entries: "Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. dbSNP accession:{ss1 or ss1 – ss100}, (dbSNP Build ID: 141). Available from: http://www.ncbi.nlm.nih.gov/SNP/" # NOTE ON VERSIONING # The dbSNP database only maintains the newest version and has no archives of older versions. Therefore this download script will always download the newest version and subsequently filter entries according to `$DBSNP_VERSION`. However, as newer dbSNP versions might drop certain entries, the database might still change in the future. This has to be kept in mind with respect to the reproducibility of the whole workflow. @@ -67,86 +66,97 @@ mkdir_cd hg19_GRCh37_1000genomes wget -c "$DBSNP_BASE_URL/README.txt" wget -c "$DBSNP_BASE_URL/00-All.vcf.gz" - # POST PROCESSING - # extract SNPs from dbSNP version 135 and older - zcat 00-All.vcf.gz | - awk '/^#/{print} /VC=SNV/{ v=$8; sub(/.*dbSNPBuildID=/, "", v); sub(/;.*/, "", v); if (v~/^[0-9]+$/ && int(v)<='$DBSNP_VERSION') print }' | - bgzip > 00-All.SNV.vcf.gz - tabix -p vcf 00-All.SNV.vcf.gz + # POST PROCESSING + # extract SNPs from dbSNP version 135 and older + zcat 00-All.vcf.gz | + awk '/^#/{print} /VC=SNV/{ v=$8; sub(/.*dbSNPBuildID=/, "", v); sub(/;.*/, "", v); if (v~/^[0-9]+$/ && int(v)<='$DBSNP_VERSION') print }' | + bgzip > 00-All.SNV.vcf.gz + tabix -p vcf 00-All.SNV.vcf.gz - # CLEANUP - rm -f 00-All.vcf.gz + # CLEANUP + rm -f 00-All.vcf.gz - check_md5sum + check_md5sum ) ############################################################################### # download mappability file ############################################################################### ( - mkdir_cd databases/UCSC + mkdir_cd databases/UCSC EXPECTED_MD5SUM=4c735c9bc4f6ebb7d7609acedc785290 check_md5sum && exit 0 || echo downloading mappability file.... - wget -c http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign100mer.bigWig - bigWigToBedGraph wgEncodeCrgMapabilityAlign100mer.bigWig /dev/stdout | bgzip > wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz - tabix -p bed wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz - rm -f wgEncodeCrgMapabilityAlign100mer.bigWig + wget -c http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign100mer.bigWig + bigWigToBedGraph wgEncodeCrgMapabilityAlign100mer.bigWig /dev/stdout | bgzip > wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz + tabix -p bed wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz + rm -f wgEncodeCrgMapabilityAlign100mer.bigWig - check_md5sum + check_md5sum ) ############################################################################### # download replication timings ############################################################################### ( - mkdir_cd databases/ENCODE + mkdir_cd databases/ENCODE - EXPECTED_MD5SUM=2a63b34a737383af2a3f7eb32801a5fa - check_md5sum && exit 0 || echo downloading replication timing file.... + EXPECTED_MD5SUM=2a63b34a737383af2a3f7eb32801a5fa + check_md5sum && exit 0 || echo downloading replication timing file.... wget -c "https://github.com/DKFZ-ODCF/ACEseqWorkflow/blob/master/installation/ReplicationTime_10cellines_mean_10KB.Rda?raw=true" -O ReplicationTime_10cellines_mean_10KB.Rda - check_md5sum + check_md5sum ) ############################################################################### # download chromosome statistics ############################################################################### ( - mkdir_cd stats + mkdir_cd stats - EXPECTED_MD5SUM=801bdaa8c3b0d5c18a0637b0b29fd337 - check_md5sum && exit 0 || echo downloading stats files.... + EXPECTED_MD5SUM=801bdaa8c3b0d5c18a0637b0b29fd337 + check_md5sum && exit 0 || echo downloading stats files.... - wget -c http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/chromInfo.txt.gz - zcat chromInfo.txt.gz | grep -Pv "(_)|(chrM)" | sed -e '1i\#chrom\tsize\tfileName' > chrlengths.txt - rm -f chromInfo.txt.gz + wget -c http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/chromInfo.txt.gz + zcat chromInfo.txt.gz | grep -Pv "(_)|(chrM)" | sed -e '1i\#chrom\tsize\tfileName' > chrlengths.txt + rm -f chromInfo.txt.gz wget -c "https://github.com/DKFZ-ODCF/ACEseqWorkflow/blob/master/installation/hg19_GRch37_100genomes_gc_content_10kb.txt?raw=true" -O hg19_GRch37_100genomes_gc_content_10kb.txt - check_md5sum + check_md5sum ) ############################################################################### -# download IMPUTE database +# download Beagle database ############################################################################### ( - mkdir_cd databases/1000genomes/IMPUTE + mkdir_cd databases/Beagle_references + + EXPECTED_MD5SUM=529947b78dbedfb7a947b530da510887 + check_md5sum && exit 0 || echo downloading Beagle reference files.... + + for chr in {1..22}; do wget -c "http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chr${chr}.1kg.phase3.v5a.b37.bref3"; done + wget -c "http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.bref3/chrX.1kg.phase3.v5a.b37.bref3" + + check_md5sum +) - EXPECTED_MD5SUM=261a28d6b6917340cd82ada2d7185e17 - check_md5sum && exit 0 || echo downloading impute files.... +############################################################################### +# download genetic maps +############################################################################### +( + mkdir_cd databases/genetic_maps - wget -c https://mathgen.stats.ox.ac.uk/impute/ALL.integrated_phase1_SHAPEIT_16-06-14.nomono.tgz - tar -xzvf ALL.integrated_phase1_SHAPEIT_16-06-14.nomono.tgz - rm -f ALL.integrated_phase1_SHAPEIT_16-06-14.nomono.tgz + EXPECTED_MD5SUM=37a6b0f29695cd87a3249065a1eae420 + check_md5sum && exit 0 || echo downloading genetic maps.... - wget -c https://mathgen.stats.ox.ac.uk/impute/ALL_1000G_phase1integrated_v3_impute.tgz - tar -xzvf ALL_1000G_phase1integrated_v3_impute.tgz - rm -f ALL_1000G_phase1integrated_v3_impute.tgz + wget -c "http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh37.map.zip" + unzip plink.GRCh37.map.zip + rm plink.GRCh37.map.zip - check_md5sum + check_md5sum ) echo "All files downloaded successfully" diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/HRD_estimation.R b/resources/analysisTools/copyNumberEstimationWorkflow/HRD_estimation.R index 4264a51..6c174ea 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/HRD_estimation.R +++ b/resources/analysisTools/copyNumberEstimationWorkflow/HRD_estimation.R @@ -17,12 +17,14 @@ ploidy <- as.integer(args[4]) tcc <- as.numeric(args[5]) pid <- args[6] outfile <- args[7] -contributingSegmentsFile <- args[8] -centromerFile <- args[9] -cytobandsFile <- args[10] -pipelineDir <- args[11] -if( length(args)>11 ){ - cutoff <- as.numeric(args[12]) +contributingSegmentsFile_HRD_File <- args[8] +contributingSegments_LST_File <- args[9] +merged_reduced_outputFile = args[10] +centromerFile <- args[11] +cytobandsFile <- args[12] +pipelineDir <- args[13] +if( length(args)>13 ){ + cutoff <- as.numeric(args[14]) }else{ cutoff <- 0.7 } @@ -95,7 +97,7 @@ if(length(selNoChangeChr) != length(unique(merged.df$chromosome)) ){ merged.df <- merged.df[! merged.df$chromosome %in% selNoChangeChr,] index.HRDSmooth = which(grepl("LOH", merged.df$CNA.type) & merged.df$length>15000000 ) numberHRDSmooth <- length( index.HRDSmooth ) - write.table( merged.df[index.HRDSmooth,], contributingSegmentsFile, sep="\t", row.names=FALSE, quote=FALSE ) + write.table( merged.df[index.HRDSmooth,], contributingSegmentsFile_HRD_File, sep="\t", row.names=FALSE, quote=FALSE ) segmentsPerChr <- split(merged.df, merged.df$chromosome) @@ -142,13 +144,14 @@ if(length(selNoChangeChr) != length(unique(merged.df$chromosome)) ){ return(currentSegments.merged.df) }) mergedReduced.df = do.call(rbind, mergedReduced.df.List) + write.table( mergedReduced.df, merged_reduced_outputFile, sep="\t", row.names=FALSE, quote=FALSE ) index.HRDSmooth = which(grepl("LOH", merged.df$CNA.type) & merged.df$length>15000000 ) index.HRDSmoothReduced = which(grepl("LOH", mergedReduced.df$CNA.type) & mergedReduced.df$length>15000000 ) numberHRDSmooth <- length( index.HRDSmooth ) numberHRDSmoothReduced <- length( index.HRDSmoothReduced ) - write.table( merged.df[index.HRDSmooth,], contributingSegmentsFile, sep="\t", row.names=FALSE, quote=FALSE ) - write.table( mergedReduced.df[index.HRDSmoothReduced,], paste0(contributingSegmentsFile,".CentromerReduced.txt"), sep="\t", row.names=FALSE, quote=FALSE ) + write.table( merged.df[index.HRDSmooth,], contributingSegmentsFile_HRD_File, sep="\t", row.names=FALSE, quote=FALSE ) + write.table( mergedReduced.df[index.HRDSmoothReduced,], paste0(contributingSegmentsFile_HRD_File,".CentromerReduced.txt"), sep="\t", row.names=FALSE, quote=FALSE ) index.HRDLoss = which(grepl("(LOH)|(DEL)", merged.df$CNA.type) & merged.df$length>15000000 ) index.HRDLossReduced = which(grepl("(LOH)|(DEL)", mergedReduced.df$CNA.type) & mergedReduced.df$length>15000000 ) @@ -209,6 +212,7 @@ if(length(selNoChangeChr) != length(unique(merged.df$chromosome)) ){ # LSTReduced score i=1 LSTReduced=0 + LST_SEGMENTS_INDICES=c() for( j in 2:nrow(mergedReduced.df) ){ if( mergedReduced.df$chromosome[i] != mergedReduced.df$chromosome[j] | mergedReduced.df$length[i] < 10e6 | mergedReduced.df$length[j] <10e6 ){ i=i+1 @@ -220,9 +224,13 @@ if(length(selNoChangeChr) != length(unique(merged.df$chromosome)) ){ } if( abs(mergedReduced.df$tcnMean[j] - mergedReduced.df$tcnMean[i]) > cutoff){ LSTReduced=LSTReduced+1 + LST_SEGMENTS_INDICES=c(LST_SEGMENTS_INDICES,c(i,j)) } i=i+1 } + if (LSTReduced>0) { + write.table( mergedReduced.df[LST_SEGMENTS_INDICES,], contributingSegments_LST_File, sep="\t", row.names=FALSE, quote=FALSE ) + } } else { # all chromosomes have only 1 state numberHRDSmooth <- 0 diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_CRESTpoints.py b/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_CRESTpoints.py index 704afda..8536dfc 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_CRESTpoints.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_CRESTpoints.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_sv_points.py b/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_sv_points.py index 6301b53..c5847b2 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_sv_points.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/PSCBSgabs_plus_sv_points.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/addMappability.py b/resources/analysisTools/copyNumberEstimationWorkflow/addMappability.py index 779cae8..6144d44 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/addMappability.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/addMappability.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/add_haplotypes.py b/resources/analysisTools/copyNumberEstimationWorkflow/add_haplotypes.py index 3ef595c..9359e3f 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/add_haplotypes.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/add_haplotypes.py @@ -16,7 +16,7 @@ from python_modules import Tabfile -parser=argparse.ArgumentParser(description = "Add imputed haplotypes") +parser=argparse.ArgumentParser(description = "Add phased haplotypes") parser.add_argument('--inputsuffix', '-i', type = str, help = "suffix of vcf file with phased genotypes") parser.add_argument('--inputpath', '-p', type = str, help = "path and prefix to vcf file with phased genotypes") parser.add_argument('--out', '-o', default=sys.stdout, help = "Output file. Default: STDOUT") diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/artifact.homoDels.potentialArtifacts.hg38.txt b/resources/analysisTools/copyNumberEstimationWorkflow/artifact.homoDels.potentialArtifacts.hg38.txt new file mode 100755 index 0000000..44c353c --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/artifact.homoDels.potentialArtifacts.hg38.txt @@ -0,0 +1,8 @@ +#chromosome start end freq +chr2 131419966 132370117 216 +chr6 57329685 58280115 216 +chr11 48869983 50100000 216 +chr17 21549966 22200012 216 +chr18 14119957 14910027 216 +chr19 42699872 43300013 216 +chr20 25749915 26290005 216 diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/bashLib.sh b/resources/analysisTools/copyNumberEstimationWorkflow/bashLib.sh new file mode 100755 index 0000000..9769e89 --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/bashLib.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Copyright (c) 2017 The ACEseq workflow developers. +# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). + +dieWith() { + local msg="${1:?No error message}" + local ec="${2:-$?}" + echo "$msg: exit code $ec" >> /dev/stderr + exit "$ec" +} diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/beagle_create_fake_samples.py b/resources/analysisTools/copyNumberEstimationWorkflow/beagle_create_fake_samples.py new file mode 100755 index 0000000..5f9eae6 --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/beagle_create_fake_samples.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import argparse + +def is_comment_line(line): + return line.startswith("#") + +parser = argparse.ArgumentParser() +parser.add_argument('--in_file', help='Input .vcf file') +parser.add_argument('--out_file', help='Out .vcf file') +args = parser.parse_args() + +vcf_infile = open( args.in_file, "r" ) +outfile = open( args.out_file, "w" ) + +for vcf_line in vcf_infile: + + if is_comment_line(vcf_line): + if vcf_line.startswith("#CHROM"): + vcf_line = vcf_line.rstrip().split("\t") + vcf_line.append('sample0') + vcf_line = "\t".join( vcf_line )+"\n" + + else: + vcf_line = vcf_line.rstrip().split("\t") + vcf_line.append(vcf_line[9]) + vcf_line = "\t".join( vcf_line )+"\n" + + outfile.write( vcf_line ) diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/beagle_embed_haplotypes_vcf.py b/resources/analysisTools/copyNumberEstimationWorkflow/beagle_embed_haplotypes_vcf.py new file mode 100755 index 0000000..324ccb0 --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/beagle_embed_haplotypes_vcf.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +import argparse +import gzip + +def is_comment_line(line): + return line.startswith("#") + +parser = argparse.ArgumentParser() +parser.add_argument('--hap_file', help='Input file with phased haplotypes (i.e. output from Beagle)') +parser.add_argument('--vcf_file', help='Input .vcf file') +parser.add_argument('--out_file', help='Out .vcf file') +args = parser.parse_args() + +if args.hap_file.split('.')[-1] == 'gz': + hap_file = gzip.open( args.hap_file, "r" ) +else: + hap_file = open( args.hap_file, "r" ) +vcf_file = open( args.vcf_file, "r" ) +outfile = open( args.out_file, "w" ) + +hap_line = hap_file.readline() +while is_comment_line(hap_line): + hap_line = hap_file.readline() + +if hap_line: + hap_line = hap_line.rstrip().split() + +for vcf_line in vcf_file: + + if not is_comment_line(vcf_line): + + vcf_line = vcf_line.rstrip().split("\t") + vcf_line[0] = vcf_line[0].replace('chr', '') + vcf_line[0] = vcf_line[0].replace( 'X', '23' ) + + if len(vcf_line) >= 9 and hap_line: + + while hap_line and int(hap_line[1]) < int(vcf_line[1]): + hap_line = hap_file.readline() + + if hap_line: + hap_line = hap_line.rstrip().split() + + if hap_line and int(hap_line[1]) == int(vcf_line[1]): + + format_field = hap_line[8].split(":") + genotype_pos_hap = format_field.index( "GT" ) + format_field = vcf_line[8].split(":") + genotype_pos_vcf = format_field.index( "GT" ) + vcf_line[9] = vcf_line[9].split(":") + + vcf_line[9][genotype_pos_vcf] = hap_line[9].split(":")[genotype_pos_hap] + vcf_line[9] = ":".join( vcf_line[9] ) + + vcf_line = "\t".join( vcf_line )+"\n" + + outfile.write( vcf_line ) diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/centromers.hg38.txt b/resources/analysisTools/copyNumberEstimationWorkflow/centromers.hg38.txt new file mode 100755 index 0000000..bf1f5fb --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/centromers.hg38.txt @@ -0,0 +1,48 @@ +chr1 0 123400000 p gneg +chr1 123400000 248956422 q gneg +chr10 0 39800000 p gneg +chr10 39800000 133797422 q gneg +chr11 0 53400000 p gneg +chr11 53400000 135086622 q gneg +chr12 0 35500000 p gneg +chr12 35500000 133275309 q gneg +chr13 0 17700000 p gneg +chr13 17700000 114364328 q gneg +chr14 0 17200000 p gneg +chr14 17200000 107043718 q gneg +chr15 0 19000000 p gneg +chr15 19000000 101991189 q gneg +chr16 0 36800000 p gneg +chr16 36800000 90338345 q gneg +chr17 0 25100000 p gneg +chr17 25100000 83257441 q gneg +chr18 0 18500000 p gneg +chr18 18500000 80373285 q gneg +chr19 0 26200000 p gneg +chr19 26200000 58617616 q gneg +chr2 0 93900000 p gneg +chr2 93900000 242193529 q gneg +chr20 0 28100000 p gneg +chr20 28100000 64444167 q gneg +chr21 0 12000000 p gneg +chr21 12000000 46709983 q gneg +chr22 0 15000000 p gneg +chr22 15000000 50818468 q gneg +chr3 0 90900000 p gneg +chr3 90900000 198295559 q gneg +chr4 0 50000000 p gneg +chr4 50000000 190214555 q gneg +chr5 0 48800000 p gneg +chr5 48800000 181538259 q gneg +chr6 0 59800000 p gneg +chr6 59800000 170805979 q gneg +chr7 0 60100000 p gneg +chr7 60100000 159345973 q gneg +chr8 0 45200000 p gneg +chr8 45200000 145138636 q gneg +chr9 0 43000000 p gneg +chr9 43000000 138394717 q gneg +chrX 0 61000000 p gneg +chrX 61000000 156040895 q gneg +chrY 0 10400000 p gneg +chrY 10400000 57227415 q gneg diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/clusteredPrunedNormal.sh b/resources/analysisTools/copyNumberEstimationWorkflow/clusteredPrunedNormal.sh index 055a87d..8b068ff 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/clusteredPrunedNormal.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/clusteredPrunedNormal.sh @@ -38,9 +38,13 @@ then fi mv ${tmpClusteredSeg} ${FILENAME_CLUSTERED_SEGMENTS} -mv ${tmpSnpsOut} ${FILENAME_ALL_SNP_UPDATE2} -$TABIX_BINARY -f -s 1 -b 2 -e 2 ${FILENAME_ALL_SNP_UPDATE2} +# Not sure why there is a NULL line in the end of the segments file +zcat ${tmpSnpsOut} | grep -v NULL | bgzip -f > ${tmpSnpsOut}.2 +mv ${tmpSnpsOut}.2 ${FILENAME_ALL_SNP_UPDATE2} +rm ${tmpSnpsOut} + +$TABIX_BINARY -f -s 1 -b 2 -e 2 --comment chromosome ${FILENAME_ALL_SNP_UPDATE2} if [[ "$?" != 0 ]] then diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/convertTabToJson.py b/resources/analysisTools/copyNumberEstimationWorkflow/convertTabToJson.py index dae98a1..0324034 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/convertTabToJson.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/convertTabToJson.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias.sh b/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias.sh index cb6c147..3e98190 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias.sh @@ -4,25 +4,25 @@ # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). -tmp_corrected_windowfile="$FILENAME_GC_CORRECTED_WINDOWS.tmp" -tmp_corrected_table="$FILENAME_GC_CORRECTED_QUALITY.tmp" -corrected_table_slim="$FILENAME_GC_CORRECTED_QUALITY.slim.txt" - -$RSCRIPT_BINARY "$TOOL_CORRECT_GC_BIAS_R" \ - --windowFile "$FILENAME_COV_WINDOWS_WG" \ - --timefile "$REPLICATION_TIME_FILE" \ - --chrLengthFile "$CHROMOSOME_LENGTH_FILE" \ - --pid "$PID" \ - --email "$EMAIL" \ - --outfile "$tmp_corrected_windowfile" \ - --corPlot "$FILENAME_GC_CORRECT_PLOT" \ - --corTab "$tmp_corrected_table" \ - --qcTab "$corrected_table_slim" \ - --gcFile "$GC_CONTENT_FILE" \ - --outDir "$aceseqOutputDirectory" \ - --lowess_f "$LOWESS_F" \ - --scaleFactor "$SCALE_FACTOR" \ - --coverageYlims "$COVERAGEPLOT_YLIMS" +tmp_corrected_windowfile=${FILENAME_GC_CORRECTED_WINDOWS}.tmp +tmp_corrected_table=${FILENAME_GC_CORRECTED_QUALITY}.tmp +corrected_table_slim=${FILENAME_GC_CORRECTED_QUALITY}.slim.txt + +${RSCRIPT_BINARY} "${TOOL_CORRECT_GC_BIAS_R}" \ + --windowFile "${FILENAME_COV_WINDOWS_WG}" \ + --timefile "${REPLICATION_TIME_FILE}" \ + --chrLengthFile "${CHROMOSOME_LENGTH_FILE}" \ + --pid "${PID}" \ + --email "${EMAIL}" \ + --outfile "${tmp_corrected_windowfile}" \ + --corPlot "${FILENAME_GC_CORRECT_PLOT}" \ + --corTab "${tmp_corrected_table}" \ + --qcTab "${corrected_table_slim}" \ + --gcFile "${GC_CONTENT_FILE}" \ + --outDir "${aceseqOutputDirectory}" \ + --lowess_f "${LOWESS_F}" \ + --scaleFactor "${SCALE_FACTOR}" \ + --coverageYlims "${COVERAGEPLOT_YLIMS}" diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias_1kb.sh b/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias_1kb.sh index 87f4d1b..5f582f5 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias_1kb.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/correct_gc_bias_1kb.sh @@ -6,18 +6,18 @@ tmp_corrected_windowfile="$FILENAME_GC_CORRECTED_WINDOWS.tmp" -$RSCRIPT_BINARY "$TOOL_CORRECT_GC_BIAS_R \ - --windowFile "$cnvSnpOutputDirectory/$PID".all.cnv.chr*.tab \ - --timefile "$REPLICATION_TIME_FILE" \ - --chrLengthFile $CHROMOSOME_LENGTH_FILE" \ - --pid "$PID" \ - --email "$EMAIL" \ - --outfile "$tmp_corrected_windowfile" \ - --corPlot "$FILENAME_GC_CORRECT_PLOT" \ - --gcFile "$GC_CONTENT_FILE" \ - --outDir "$aceseqOutputDirectory" \ - --lowess_f "$LOWESS_F" \ - --scaleFactor "$SCALE_FACTOR" +${RSCRIPT_BINARY} "${TOOL_CORRECT_GC_BIAS_R}" \ + --windowFile "${cnvSnpOutputDirectory}/${PID}.all.cnv.chr*.tab" \ + --timefile "${REPLICATION_TIME_FILE}" \ + --chrLengthFile "${CHROMOSOME_LENGTH_FILE}" \ + --pid "${PID}" \ + --email "${EMAIL}" \ + --outfile "${tmp_corrected_windowfile}" \ + --corPlot "${FILENAME_GC_CORRECT_PLOT}" \ + --gcFile "${GC_CONTENT_FILE}" \ + --outDir "${aceseqOutputDirectory}" \ + --lowess_f "${LOWESS_F}" \ + --scaleFactor "${SCALE_FACTOR}" diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/createUnphasedFiles.sh b/resources/analysisTools/copyNumberEstimationWorkflow/createUnphasedFiles.sh index f1de902..a87fdd7 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/createUnphasedFiles.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/createUnphasedFiles.sh @@ -12,7 +12,7 @@ zcat $dbSNP_FILE | perl $TOOL_ANNOTATE_CNV_VCF \ --columnName genotype \ --aColNameLineStart "#CHROM" | \ grep -v "^#" | \ - $PERL_BINARY $TOOL_PARSE_VCF ${imputeOutputDirectory} $unphasedGenotypesFilePrefix $unphasedGenotypesFileSuffix + $PERL_BINARY $TOOL_PARSE_VCF ${phasingOutputDirectory} $unphasedGenotypesFilePrefix $unphasedGenotypesFileSuffix $CHR_PREFIX if [[ $? != 0 ]] diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/datatablePSCBSgaps.sh b/resources/analysisTools/copyNumberEstimationWorkflow/datatablePSCBSgaps.sh index 8015303..672b49e 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/datatablePSCBSgaps.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/datatablePSCBSgaps.sh @@ -27,7 +27,7 @@ fi mv ${tmp_pscbsData} ${FILE_PSCBS_DATA} -${TABIX_BINARY} -f -s 2 -b 1 ${FILE_PSCBS_DATA} +${TABIX_BINARY} -f -s 2 -b 1 --comment a ${FILE_PSCBS_DATA} if [[ "$?" != 0 ]] then diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/embed_haplotypes_vcf.py b/resources/analysisTools/copyNumberEstimationWorkflow/embed_haplotypes_vcf.py index 20a9ed0..1e56a84 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/embed_haplotypes_vcf.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/embed_haplotypes_vcf.py @@ -1,10 +1,13 @@ - #!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). from python_modules import Options +def is_comment_line(line): + return line.startswith("#") + options = Options.parse( { "hap_file" : str, "vcf_file" : str, "outfile" : str } ) if options: @@ -20,7 +23,7 @@ for vcf_line in vcf_infile: - if vcf_line[0] != "#": + if not is_comment_line(vcf_line): vcf_line = vcf_line.rstrip().split("\t") vcf_line[0] = vcf_line[0].replace('chr', '') diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/environments/tbi-cluster.sh b/resources/analysisTools/copyNumberEstimationWorkflow/environments/tbi-cluster.sh index ddf12bd..b4bb097 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/environments/tbi-cluster.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/environments/tbi-cluster.sh @@ -10,8 +10,9 @@ module load "R/$RSCRIPT_VERSION" module load "bedtools/$BEDTOOLS_VERSION" module load "samtools/$SAMTOOLS_VERSION" module load "vcftools/$VCFTOOLS_VERSION" +module load "beagle/$BEAGLE_VERSION" -source /odcf/cluster/virtualenvs/warsow/python_2.7.9_SNVCalling_1.2.166-1/bin/activate +source /dkfz/cluster/virtualenvs/warsow/python_2.7.9_SNVCalling_1.2.166-1/bin/activate export BGZIP_BINARY=bgzip export TABIX_BINARY=tabix @@ -32,5 +33,10 @@ module load "vcftools/$VCFTOOLS_VERSION" export VCFTOOLS_SORT_BINARY=vcf-sort module load "samtools/$SAMTOOLS_VERSION" -export BCFTOOLS_BINARY=bcftools export SAMTOOLS_BINARY=samtools + +module load "bcftools/$BCFTOOLS_VERSION" +export BCFTOOLS_BINARY=bcftools + +module load "java/$JAVA_VERSION" +export JAVA_BINARY=java diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/estimateHRDScore.sh b/resources/analysisTools/copyNumberEstimationWorkflow/estimateHRDScore.sh index 427de7e..d3362f8 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/estimateHRDScore.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/estimateHRDScore.sh @@ -19,9 +19,7 @@ do eval $item done combProFile=${aceseqOutputDirectory}/${pid}_comb_pro_extra${ploidyFactor}_${tcc}.txt - HRDFile=${aceseqOutputDirectory}/${pid}_HRDscore_${ploidyFactor}_${tcc}.txt - HRD_DETAILS_FILE=${aceseqOutputDirectory}/${pid}_HRDscore_contributingSegments_${ploidyFactor}_${tcc}.txt - echo $combProFile + mostImportantFile=${aceseqOutputDirectory}/${pid}_comb_pro_extra${ploidyFactor}_${tcc}.txt ##remove artifact regions @@ -92,19 +90,25 @@ do exit 2 fi + HRDFile=${aceseqOutputDirectory}/${pid}_HRDscore_${ploidyFactor}_${tcc}.txt + HRD_DETAILS_FILE=${aceseqOutputDirectory}/${pid}_HRDscore_contributingSegments_${ploidyFactor}_${tcc}.txt + LST_DETAILS_FILE=${aceseqOutputDirectory}/${pid}_LSTscore_contributingSegments_${ploidyFactor}_${tcc}.CentromerReduced.txt + MERGED_REDUCED_FILE=${aceseqOutputDirectory}/${pid}_comb_pro_extra${ploidyFactor}_${tcc}.smoothed.CentromerReduced.txt ${RSCRIPT_BINARY} ${TOOL_HRD_ESTIMATION} \ $combProFileNoArtifacts \ - $combProFile.tmp \ + ${combProFile}.tmp \ $patientsex \ $ploidy \ $tcc \ $pid \ - $HRDFile.tmp \ + ${HRDFile}.tmp \ ${HRD_DETAILS_FILE}.tmp \ + ${LST_DETAILS_FILE}.tmp \ + ${MERGED_REDUCED_FILE}.tmp \ ${FILENAME_CENTROMERES} \ ${cytobandsFile} \ - $PIPELINE_DIR + ${PIPELINE_DIR} if [[ "$?" != 0 ]] @@ -122,9 +126,11 @@ do exit 2 fi - mv $HRDFile.tmp $HRDFile + mv ${HRDFile}.tmp ${HRDFile} mv ${HRD_DETAILS_FILE}.tmp ${HRD_DETAILS_FILE} - rm $combProFile.tmp + mv ${LST_DETAILS_FILE}.tmp ${LST_DETAILS_FILE} + mv ${MERGED_REDUCED_FILE}.tmp ${MERGED_REDUCED_FILE} + rm ${combProFile}.tmp done if [[ "$?" != 0 ]] then @@ -132,4 +138,4 @@ then exit 2 fi rm ${FILENAME_PARAMETER_JSON}.tmp -touch $FILENAME_CHECKPOINT_HRD +touch ${FILENAME_CHECKPOINT_HRD} diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/extract_genotype_vcf.py b/resources/analysisTools/copyNumberEstimationWorkflow/extract_genotype_vcf.py index bd00d6b..fca795f 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/extract_genotype_vcf.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/extract_genotype_vcf.py @@ -1,10 +1,13 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). from python_modules import Options +def is_comment_line(line): + return line.startswith("#") + options = Options.parse( { "vcf_file" : str, "outfile" : str } ) if options: @@ -12,7 +15,7 @@ outfile = open( options["outfile" ], "w" ) for line in vcf_infile: - if line[0] != "#": + if not is_comment_line(line): line = line.rstrip("\n").split("\t") if len(line) > 9: gt_index = line[8].split(":").index("GT") diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/getFinalPurityPloidy.py b/resources/analysisTools/copyNumberEstimationWorkflow/getFinalPurityPloidy.py index 7ebc344..3b69399 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/getFinalPurityPloidy.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/getFinalPurityPloidy.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/group_genotypes.py b/resources/analysisTools/copyNumberEstimationWorkflow/group_genotypes.py index 07ee438..bd0db7e 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/group_genotypes.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/group_genotypes.py @@ -13,7 +13,7 @@ from python_modules import Tabfile -parser=argparse.ArgumentParser(description = "Group imputed haplotypes") +parser=argparse.ArgumentParser(description = "Group phased haplotypes") parser.add_argument('--infile', '-i', type = file, help = "vcf file with phased genotypes") parser.add_argument('--out', '-o', help = "Output file. Default: STDOUT") parser.add_argument('--minHT', '-m', type = int, default = 5, help = 'Minimum number of phased haplotypes for segment to be considered. Default=5') diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/homozygDel.sh b/resources/analysisTools/copyNumberEstimationWorkflow/homozygDel.sh index f8651c5..14b0eb6 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/homozygDel.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/homozygDel.sh @@ -44,7 +44,7 @@ fi mv ${tmpSegsHomDel} ${FILENAME_SEGMENTS_W_HOMDEL} -$TABIX_BINARY -f -s 1 -b 2 -e 3 "${FILENAME_SEGMENTS_W_HOMDEL}" +$TABIX_BINARY -f -s 1 -b 2 -e 3 --comment chromosome "${FILENAME_SEGMENTS_W_HOMDEL}" # #if [[ "$?" != 0 ]] diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/impute2 b/resources/analysisTools/copyNumberEstimationWorkflow/impute2 deleted file mode 100755 index 9b7c1da..0000000 Binary files a/resources/analysisTools/copyNumberEstimationWorkflow/impute2 and /dev/null differ diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/impute2.sh b/resources/analysisTools/copyNumberEstimationWorkflow/impute2.sh deleted file mode 100755 index dfa7daf..0000000 --- a/resources/analysisTools/copyNumberEstimationWorkflow/impute2.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2017 The ACEseq workflow developers. -# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). - - -#DEFINE FILE NAMES - -PHASED_GENOTYPE="${FILE_PHASED_GENOTYPE}${CHR_NAME}.${FILE_TXT_SUF}" -PHASED_HAPS="${PHASED_GENOTYPE}_haps" -PHASED_HAPS_CONF="${PHASED_GENOTYPE}_haps_confidence" -PHASED_INFO="${PHASED_GENOTYPE}_info" -PHASED_INFO_SAMPLE="${PHASED_GENOTYPE}_info_by_sample" -PHASED_SUMMARY="${PHASED_GENOTYPE}_summary" -PHASED_WARNINGS="${PHASED_GENOTYPE}_warnings" -tmpPhased="${FILENAME_PHASED_GENOTYPES}_tmp" -tmpHaploblocks="${FILENAME_HAPLOBLOCK_GROUPS}_tmp" -UNPHASED="${FILE_UNPHASED_PRE}${CHR_NAME}.${FILE_VCF_SUF}" -UNPHASED_GENOTYPE="${FILE_UNPHASED_GENOTYPE}${CHR_NAME}.${FILE_TXT_SUF}" - -if [[ ${isNoControlWorkflow} == false ]] -then - - source ${TOOL_ANALYZE_BAM_HEADER} - getRefGenomeAndChrPrefixFromHeader ${FILE_CONTROL_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME - - CHR_NR=${CHR_PREFIX}${CHR_NAME:?CHR_NAME is not set} - - ${SAMTOOLS_BINARY} mpileup ${CNV_MPILEUP_OPTS} -u \ - -f "${REFERENCE_GENOME}" \ - -r ${CHR_NR} \ - "${FILE_CONTROL_BAM}" \ - | \ - ${BCFTOOLS_BINARY} view ${BCFTOOLS_OPTS} - \ - > "${UNPHASED}" - - if [[ "$?" != 0 ]] - then - echo "Non zero exit status for mpileup in impute2.sh" >> /dev/stderr - exit 2 - fi -fi - -${PYTHON_BINARY} "${TOOL_EXTRACT_GENOTYPE_VCF}" \ - --vcf_file "${UNPHASED}" \ - --outfile "${UNPHASED_GENOTYPE}" - -if [[ "$?" != 0 ]] -then - echo "Non zero exit status while extracting genotype in impute2.sh" >> /dev/stderr - exit 2 -fi - - # This runs the impute2 haplotype imputation program - # strand_g option necessary? - echo -n > "${PHASED_HAPS}" - echo -n > "${PHASED_HAPS_CONF}" - echo -n > "${PHASED_INFO}" - echo -n > "${PHASED_INFO_SAMPLE}" - echo -n > "${PHASED_SUMMARY}" - echo -n > "${PHASED_WARNINGS}" - - # For smaller chromosomes, impute2 will simply not print output for the - # last few segments. - for SEGMENT in $(seq 0 50) - do - PHASED_GENOTYPE_PART="${FILE_PHASED_GENOTYPE}${CHR_NAME}.part${SEGMENT}.${FILE_TXT_SUF}" - PHASED_HAPS_PART="${PHASED_GENOTYPE_PART}_haps" - PHASED_HAPS_CONF_PART="${PHASED_GENOTYPE_PART}_haps_confidence" - PHASED_INFO_PART="${PHASED_GENOTYPE_PART}_info" - PHASED_INFO_SAMPLE_PART="${PHASED_GENOTYPE_PART}_info_by_sample" - PHASED_SUMMARY_PART="${PHASED_GENOTYPE_PART}_summary" - PHASED_WARNINGS_PART="${PHASED_GENOTYPE_PART}_warnings" - - echo -n > "${PHASED_GENOTYPE_PART}" - echo -n > "${PHASED_HAPS_PART}" - echo -n > "${PHASED_HAPS_CONF_PART}" - echo -n > "${PHASED_INFO_PART}" - echo -n > "${PHASED_INFO_SAMPLE_PART}" - echo -n > "${PHASED_SUMMARY_PART}" - echo -n > "${PHASED_WARNINGS_PART}" - - ${TOOL_IMPUTE} \ - -seed 25041988 \ - -phase \ - -m $(echo "${GENETIC_MAP_FILE}" | sed "s/\${CHR_NR}/${CHR_NAME}/g") \ - -h $(echo "${KNOWN_HAPLOTYPES_FILE}" | sed "s/\${CHR_NR}/${CHR_NAME}/g") \ - -l $(echo "${KNOWN_HAPLOTYPES_LEGEND_FILE}" | sed "s/\${CHR_NR}/${CHR_NAME}/g") \ - -g "${UNPHASED_GENOTYPE}" \ - -int $[5000000*${SEGMENT}] $[5000000*${SEGMENT} + 4999999] \ - -Ne 20000 \ - -o "${PHASED_GENOTYPE_PART}" 2>&1 /dev/stderr - - if [[ "$?" != 0 ]] - then - if [[ $test == "test" ]] - then - grep 'ERROR: There are no type 2 SNPs after applying the command-line settings for this run, which makes it impossible to perform imputation.' ${target_dir}/phasing/phased_genotypes_chr${CHR_NAME}_part${SEGMENT}.txt_summary > ${target_dir}/phasing/exit_check_temp.txt - - var=$(ls -s1 ${target_dir}/phasing/exit_check_temp.txt | awk '{print $1}') - - if [[ $var == 0 ]] - then - echo "WARNING: Non zero exit status during segmentation of segment ${SEGMENT} on chr ${CHR_NAME} in impute2.sh" >> /dev/stderr - exit 2 - else - echo "Warning of no type 2 SNPs was issued but is ignored during segmentation of segment ${SEGMENT} on chr ${CHR_NAME} in impute2.sh" >> /dev/stderr - fi - - rm ${target_dir}/phasing/exit_check_temp.txt - var=0 - else - echo "WARNING: Non zero exit status during segmentation of segment ${SEGMENT} on chr ${CHR_NAME} in impute2.sh" >> /dev/stderr - exit 2 - - fi - - fi - - cat "${PHASED_HAPS_PART}" \ - >> "${PHASED_HAPS}" - cat "${PHASED_HAPS_CONF_PART}" \ - >> "${PHASED_HAPS_CONF}" - cat "${PHASED_INFO_PART}" \ - >> "${PHASED_INFO}" - cat "${PHASED_INFO_SAMPLE_PART}" \ - >> "${PHASED_INFO_SAMPLE}" - cat "${PHASED_SUMMARY_PART}" \ - >> "${PHASED_SUMMARY}" - cat "${PHASED_WARNINGS_PART}" \ - >> "${PHASED_WARNINGS}" - - rm "${PHASED_GENOTYPE_PART}" \ - "${PHASED_HAPS_PART}" \ - "${PHASED_HAPS_CONF_PART}" \ - "${PHASED_INFO_PART}" \ - "${PHASED_INFO_SAMPLE_PART}" \ - "${PHASED_SUMMARY_PART}" \ - "${PHASED_WARNINGS_PART}" - done - -${PYTHON_BINARY} "${TOOL_EMBED_HAPLOTYPES_VCF}" \ - --hap_file "${PHASED_HAPS}" \ - --vcf_file "${UNPHASED}" \ - --outfile "${tmpPhased}" - -if [[ "$?" != 0 ]] -then - echo "Non zero exit status while embedding haplotypes in impute2.sh" >> /dev/stderr - exit 2 -fi - - -${PYTHON_BINARY} "${TOOL_GROUP_HAPLOTYPES}" \ - --infile "${tmpPhased}" \ - --out "${tmpHaploblocks}" \ - --minHT ${minHT} - -if [[ "$?" != 0 ]] -then - echo "Non zero exit status while grouping haplotypes in impute2.sh" >> /dev/stderr - exit 2 -fi - -mv $tmpPhased ${FILENAME_PHASED_GENOTYPES} -mv $tmpHaploblocks ${FILENAME_HAPLOBLOCK_GROUPS} diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/impute2_X.sh b/resources/analysisTools/copyNumberEstimationWorkflow/impute2_X.sh deleted file mode 100755 index e56c4ab..0000000 --- a/resources/analysisTools/copyNumberEstimationWorkflow/impute2_X.sh +++ /dev/null @@ -1,182 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2017 The ACEseq workflow developers. -# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). - -if [[ ${isNoControlWorkflow} == false ]]; then - source ${TOOL_ANALYZE_BAM_HEADER} - getRefGenomeAndChrPrefixFromHeader ${FILE_CONTROL_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME -fi - -CHR_NAME=X -CHR_NR=${CHR_PREFIX}${CHR_NAME} - -#DEFINE FILENAMES -PHASED_GENOTYPE=${FILE_PHASED_GENOTYPE}${CHR_NAME}.${FILE_TXT_SUF} -PHASED_HAPS=${PHASED_GENOTYPE}_haps -PHASED_HAPS_CONF=${PHASED_GENOTYPE}_haps_confidence -PHASED_INFO=${PHASED_GENOTYPE}_info -PHASED_INFO_SAMPLE=${PHASED_GENOTYPE}_info_by_sample -PHASED_SUMMARY=${PHASED_GENOTYPE}_summary -PHASED_WARNINGS=${PHASED_GENOTYPE}_warnings -tmpphased=${FILENAME_PHASED_GENOTYPES}_tmp #These two files should have 23 as chromosomes name rather than 'X' -tmphaploblocks=${FILENAME_HAPLOBLOCK_GROUPS}_tmp -UNPHASED="${FILE_UNPHASED_PRE}${CHR_NAME}.${FILE_VCF_SUF}" -FILENAME_UNPHASED_GENOTYPE=${FILE_UNPHASED_GENOTYPE}${CHR_NAME}.${FILE_TXT_SUF} - - -#check whether the patient is female or male -if grep -Pv 'female|klinefelter' "${FILENAME_SEX}" - then - echo "Patient $PID is male." - echo " " >"${FILENAME_HAPLOBLOCK_GROUPS}" - echo "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_control_${PID}" >"${FILENAME_PHASED_GENOTYPES}" - exit 0 - fi - -if [[ ${isNoControlWorkflow} == false ]] -then - - ${SAMTOOLS_BINARY} mpileup ${CNV_MPILEUP_OPTS} -u \ - -f "${REFERENCE_GENOME}" \ - -r ${CHR_NR} \ - "${FILE_CONTROL_BAM}" \ - | \ - ${BCFTOOLS_BINARY} view ${BCFTOOLS_OPTS} - \ - > "${UNPHASED}" - - if [[ "$?" != 0 ]] - then - echo "Non zero exit status for mpileup in impute2.sh" - exit 2 - fi -fi - -${PYTHON_BINARY} "${TOOL_EXTRACT_GENOTYPE_VCF}" \ - --vcf_file "${UNPHASED}" \ - --outfile "${FILENAME_UNPHASED_GENOTYPE}" - -if [[ "$?" != 0 ]] -then - echo "Non zero exit status while extracting genotype in impute2.sh" - exit 2 -fi - - #create sample_g file - echo "ID_1 ID_2 missing sex" > "${FILE_SAMPLE_G}" - echo "0 0 0 D" >> "${FILE_SAMPLE_G}" - echo "${PID} ${PID} 0 2" >> "${FILE_SAMPLE_G}" - - - # This runs the impute2 haplotype imputation program - # strand_g option necessary? - echo -n> "${PHASED_HAPS}" - echo -n> "${PHASED_HAPS_CONF}" - echo -n> "${PHASED_INFO}" - echo -n> "${PHASED_INFO_SAMPLE}" - echo -n> "${PHASED_SUMMARY}" - echo -n> "${PHASED_WARNINGS}" - - # For smaller chromosomes, impute2 will simply not print output for the - # last few segments. - for SEGMENT in $(seq 0 50) - do - PHASED_GENOTYPE_PART=${FILE_PHASED_GENOTYPE}${CHR_NAME}.part${SEGMENT}.${FILE_TXT_SUF} - PHASED_HAPS_PART=${PHASED_GENOTYPE_PART}_haps - PHASED_HAPS_CONF_PART=${PHASED_GENOTYPE_PART}_haps_confidence - PHASED_INFO_PART=${PHASED_GENOTYPE_PART}_info - PHASED_INFO_SAMPLE_PART=${PHASED_GENOTYPE_PART}_info_by_sample - PHASED_SUMMARY_PART=${PHASED_GENOTYPE_PART}_summary - PHASED_WARNINGS_PART=${PHASED_GENOTYPE_PART}_warnings - - echo -n> "${PHASED_GENOTYPE_PART}" - echo -n> "${PHASED_HAPS_PART}" - echo -n> "${PHASED_HAPS_CONF_PART}" - echo -n> "${PHASED_INFO_PART}" - echo -n> "${PHASED_INFO_SAMPLE_PART}" - echo -n> "${PHASED_SUMMARY_PART}" - echo -n> "${PHASED_WARNINGS_PART}" - - ${TOOL_IMPUTE} \ - -seed 25041988 \ - -phase \ - -m $(echo "${GENETIC_MAP_FILE_X}" | sed "s/\${CHR_NR}/${CHR_NAME}/g") \ - -h $(echo "${KNOWN_HAPLOTYPES_FILE_X}" | sed "s/\${CHR_NR}/${CHR_NAME}/g") \ - -l $(echo "${KNOWN_HAPLOTYPES_LEGEND_FILE_X}" | sed "s/\${CHR_NR}/${CHR_NAME}/g") \ - -g "${FILENAME_UNPHASED_GENOTYPE}" \ - -int $[5000000*${SEGMENT}] $[5000000*${SEGMENT} + 4999999] \ - -Ne 20000 \ - -o "${PHASED_GENOTYPE_PART}" - - if [[ "$?" != 0 ]] - then - if [[ $test == "test" ]] - then - grep 'ERROR: There are no type 2 SNPs after applying the command-line settings for this run, which makes it impossible to perform imputation.' ${target_dir}/phasing/phased_genotypes_chr${CHR_NAME}_part${SEGMENT}.txt_summary > ${target_dir}/phasing/exit_check_temp.txt - - var=$(ls -s1 ${target_dir}/phasing/exit_check_temp.txt | awk '{print $1}') - - if [[ $var == 0 ]] - then - echo "WARNING: Non zero exit status during segmentation of segment ${SEGMENT} on chr ${CHR_NAME} in impute2.sh" - exit 2 - else - echo "Warning of no type 2 SNPs was issued but is ignored during segmentation of segment ${SEGMENT} on chr ${CHR_NAME} in impute2.sh" - fi - - rm ${target_dir}/phasing/exit_check_temp.txt - var=0 - else - echo "WARNING: Non zero exit status during segmentation of segment ${SEGMENT} on chr ${CHR_NAME} in impute2.sh" - exit 2 - - fi - - fi - - cat "${PHASED_HAPS_PART}" \ - >> "${PHASED_HAPS}" - cat "${PHASED_HAPS_CONF_PART}" \ - >> "${PHASED_HAPS_CONF}" - cat "${PHASED_INFO_PART}" \ - >> "${PHASED_INFO}" - cat "${PHASED_INFO_SAMPLE_PART}" \ - >> "${PHASED_INFO_SAMPLE}" - cat "${PHASED_SUMMARY_PART}" \ - >> "${PHASED_SUMMARY}" - cat "${PHASED_WARNINGS_PART}" \ - >> "${PHASED_WARNINGS}" - - rm "${PHASED_GENOTYPE_PART}" \ - "${PHASED_HAPS_PART}" \ - "${PHASED_HAPS_CONF_PART}" \ - "${PHASED_INFO_PART}" \ - "${PHASED_INFO_SAMPLE_PART}" \ - "${PHASED_SUMMARY_PART}" \ - "${PHASED_WARNINGS_PART}" - done - -${PYTHON_BINARY} "${TOOL_EMBED_HAPLOTYPES_VCF}" \ - --hap_file "${PHASED_HAPS}" \ - --vcf_file "${UNPHASED}" \ - --outfile "${tmpphased}" - -if [[ "$?" != 0 ]] -then - echo "Non zero exit status while embedding haplotypes in impute2.sh" - exit 2 -fi - -${PYTHON_BINARY} "${TOOL_GROUP_HAPLOTYPES}" \ - --infile "${tmpphased}" \ - --out "${tmphaploblocks}" \ - --minHT ${minHT} - -if [[ "$?" != 0 ]] -then - echo "Non zero exit status while grouping haplotypes in impute2.sh" - exit 2 -fi - -mv ${tmpphased} ${FILENAME_PHASED_GENOTYPES} -mv ${tmphaploblocks} ${FILENAME_HAPLOBLOCK_GROUPS} diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/manual_pruning.R b/resources/analysisTools/copyNumberEstimationWorkflow/manual_pruning.R index 31c5f1b..611bfa9 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/manual_pruning.R +++ b/resources/analysisTools/copyNumberEstimationWorkflow/manual_pruning.R @@ -360,7 +360,12 @@ if (clustering_YN == "yes") { cat(paste0("cluster_matrix nrow: ",nrow(cluster_matrix),"\n")) cat(paste0("cluster_matrix ncol: ",ncol(cluster_matrix),"\n")) cat(paste0("min_num_cluster: ",min_num_cluster,"\n")) - d_clust <- Mclust(cluster_matrix, G=min_num_cluster:20) + if (runInDebugMode == "true") { + beVerbose = T + } else { + beVerbose = F + } + d_clust <- Mclust(cluster_matrix, G=min_num_cluster:20, verbose = beVerbose) cat(paste0(Sys.time(),": finished Mclust...\n")) m.best <- dim(d_clust$z)[2] diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/mergeArtifacts.py b/resources/analysisTools/copyNumberEstimationWorkflow/mergeArtifacts.py index ba497d3..c49a240 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/mergeArtifacts.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/mergeArtifacts.py @@ -1,7 +1,7 @@ -#!/usr/bin/python - -# Copyright (c) 2017 The ACEseq workflow developers. -# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). +#!/usr/bin/env python + +# Copyright (c) 2017 The ACEseq workflow developers. +# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_cnv.py b/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_cnv.py index 52a722f..940b816 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_cnv.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_cnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_snp.py b/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_snp.py index d7811fd..bfcefd2 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_snp.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/merge_and_filter_snp.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/parseJson.py b/resources/analysisTools/copyNumberEstimationWorkflow/parseJson.py index 74b82fd..bed651f 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/parseJson.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/parseJson.py @@ -1,5 +1,5 @@ - +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/parseVcf.pl b/resources/analysisTools/copyNumberEstimationWorkflow/parseVcf.pl index bc478a1..b156725 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/parseVcf.pl +++ b/resources/analysisTools/copyNumberEstimationWorkflow/parseVcf.pl @@ -5,7 +5,7 @@ use strict; use warnings; -my %filehandle; my $filename; my $filenametmp; +my %filehandle; my $filename; my $filenametmp; my $chr_prefix; for (my $i=1; $i < 25; $i++){ my $id = $i; @@ -24,6 +24,7 @@ $fields[9] = $fields[8]; $fields[8] = "GT"; if ($filehandle{$fields[0]}){ + $fields[0] = "$ARGV[3]$fields[0]"; print {$filehandle{$fields[0]}} join("\t", @fields); } } diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/phasing.sh b/resources/analysisTools/copyNumberEstimationWorkflow/phasing.sh new file mode 100755 index 0000000..33715ce --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/phasing.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -ue -o pipefail + +# Copyright (c) 2017 The ACEseq workflow developers. +# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). + +source "$TOOL_BASH_LIB" + +#DEFINE FILE NAMES +UNPHASED="$FILE_UNPHASED_PRE$CHR_NAME.$FILE_VCF_SUF" +UNPHASED_TWOSAMPLES="$FILE_UNPHASED_PRE${CHR_NAME}_2samples.$FILE_VCF_SUF" +PHASED_TWOSAMPLES="$FILE_PHASED_GENOTYPE${CHR_NAME}_2samples" +tmpPhased="${FILENAME_PHASED_GENOTYPES}_tmp" +tmpHaploblocks="${FILENAME_HAPLOBLOCK_GROUPS}_tmp" + +if [[ "$isNoControlWorkflow" == false ]] +then + + source "$TOOL_ANALYZE_BAM_HEADER" + getRefGenomeAndChrPrefixFromHeader "$FILE_CONTROL_BAM" # Sets CHR_PREFIX and REFERENCE_GENOME + + CHR_NR="$CHR_PREFIX${CHR_NAME:?CHR_NAME is not set}" + + $BCFTOOLS_BINARY mpileup $CNV_MPILEUP_OPTS -O u \ + -f "$REFERENCE_GENOME" \ + -r "$CHR_NR" \ + "$FILE_CONTROL_BAM" \ + | \ + $BCFTOOLS_BINARY call $BCFTOOLS_OPTS - \ + > "$UNPHASED" \ + || dieWith "Non zero exit status for mpileup in phasing.sh" + +fi + +echo -n > "$UNPHASED_TWOSAMPLES" +echo -n > "$tmpPhased" +echo -n > "$tmpHaploblocks" + +$PYTHON_BINARY "$TOOL_BEAGLE_CREATE_FAKE_SAMPLES" \ + --in_file "$UNPHASED" \ + --out_file "$UNPHASED_TWOSAMPLES" \ + || dieWith "Non zero exit status while creating 2nd sample in vcf-file in phasing.sh" + +export JAVA_OPTIONS=$BEAGLE_JAVA_OPTS +$BEAGLE_BINARY \ + gt="$UNPHASED_TWOSAMPLES" \ + ref="$BEAGLE_REFERENCE_FILE" \ + out="$PHASED_TWOSAMPLES" \ + map="$BEAGLE_GENETIC_MAP" \ + impute=false \ + seed=25041988 \ + nthreads="$BEAGLE_NUM_THREAD" \ + || dieWith "Non zero exit status while phasing with Beagle in phasing.sh" + +$PYTHON_BINARY "$TOOL_BEAGLE_EMBED_HAPLOTYPES_VCF" \ + --hap_file "$PHASED_TWOSAMPLES.vcf.gz" \ + --vcf_file "$UNPHASED" \ + --out_file "$tmpPhased" \ + || dieWith "Non zero exit status while embedding haplotypes in phasing.sh" + +$PYTHON_BINARY "$TOOL_GROUP_HAPLOTYPES" \ + --infile "$tmpPhased" \ + --out "$tmpHaploblocks" \ + --minHT "$minHT" \ + || dieWith "Non zero exit status while grouping haplotypes in phasing.sh" + +mv "$tmpPhased" "$FILENAME_PHASED_GENOTYPES" +mv "$tmpHaploblocks" "$FILENAME_HAPLOBLOCK_GROUPS" diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/phasing_X.sh b/resources/analysisTools/copyNumberEstimationWorkflow/phasing_X.sh new file mode 100755 index 0000000..0ad332f --- /dev/null +++ b/resources/analysisTools/copyNumberEstimationWorkflow/phasing_X.sh @@ -0,0 +1,88 @@ +#!/bin/bash +set -ue -o pipefail +# Copyright (c) 2017 The ACEseq workflow developers. +# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). + +source "$TOOL_BASH_LIB" + +if [[ "$isNoControlWorkflow" == false ]]; then + source "$TOOL_ANALYZE_BAM_HEADER" + getRefGenomeAndChrPrefixFromHeader "$FILE_CONTROL_BAM" # Sets CHR_PREFIX and REFERENCE_GENOME +fi + +CHR_NAME=X +CHR_NR="$CHR_PREFIX$CHR_NAME" + +#DEFINE FILENAMES +UNPHASED="$FILE_UNPHASED_PRE$CHR_NAME.$FILE_VCF_SUF" +UNPHASED_TWOSAMPLES="$FILE_UNPHASED_PRE${CHR_NAME}_2samples.$FILE_VCF_SUF" +PHASED_TWOSAMPLES="$FILE_PHASED_GENOTYPE${CHR_NAME}_2samples" +tmpPhased="${FILENAME_PHASED_GENOTYPES}_tmp" #These two files should have 23 as chromosomes name rather than 'X' +tmpHaploblocks="${FILENAME_HAPLOBLOCK_GROUPS}_tmp" + + +#check for patient sex and exit if male +if grep -Pv 'female|klinefelter' "$FILENAME_SEX" + then + echo "Patient $PID is male." + echo " " >"$FILENAME_HAPLOBLOCK_GROUPS" + echo "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_control_$PID" >"$FILENAME_PHASED_GENOTYPES" + exit 0 + fi + +if [[ "$isNoControlWorkflow" == false ]] +then + + $BCFTOOLS_BINARY mpileup $CNV_MPILEUP_OPTS -O u \ + -f "$REFERENCE_GENOME" \ + -r "$CHR_NR" \ + "$FILE_CONTROL_BAM" \ + | \ + $BCFTOOLS_BINARY call $BCFTOOLS_OPTS - \ + > "$UNPHASED" \ + || dieWith "Non zero exit status for mpileup in phasing_X.sh" + +fi + +echo -n > "$UNPHASED_TWOSAMPLES" +echo -n > "$tmpPhased" +echo -n > "$tmpHaploblocks" + +$PYTHON_BINARY "$TOOL_BEAGLE_CREATE_FAKE_SAMPLES" \ + --in_file "$UNPHASED"\ + --out_file "$UNPHASED_TWOSAMPLES" \ + || dieWith "Non zero exit status while creating 2nd sample in vcf-file in phasing_X.sh" + +#create sample_g file +echo "ID_1 ID_2 missing sex" > "$FILE_SAMPLE_G" +echo "0 0 0 D" >> "$FILE_SAMPLE_G" +echo "$PID $PID 0 2" >> "$FILE_SAMPLE_G" + +export JAVA_OPTIONS=$BEAGLE_JAVA_OPTS +$BEAGLE_BINARY \ + gt="$UNPHASED_TWOSAMPLES" \ + ref="$BEAGLE_REFERENCE_FILE_X" \ + out="$PHASED_TWOSAMPLES" \ + map="$BEAGLE_GENETIC_MAP_X" \ + impute=false \ + seed=25041988 \ + nthreads="$BEAGLE_NUM_THREAD" \ + || dieWith "Non zero exit status while phasing with Beagle in phasing_X.sh" + + +$PYTHON_BINARY "$TOOL_BEAGLE_EMBED_HAPLOTYPES_VCF" \ + --hap_file "$PHASED_TWOSAMPLES.vcf.gz" \ + --vcf_file "$UNPHASED" \ + --out_file "$tmpPhased" \ + || dieWith "Non zero exit status while embedding haplotypes in phasing_X.sh" + + +$PYTHON_BINARY "$TOOL_GROUP_HAPLOTYPES" \ + --infile "$tmpPhased" \ + --out "$tmpHaploblocks" \ + --minHT "$minHT" \ + || dieWith "Non zero exit status while grouping haplotypes in phasing_X.sh" + + +mv "$tmpPhased" "$FILENAME_PHASED_GENOTYPES" +mv "$tmpHaploblocks" "$FILENAME_HAPLOBLOCK_GROUPS" diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/plots.sh b/resources/analysisTools/copyNumberEstimationWorkflow/plots.sh index 416d620..5971389 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/plots.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/plots.sh @@ -17,7 +17,8 @@ ${RSCRIPT_BINARY} --vanilla "${TOOL_GENERATE_PLOTS}" \ --sv_YN $SV \ --ID ${PID} \ --pipelineDir `dirname ${TOOL_GENERATE_PLOTS}` \ - --ymaxcov_threshold ${ymaxcov_threshold} + --ymaxcov_threshold ${ymaxcov_threshold} \ + --annotatePlotsWithGenes ${annotatePlotsWithGenes} if [[ "$?" != 0 ]] diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_all.R b/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_all.R index 41249c1..3ab4020 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_all.R +++ b/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_all.R @@ -128,8 +128,10 @@ fit = segmentByPairedPSCBS(data, knownSegments = knownSegments, tbn = FALSE, if (h != 0) { fit = pruneByHClust(fit, h = h, merge = TRUE, update = TRUE, verbose = -10) } +cat("Finished segmentByPairedPSCBS") segments = getSegments(fit, simplify = TRUE) +cat("Finished getSegments") #round coordinates with .5 to closest integer (down for end, up dor start) if( any( segments$start == -Inf, na.rm=TRUE ) ){ diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots.R b/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots.R index 9069332..732ed6e 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots.R +++ b/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots.R @@ -19,7 +19,8 @@ spec <- matrix(c('SNPfile', 'f', 1, "character", 'sv_YN', 'y', 1, "character", 'ID', 'i', 1, "character", 'pipelineDir', 'd', 1, "character", - 'ymaxcov_threshold', 't', 1, "numeric" + 'ymaxcov_threshold', 't', 1, "numeric", + 'annotatePlotsWithGenes', 'a', 1, "character" ), ncol = 4, byrow = TRUE) @@ -62,6 +63,12 @@ if (sv_YN == "true") { sv = NULL } +if (annotatePlotsWithGenes == "true") { + annotatePlotsWithGenes = T +} else { + annotatePlotsWithGenes = F +} + segments = read.table(segments, sep = "\t", header = TRUE, as.is = TRUE, stringsAsFactors = TRUE) segAll = data.frame(segments) combi = segAll @@ -97,7 +104,7 @@ if (chrCount > maxChrCount) { #function wrappers to get chromosome wise plots of TCN, dh and BAF -plotChromosomes = function (chrom, dat, comb, TCC, ploi , roundPloi, svPoints=NULL, secondChoicePloidyFilnameAddition="") { +plotChromosomes = function (chrom, dat, comb, TCC, ploi , roundPloi, svPoints=NULL, secondChoicePloidyFilnameAddition="", annotatePlotsWithGenes=F) { #don't plot if data frame is empty if (nrow(dat)<1) return(NULL) @@ -119,7 +126,7 @@ plotChromosomes = function (chrom, dat, comb, TCC, ploi , roundPloi, svPoints=N # maxCov = 2*ploi maxCov = max(comb$tcnMean, na.rm= TRUE) - p1 <- plotTCN( chrom, ratio, segs, ploi, TCC, roundPloi, chrL, ymaxcov=maxCov, svSub=svSub, ymaxcov_threshold=ymaxcov_threshold) + labs(x=NULL) + p1 <- plotTCN( chrom, ratio, segs, ploi, TCC, roundPloi, chrL, ymaxcov=maxCov, svSub=svSub, ymaxcov_threshold=ymaxcov_threshold, annotatePlotsWithGenes=annotatePlotsWithGenes) + labs(x=NULL) p2 <- plotDHmeans( segs, chrL ) + theme( axis.text.x=element_blank() ) X = which( (ratio$betaN > 0.3) & (ratio$betaN < 0.7) ) @@ -208,7 +215,7 @@ plotAll <- function(dat, comb, ploi, TCC, roundPloi, chrCount, secondChoicePloid if (index == 1) { # Plot combined 'coverage' and 'rawBAF' (only once, not the same plot for all pp solutions) chrLengthTab = chrLength - colnames(chrLengthTab) <- c("chromosome", "length", "info") + colnames(chrLengthTab)[1:2] <- c("chromosome", "length") chrLengthTab$chromosome <- as.numeric(chrLengthTab$chromosome) chrLengthTab <- chrLengthTab[order(chrLengthTab$chromosome),] plotDir = paste0(outDir,"/plots") @@ -233,7 +240,7 @@ plotAll <- function(dat, comb, ploi, TCC, roundPloi, chrCount, secondChoicePloid plotTitle <- textGrob( paste0(ID,", sex=",sex, "")) p = arrangeGrob(plotTitle, coveragePlot, p3, nrow=3, heights=c(1,7,7)) - fileName= paste0( outfile, "_CovBaf.png" ) + fileName = paste0(plotDir,"/",ID,"_CovBaf.png" ) ggplot2::ggsave( fileName, p, width=15, height=7, type='cairo') } @@ -277,7 +284,7 @@ for( index in seq_len( nrow(pp) ) ) { dataList.tmp = lapply( 1:chrCount, function(chr){ cat("Plotting chromosome ",chr, "...\n") dataList.chr <- completeSNP( chr, dataList[[chr]], ploidy, tcc, roundPloidy) - plotChromosomes( chr, dataList.chr, combi.tmp, tcc, ploidy, roundPloidy, svPoints=sv) + plotChromosomes( chr, dataList.chr, combi.tmp, tcc, ploidy, roundPloidy, svPoints=sv, secondChoicePloidyFilnameAddition=secondChoicePloidyFilnameAddition, annotatePlotsWithGenes=annotatePlotsWithGenes) return(dataList.chr) } ) @@ -285,7 +292,7 @@ for( index in seq_len( nrow(pp) ) ) { cat("plotting All chromosomes...\n") #genome wide plot - plotAll(dataList.tmp, comb=combi.tmp, ploidy, tcc, roundPloidy, chrCount, index=index) + plotAll(dataList.tmp, comb=combi.tmp, ploidy, tcc, roundPloidy, chrCount, secondChoicePloidyFilnameAddition=secondChoicePloidyFilnameAddition, index=index) #remove tmp data.frame to free memory and use garbage collection dataList.tmp <- NULL tmp <- NULL diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots_functions.R b/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots_functions.R index 586bfa5..20848d3 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots_functions.R +++ b/resources/analysisTools/copyNumberEstimationWorkflow/pscbs_plots_functions.R @@ -6,16 +6,22 @@ require(ggbio) require(ggplot2) require(gridExtra) require(grid) +library(GenomicRanges) #plot TCNs #chr is the chromosome number #ratio: data.frame; subset (chromosome) of dataAll dataframe containing all SNPs #seg: data.frame; subset of combi which contains all segments for sample -plotTCN = function (chromosome, ratio, seg, Ploidy, tcc, fullPloidy, chrLen, ymaxcov, plots='single', svSub=NULL, p = NULL, ymaxcov_threshold) { +plotTCN = function (chromosome, ratio, seg, Ploidy, tcc, fullPloidy, chrLen, ymaxcov, plots='single', svSub=NULL, p = NULL, ymaxcov_threshold, geneAnnotations = NA, unconditionalGeneLabeling = F, annotatePlotsWithGenes=F) { ymaxDH = 1 xtotal = chrLen/ 10 len = chrLen/1000000 + + if (plots == 'single' & is.data.frame(geneAnnotations)){ + seg.gr = makeGRangesFromDataFrame(seg, keep.extra.columns = T) + } + # scale values seg$start <- (seg$start/10)/xtotal @@ -40,14 +46,14 @@ plotTCN = function (chromosome, ratio, seg, Ploidy, tcc, fullPloidy, chrLen, yma colScale + theme_bw() } # allele specific and total copy numbers: c1Mean, c2Mean and tcnMean - p <- p + geom_segment( data=seg, aes( x=start, xend=end, y=c1Mean, yend=c1Mean ), colour="#00BFFFFF", size = 1, na.rm=TRUE ) - p <- p + geom_segment( data=seg, aes( x=start,xend=end, y=c2Mean, yend=c2Mean ), colour="#00BFFFFF", size = 1, na.rm =TRUE ) - p <- p + geom_segment( data=seg, aes( x=start,y=tcnMean, xend=end, yend=tcnMean ), colour="#0000CDFF", size = 1 ) + p <- p + geom_segment( data=seg, aes( x=start, xend=end, y=c1Mean, yend=c1Mean ), colour="#00BFFFFF", size = 1, na.rm=TRUE ) + p <- p + geom_segment( data=seg, aes( x=start, xend=end, y=c2Mean, yend=c2Mean ), colour="#00BFFFFF", size = 1, na.rm =TRUE ) + p <- p + geom_segment( data=seg, aes( x=start, y=tcnMean, xend=end, yend=tcnMean ), colour="#0000CDFF", size = 1 ) # labs, title, boundaries p <- p + theme( legend.position="none", panel.grid=element_blank() ) p <- p + xlab('') + ylab('TCN') + xlim( c(0,1) ) - p <- p + scale_y_continuous(limits=c(0, ymaxcov + 1.2), breaks=seq(0, ymaxcov+1.2, 2), labels=as.character( seq( 0, (ymaxcov+1.2), 2 ) ) ) + p <- p + scale_y_continuous(limits=c(0, ymaxcov + 1.2), breaks=seq(0, ymaxcov+1.2, 2), labels=as.character( seq( 0, (ymaxcov+1.2), 2 ) ) ) p <- p + theme( title = element_text(size=15), axis.title = element_text(size=12) ) @@ -69,10 +75,47 @@ plotTCN = function (chromosome, ratio, seg, Ploidy, tcc, fullPloidy, chrLen, yma if (plots == 'single'){ # segment boundaries p <- p + geom_vline( xintercept=seg$start, col = "#000000DD", lty=5, lwd =0.2 ) - p <- p + geom_vline( xintercept=seg$end, col = "#000000DD", lty=5, lwd =0.2 ) + p <- p + geom_vline( xintercept=seg$end, col = "#000000DD", lty=5, lwd =0.2 ) # add axis p <- p + scale_x_continuous( breaks=pretty(1:len, n=10)/len, labels=pretty(1:len, n=10) ) + # gene annotation + if (annotatePlotsWithGenes & is.data.frame(geneAnnotations)) { + genesSubset = geneAnnotations[geneAnnotations$chr==chromosome,] + genesSubset = genesSubset[order(genesSubset$start),] + + if (nrow(genesSubset)>0) { + genesSubset$yOffset = 0 + # determine yPos offset + if (nrow(genesSubset)>1) { + genesSubset$distance = c(NA,sapply(2:nrow(genesSubset), function(j) { + genesSubset[(j),"start"] - genesSubset[(j-1),"start"] + })) + genesSubset$yHasOffset = genesSubset$distance < 13e6 + genesSubset$yHasOffset[is.na(genesSubset$yHasOffset)] = F + for (j in 2:nrow(genesSubset)) { + if (genesSubset[j,"yHasOffset"]) { + genesSubset[j,"yOffset"] = genesSubset[j-1,"yOffset"]+1 + } + } + } + + genesSubset$position = (genesSubset$start+(genesSubset$end - genesSubset$start)/2)/10/xtotal + + if (!unconditionalGeneLabeling) { + genesSubset.gr = makeGRangesFromDataFrame(genesSubset, keep.extra.columns = T) + merged = mergeByOverlaps(genesSubset.gr, seg.gr) + eventsPerGene.list = aggregate(merged$CNA.type, by=list(merged$gene), FUN=paste)$x + index.affectedSegments = grep(pattern = "DEL|LOH|DUP|HomoDel", x = eventsPerGene.list, perl = T) + genesSubset = as.data.frame(genesSubset[index.affectedSegments,]) + } + if (nrow(genesSubset)>0) { + p <- p + geom_vline( xintercept=genesSubset$position, col = "grey80", lty=5, lwd =0.2 ) + p <- p + geom_text(data=genesSubset, aes(x=position, y=ymaxcov+0.2-0.08*ymaxcov*yOffset, label = gene), cex = 4, col = "red") + } + } + } + # sv segments if ( is.data.frame(svSub) ) { diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/purity_ploidy_estimation_final.R b/resources/analysisTools/copyNumberEstimationWorkflow/purity_ploidy_estimation_final.R index bc7d2a6..b7c4661 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/purity_ploidy_estimation_final.R +++ b/resources/analysisTools/copyNumberEstimationWorkflow/purity_ploidy_estimation_final.R @@ -310,6 +310,11 @@ for (ploidy in posPloidies) { if (length(big) >0){ dh_matrix[, i] = max(as.numeric(DHmatrix[big, i]), na.rm=TRUE) } + } else if (dh_Stop == "quantile") { + big = which(TCNmatrix[, 1] == 2 & DHmatrix[, 1] >= min_length_dh_stop) + if (length(big) >0){ + dh_matrix[, i] = quantile(as.numeric(DHmatrix[big, i]), na.rm=TRUE, 0.975) + } } else if (dh_Stop == "mean") { dh_matrix[, i] = mean(as.numeric(DHmatrix[pos, i]), na.rm=TRUE) } @@ -530,11 +535,12 @@ Ndist_TCN = matrix(data = NMEANmatrix[1, (2:Nmatrix_col)], nrow = length(posPloi colnames(Ndist_TCN) = posPurities rownames(Ndist_TCN) = posPloidies -NTCN = matrix(data = Ntcn_matrix[1, 2:matrix_col], nrow = length(posPloidies), ncol = length(posPurities), byrow = TRUE, dimnames = NULL) +NTCN = matrix(data = Ntcn_matrix[1, 2:matrix_col], nrow = length(posPloidies), ncol = length(posPurities), byrow = TRUE, dimnames = NULL) +limitDHOrig = limitDH # max(which(NTCN)) because it is the first value to be >1 no lower purities than that should be allowed -# that means the distance has bee significantly different from 0 +# that means the distance has been significantly different from 0 limitNTCN = rep(NA, length(posPloidies)) for (i in seq_along(posPloidies)) { if (any(NTCN>(1+1e-8))){ @@ -625,6 +631,11 @@ data[seq(2, nrow(data) - 1, 1), seq(2, ncol(data) - 1, 1)] = data_limit data[which(is.na(data))] = 6 + if (all(data==6)) { + cat("\n\nERROR: Did not find a single solution due to 'high purity issue'\n\n") + quit(save = "no", status = 13) + } + count = 0 ploidy_idx = c() purity_idx = c() @@ -750,10 +761,20 @@ for (l in seq_along(limitDH)) { plotPurities[l] = posPurities[limitDH[l]] } +plotPurities_DH = c() +for (l in seq_along(limitDHOrig)) { + plotPurities_DH[l] = posPurities[limitDHOrig[l]] +} + +plotPurities_NTCN = c() +for (l in seq_along(limitNTCN)) { + plotPurities_NTCN[l] = posPurities[limitNTCN[l]] +} + testPlot = melt(data.frame(COMBI_DIST_TCN)) ploi = rep(posPloidies, length(posPurities)) testPlot$ploidy = ploi -df.lines = data.frame(ploidy = ploi, purity = plotPurities) +df.lines = data.frame(ploidy = ploi, purity = plotPurities, purity_DH=plotPurities_DH, purity_NTCN=plotPurities_NTCN) names(testPlot) = c("purity", "distance", "ploidy") testPlot$purity = substring(testPlot$purity, 2, 5) testPlot$purity = as.numeric(testPlot$purity) @@ -792,7 +813,9 @@ erupt = ggplot(testPlot, aes(purity, ploidy, z = distance)) + scale_y_continuous(expand = c(0, 0)) print( erupt + scale_fill_gradient2(limits = c(0, mean(testPlot$distance[which(!is.na(testPlot$distance))])), midpoint = mean(testPlot$distance[which(!is.na(testPlot$distance))]) / 2, low = "darkred", high = "darkblue", na.value = "darkblue") + theme(axis.text.x = element_text(vjust = 0.5, size = 14, color = "black"), axis.text.y = element_text(vjust = 0.5, size = 14, color = "black"), axis.title.x = element_text(color = "black", size = 16, vjust = 2, face = "bold"), axis.title.y = element_text(color = "black", size = 16, face = "bold")) + - geom_point(x = df.lines$purity, y = df.lines$ploidy, size = 1.2, color = "darkred") + + # geom_point(x = df.lines$purity, y = df.lines$ploidy, size = 1.2, color = "darkred") + + geom_point(x = df.lines$purity_NTCN, y = df.lines$ploidy, size = 1.2, color = "green") + + geom_point(x = df.lines$purity_DH, y = df.lines$ploidy, size = 1.2, color = "yellow") + geom_point(x = df.opt$purity, y = df.opt$ploidy, size = 6, color = "black", shape = 8) + xlab("tumor cell content") + theme(legend.text = element_text(colour = "black", size = 16), legend.title = element_text(colour = "black", size = 16, face = "bold")) ) diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/removeBreakpoints.py b/resources/analysisTools/copyNumberEstimationWorkflow/removeBreakpoints.py index a31711d..fe483eb 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/removeBreakpoints.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/removeBreakpoints.py @@ -1,7 +1,7 @@ -#!/usr/bin/python - -# Copyright (c) 2017 The ACEseq workflow developers. -# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). +#!/usr/bin/env python + +# Copyright (c) 2017 The ACEseq workflow developers. +# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). import numpy import argparse diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data.py b/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data.py index 268206d..f6bd71d 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data_homodel.py b/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data_homodel.py index 1e366b1..71daab7 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data_homodel.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/segments_to_data_homodel.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/smoothData.py b/resources/analysisTools/copyNumberEstimationWorkflow/smoothData.py index b8c1c0a..8425490 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/smoothData.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/smoothData.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). @@ -235,6 +235,6 @@ def merge_first_segment(prior_line, newline): newline=None #print last line(s) - out.write( "\t".join( str(prior_line[key]) for key in infile.header ) + "\n" ) + out.write( "\t".join( str(prior_line[0][key]) for key in infile.header ) + "\n" ) if newline: out.write( "\t".join( str(newline[key]) for key in infile.header ) + "\n" ) diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/smoothDataMergeSimilar.py b/resources/analysisTools/copyNumberEstimationWorkflow/smoothDataMergeSimilar.py index 22b262a..8bd61d9 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/smoothDataMergeSimilar.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/smoothDataMergeSimilar.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv.py b/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv.py index ab345c0..9056961 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Copyright (c) 2017 The ACEseq workflow developers. # Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv_all.py b/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv_all.py index 56d0b62..9b13d20 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv_all.py +++ b/resources/analysisTools/copyNumberEstimationWorkflow/snp_cnv_all.py @@ -1,7 +1,7 @@ -#!/usr/bin/python - -# Copyright (c) 2017 The ACEseq workflow developers. -# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). +#!/usr/bin/env python + +# Copyright (c) 2017 The ACEseq workflow developers. +# Distributed under the MIT License (license terms are at https://www.github.com/eilslabs/ACEseqWorkflow/LICENSE.txt). ####################################### # snp_cnv.py diff --git a/resources/analysisTools/copyNumberEstimationWorkflow/snvMergeFilter.sh b/resources/analysisTools/copyNumberEstimationWorkflow/snvMergeFilter.sh index 86a79ff..b728ce0 100755 --- a/resources/analysisTools/copyNumberEstimationWorkflow/snvMergeFilter.sh +++ b/resources/analysisTools/copyNumberEstimationWorkflow/snvMergeFilter.sh @@ -25,7 +25,7 @@ then exit 2 fi -${TABIX_BINARY} -s 1 -b 2 -e 2 $tmp_snp_filename +${TABIX_BINARY} -s 1 -b 2 -e 2 --comment chr $tmp_snp_filename if [[ "$?" != 0 ]] then diff --git a/resources/configurationFiles/analysisCopyNumberEstimation.xml b/resources/configurationFiles/analysisCopyNumberEstimation.xml index c88ed3b..87d8d5b 100755 --- a/resources/configurationFiles/analysisCopyNumberEstimation.xml +++ b/resources/configurationFiles/analysisCopyNumberEstimation.xml @@ -8,18 +8,21 @@ configurationType='analysis' class='de.dkfz.roddy.core.Analysis' workflowClass='ACESeqWorkflow' runtimeServiceClass='de.dkfz.b080.co.common.BasicCOProjectsRuntimeService' imports="commonCOWorkflowsSettings" - listOfUsedTools="mergeAndFilterSnpFiles,cnvSnpGeneration,annotateCnvFiles,imputeGenotypes,imputeGenotypes_X,addHaplotypesToSnpFile,getBreakpoints,mergeBreakpointsAndSv,getSegmentsAndSnps,markHomozygousDeletions" + listOfUsedTools="mergeAndFilterSnpFiles,cnvSnpGeneration,annotateCnvFiles,phaseGenotypes,phaseGenotypes_X,addHaplotypesToSnpFile,getBreakpoints,mergeBreakpointsAndSv,getSegmentsAndSnps,markHomozygousDeletions" usedToolFolders="copyNumberEstimationWorkflow,tools"> - + - + + + + @@ -32,17 +35,13 @@ - - - - - - + + @@ -50,16 +49,17 @@ - - - - - - - - - + + + + + + + + + + + @@ -71,14 +71,13 @@ - + - @@ -100,7 +99,7 @@ - + @@ -110,20 +109,11 @@ - - - - - - - - - - - - - - + + + + + @@ -132,9 +122,9 @@ - - @@ -169,7 +159,7 @@ - + @@ -192,7 +182,7 @@ - + - + @@ -243,6 +233,7 @@ + @@ -251,7 +242,10 @@ - + + + + @@ -293,7 +287,7 @@ - + @@ -318,7 +312,7 @@ - + @@ -330,7 +324,7 @@ - + @@ -338,7 +332,7 @@ - + @@ -347,7 +341,7 @@ - + @@ -356,7 +350,7 @@ - + @@ -373,15 +367,15 @@ - + - + - + @@ -390,9 +384,9 @@ - + - + @@ -402,7 +396,7 @@ - + @@ -413,7 +407,7 @@ - + @@ -428,7 +422,7 @@ - + @@ -474,7 +468,7 @@ - + @@ -486,7 +480,7 @@ - + @@ -497,7 +491,7 @@ - + @@ -519,7 +513,7 @@ - + @@ -528,7 +522,7 @@ - + @@ -551,7 +545,7 @@ - + @@ -617,50 +611,50 @@ - - - - - - - - - + + + + + + + + + + pattern='${phasingOutputDirectory}/${cvalue, name="unphasedGenotypesPrefix", default="unphased_chr"}${fgindex}.${cvalue,name="unphasedGenotypesSuffix",default="vcf"}'/> diff --git a/resources/configurationFiles/analysisCopyNumberEstimation_GRCh38.xml b/resources/configurationFiles/analysisCopyNumberEstimation_GRCh38.xml new file mode 100644 index 0000000..cb005e9 --- /dev/null +++ b/resources/configurationFiles/analysisCopyNumberEstimation_GRCh38.xml @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/de/dkfz/b080/co/CopyNumberEstimationWorkflowPlugin.java b/src/de/dkfz/b080/co/CopyNumberEstimationWorkflowPlugin.java index 6a8580c..2282b63 100755 --- a/src/de/dkfz/b080/co/CopyNumberEstimationWorkflowPlugin.java +++ b/src/de/dkfz/b080/co/CopyNumberEstimationWorkflowPlugin.java @@ -14,8 +14,8 @@ */ public class CopyNumberEstimationWorkflowPlugin extends BasePlugin { - public static final String CURRENT_VERSION_STRING = "5.0.0"; - public static final String CURRENT_VERSION_BUILD_DATE = "Fri Oct 12 14:45:18 CEST 2018"; + public static final String CURRENT_VERSION_STRING = "6.0.0"; + public static final String CURRENT_VERSION_BUILD_DATE = "Fri Dec 13 15:26:35 CET 2019"; @Override public String getVersionInfo() { diff --git a/src/de/dkfz/b080/co/aceseq/ACESeqMethods.groovy b/src/de/dkfz/b080/co/aceseq/ACESeqMethods.groovy index 76a6f5e..c4cfa02 100755 --- a/src/de/dkfz/b080/co/aceseq/ACESeqMethods.groovy +++ b/src/de/dkfz/b080/co/aceseq/ACESeqMethods.groovy @@ -27,10 +27,7 @@ final class ACESeqMethods { static LinkedHashMap getGlobalJobSpecificParameters(Configuration config) { return new LinkedHashMap( (ACEseqConstants.CHR_NAME): config.configurationValues.getString(ACEseqConstants.CHR_NAME), - (ACEseqConstants.CHR_NR): config.configurationValues.getString(ACEseqConstants.CHR_NR), - (ACEseqConstants.GENETIC_MAP_FILE): config.configurationValues.getString(ACEseqConstants.GENETIC_MAP_FILE), - (ACEseqConstants.KNOWN_HAPLOTYPES_FILE): config.configurationValues.getString(ACEseqConstants.KNOWN_HAPLOTYPES_FILE), - (ACEseqConstants.KNOWN_HAPLOTYPES_LEGEND_FILE): config.configurationValues.getString(ACEseqConstants.KNOWN_HAPLOTYPES_LEGEND_FILE)) + (ACEseqConstants.CHR_NR): config.configurationValues.getString(ACEseqConstants.CHR_NR)) } static CnvSnpGeneratorResultByType generateCNVSNPs(BamFile tumorBam, BamFile controlBam) { @@ -55,37 +52,37 @@ final class ACESeqMethods { return new CnvSnpGeneratorResultByType(indexedFileObjects, tumorBam.getExecutionContext()) } - static ImputeGenotypeByChromosome imputeGenotypes(BamFile controlBam) { + static PhaseGenotypeByChromosome phaseGenotypes(BamFile controlBam) { IndexedFileObjects indexedFileObjects = ParallelizationHelper.runParallel( COConstants.CVALUE_AUTOSOME_INDICES, - ACEseqConstants.TOOL_IMPUTE_GENOTYPES, + ACEseqConstants.TOOL_PHASE_GENOTYPES, controlBam, null, ACEseqConstants.PARM_CHR_INDEX, getGlobalJobSpecificParameters(controlBam.executionContext.configuration)) - return new ImputeGenotypeByChromosome(indexedFileObjects, controlBam.getExecutionContext()) + return new PhaseGenotypeByChromosome(indexedFileObjects, controlBam.getExecutionContext()) } - static ImputeGenotypeByChromosome imputeGenotypes(UnphasedGenotypeFileGroupByChromosome unphasedGenotypeFiles) { + static PhaseGenotypeByChromosome phaseGenotypes(UnphasedGenotypeFileGroupByChromosome unphasedGenotypeFiles) { Map mapOfFiles = [:] mapOfFiles += unphasedGenotypeFiles.getFiles() mapOfFiles.remove("X") IndexedFileObjects indexedFileObjects = runParallel( - ACEseqConstants.TOOL_IMPUTE_GENOTYPES_NOMPILEUP, + ACEseqConstants.TOOL_PHASE_GENOTYPES_NOMPILEUP, new UnphasedGenotypeFileGroupByChromosome(mapOfFiles.keySet() as List, mapOfFiles, unphasedGenotypeFiles.getExecutionContext()), null, ACEseqConstants.PARM_CHR_INDEX, getGlobalJobSpecificParameters(unphasedGenotypeFiles.executionContext.configuration)) - return new ImputeGenotypeByChromosome(indexedFileObjects, unphasedGenotypeFiles.getExecutionContext()) + return new PhaseGenotypeByChromosome(indexedFileObjects, unphasedGenotypeFiles.getExecutionContext()) } - static Tuple2 imputeGenotypeX(GenderFile sexFile, BamFile controlBam) { - return (Tuple2) GenericMethod.callGenericTool(ACEseqConstants.TOOL_IMPUTE_GENOTYPEX, controlBam, sexFile); + static Tuple2 phaseGenotypeX(GenderFile sexFile, BamFile controlBam) { + return (Tuple2) GenericMethod.callGenericTool(ACEseqConstants.TOOL_PHASE_GENOTYPEX, controlBam, sexFile); } - static Tuple2 imputeGenotypeX(GenderFile sexFile, UnphasedGenotypeFileGroupByChromosome unphasedGenotypeFiles) { - return (Tuple2) GenericMethod.callGenericTool(ACEseqConstants.TOOL_IMPUTE_GENOTYPEX_NOMPILEUP, unphasedGenotypeFiles.getFiles().get("X"), sexFile); + static Tuple2 phaseGenotypeX(GenderFile sexFile, UnphasedGenotypeFileGroupByChromosome unphasedGenotypeFiles) { + return (Tuple2) GenericMethod.callGenericTool(ACEseqConstants.TOOL_PHASE_GENOTYPEX_NOMPILEUP, unphasedGenotypeFiles.getFiles().get("X"), sexFile); } static TextFile getGenotypes(TextFile mergedAndFilteredSNPFile) { diff --git a/src/de/dkfz/b080/co/aceseq/ACESeqWorkflow.java b/src/de/dkfz/b080/co/aceseq/ACESeqWorkflow.java index 5d6a6ef..c0cb3d9 100644 --- a/src/de/dkfz/b080/co/aceseq/ACESeqWorkflow.java +++ b/src/de/dkfz/b080/co/aceseq/ACESeqWorkflow.java @@ -67,20 +67,20 @@ public boolean execute(ExecutionContext context, BasicBamFile _bamControlMerged, return true; TextFile mergedAndFilteredSNPFile = resultByType.getPositionFiles().mergeAndFilter(); - ImputeGenotypeByChromosome imputedGenotypeByChromosome; + PhaseGenotypeByChromosome phasedGenotypeByChromosome; Tuple2 phasedGenotypeX; TextFile haplotypedSNPFile; if (isNoControlWorkflow()) { TextFile genotypeSNPFile = ACESeqMethods.getGenotypes(mergedAndFilteredSNPFile); UnphasedGenotypeFileGroupByChromosome unphasedGenotypeFile = ACESeqMethods.createUnphased(genotypeSNPFile); - imputedGenotypeByChromosome = ACESeqMethods.imputeGenotypes(unphasedGenotypeFile); - phasedGenotypeX = ACESeqMethods.imputeGenotypeX(annotationResult.getGenderFile(), unphasedGenotypeFile); - haplotypedSNPFile = ACESeqMethods.addHaploTypes(genotypeSNPFile, imputedGenotypeByChromosome.getPhasedSnpFiles(), phasedGenotypeX.value0); + phasedGenotypeByChromosome = ACESeqMethods.phaseGenotypes(unphasedGenotypeFile); + phasedGenotypeX = ACESeqMethods.phaseGenotypeX(annotationResult.getGenderFile(), unphasedGenotypeFile); + haplotypedSNPFile = ACESeqMethods.addHaploTypes(genotypeSNPFile, phasedGenotypeByChromosome.getPhasedSnpFiles(), phasedGenotypeX.value0); } else { - imputedGenotypeByChromosome = ACESeqMethods.imputeGenotypes(bamControlMerged); - phasedGenotypeX = ACESeqMethods.imputeGenotypeX(annotationResult.getGenderFile(), bamControlMerged); - haplotypedSNPFile = ACESeqMethods.addHaploTypes(mergedAndFilteredSNPFile, imputedGenotypeByChromosome.getPhasedSnpFiles(), phasedGenotypeX.value0); + phasedGenotypeByChromosome = ACESeqMethods.phaseGenotypes(bamControlMerged); + phasedGenotypeX = ACESeqMethods.phaseGenotypeX(annotationResult.getGenderFile(), bamControlMerged); + haplotypedSNPFile = ACESeqMethods.addHaploTypes(mergedAndFilteredSNPFile, phasedGenotypeByChromosome.getPhasedSnpFiles(), phasedGenotypeX.value0); TextFile baffile = ACESeqMethods.createControlBafPlot(haplotypedSNPFile, annotationResult.getGenderFile()); } diff --git a/src/de/dkfz/b080/co/aceseq/ACEseqConstants.java b/src/de/dkfz/b080/co/aceseq/ACEseqConstants.java index bc1132c..5637e53 100755 --- a/src/de/dkfz/b080/co/aceseq/ACEseqConstants.java +++ b/src/de/dkfz/b080/co/aceseq/ACEseqConstants.java @@ -23,10 +23,10 @@ public final class ACEseqConstants { public static final String TOOL_ANNOTATE_COV_WIN = "annotateCnvFiles"; public static final String TOOL_GET_GENOTYPES = "getGenotypes"; public static final String TOOL_CREATE_UNPHASED_GENOTYPE = "createUnphasedGenotype"; - public static final String TOOL_IMPUTE_GENOTYPES = "imputeGenotypes"; - public static final String TOOL_IMPUTE_GENOTYPES_NOMPILEUP = "imputeGenotypes_noMpileup"; - public static final String TOOL_IMPUTE_GENOTYPEX = "imputeGenotypes_X"; - public static final String TOOL_IMPUTE_GENOTYPEX_NOMPILEUP = "imputeGenotypes_X_noMpileup"; + public static final String TOOL_PHASE_GENOTYPES = "phaseGenotypes"; + public static final String TOOL_PHASE_GENOTYPES_NOMPILEUP = "phaseGenotypes_noMpileup"; + public static final String TOOL_PHASE_GENOTYPEX = "phaseGenotypes_X"; + public static final String TOOL_PHASE_GENOTYPEX_NOMPILEUP = "phaseGenotypes_X_noMpileup"; public static final String TOOL_ADD_HAPLOTYPES_TO_SNP_FILE = "addHaplotypesToSnpFile"; public static final String TOOL_CREATE_CONTROL_BAF_PLOTS = "createControlBafPlots"; public static final String TOOL_CORRECT_GC_BIAS = "correctGcBias"; @@ -48,9 +48,6 @@ public final class ACEseqConstants { public static final String PARM_CHR_INDEX = "PARM_CHR_INDEX"; public static final String CHR_NAME = "CHR_NAME"; public static final String CHR_NR = "CHR_NR"; - public static final String GENETIC_MAP_FILE = "GENETIC_MAP_FILE"; - public static final String KNOWN_HAPLOTYPES_FILE = "KNOWN_HAPLOTYPES_FILE"; - public static final String KNOWN_HAPLOTYPES_LEGEND_FILE = "KNOWN_HAPLOTYPES_LEGEND_FILE"; private ACEseqConstants() { } diff --git a/src/de/dkfz/b080/co/files/ImputeGenotypeByChromosome.java b/src/de/dkfz/b080/co/files/PhaseGenotypeByChromosome.java similarity index 89% rename from src/de/dkfz/b080/co/files/ImputeGenotypeByChromosome.java rename to src/de/dkfz/b080/co/files/PhaseGenotypeByChromosome.java index b62db4b..b002905 100755 --- a/src/de/dkfz/b080/co/files/ImputeGenotypeByChromosome.java +++ b/src/de/dkfz/b080/co/files/PhaseGenotypeByChromosome.java @@ -19,13 +19,13 @@ */ -public class ImputeGenotypeByChromosome extends IndexedFileObjects { +public class PhaseGenotypeByChromosome extends IndexedFileObjects { private PhasedGenotypeFileGroupByChromosome phasedSnpFiles; private HaploblockFileGroupByChromosome haploblockFiles; - public ImputeGenotypeByChromosome(IndexedFileObjects indexedFileObjects, ExecutionContext executionContext) { + public PhaseGenotypeByChromosome(IndexedFileObjects indexedFileObjects, ExecutionContext executionContext) { super(indexedFileObjects.getIndices(), indexedFileObjects.getIndexedFileObjects(), executionContext); Map _phasedSnpFiles = new LinkedHashMap<>(); Map _haploblockFiles = new LinkedHashMap<>();