Skip to content

Commit 9ce1cab

Browse files
authored
feat: add missing background data for GRCh38 (#5) (#37)
1 parent 400119f commit 9ce1cab

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+504
-187
lines changed

Snakefile

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ rule all:
7575
f"work/download/annos/grch37/seqvars/dbnsfp/{DV.dbnsfp}a/LICENSE.txt",
7676
f"work/download/annos/grch37/seqvars/dbnsfp/{DV.dbnsfp}c/LICENSE.txt",
7777
f"work/download/annos/grch37/seqvars/dbscsnv/{DV.dbscsnv}/dbscSNV{DV.dbscsnv}.chr1",
78-
f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp_37}/dbsnp.vcf.gz",
78+
f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz",
7979
"work/annos/grch37/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz",
8080
f"work/annos/grch37/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz",
8181
f"work/annos/grch37/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
@@ -85,7 +85,7 @@ rule all:
8585
f"work/download/annos/grch38/seqvars/cadd/{DV.cadd}/gnomad.genomes.r3.0.indel_inclAnno.tsv.gz",
8686
# NB: dbNSFP is dual reference (for download)
8787
# NB: dbscSNV is dual reference (for download)
88-
# TODO: "work/download/annos/grch38/seqvars/dbsnp/dbsnp.vcf.gz",
88+
f"work/download/annos/grch38/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz",
8989
"work/annos/grch38/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz",
9090
f"work/annos/grch38/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz",
9191
f"work/annos/grch38/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
@@ -99,31 +99,29 @@ rule all:
9999
f"work/annos/grch37/strucvars/g1k/{DV.g1k_svs}/g1k.bed.gz",
100100
f"work/annos/grch37/strucvars/gnomad/{DV.gnomad_sv}/gnomad_sv.bed.gz",
101101
# ---- GRCh38
102-
# TODO: "work/annos/grch38/strucvars/dbvar/dbvar.bed.gz",
103-
# TODO: f"work/annos/grch38/strucvars/dgv/{DV.dgv}/dgv.bed.gz",
104-
# TODO: f"work/annos/grch38/strucvars/dgv_gs/{DV.dgv_gs}/dgv_gs.bed.gz",
105-
# TODO: "work/annos/grch38/strucvars/gnomad/gnomad_sv.bed.gz",
102+
f"work/annos/grch38/strucvars/dbvar/{DV.dbvar}/dbvar.bed.gz",
103+
f"work/annos/grch38/strucvars/dgv/{DV.dgv}/dgv.bed.gz",
104+
f"work/annos/grch38/strucvars/dgv_gs/{DV.dgv_gs}/dgv_gs.bed.gz",
105+
# NB: gnomAD-SV GRCh38 was announced end of 2020 but not released yet
106106
# -- genome browser "features" (position-specific)
107107
# ---- GRCh37
108-
f"work/annos/grch37/features/cons/{DV.ucsc_cons}/ucsc_conservation.tsv",
108+
f"work/annos/grch37/features/cons/{DV.ucsc_cons_37}/ucsc_conservation.tsv",
109109
f"work/annos/grch37/features/ensembl/{DV.ensembl_37}/ensembl_genes.bed.gz",
110110
f"work/annos/grch37/features/refseq/{DV.refseq_37}/refseq_genes.bed.gz",
111-
"work/annos/grch37/features/tads/dixon2015/imr90.bed",
112111
"work/annos/grch37/features/tads/dixon2015/hesc.bed",
113-
f"work/annos/grch37/features/ucsc/{DV.ucsc_genomic_super_dups}/genomicSuperDups.bed.gz",
114-
f"work/annos/grch37/features/ucsc/{DV.ucsc_rmsk}/rmsk.bed.gz",
115-
f"work/annos/grch37/features/ucsc/{DV.ucsc_alt_seq_liftover}/altSeqLiftOverPsl.bed.gz",
116-
f"work/annos/grch37/features/ucsc/{DV.ucsc_fix_seq_liftover}/fixSeqLiftOverPsl.bed.gz",
112+
f"work/annos/grch37/features/ucsc/{DV.ucsc_genomic_super_dups_37}/genomicSuperDups.bed.gz",
113+
f"work/annos/grch37/features/ucsc/{DV.ucsc_rmsk_37}/rmsk.bed.gz",
114+
f"work/annos/grch37/features/ucsc/{DV.ucsc_alt_seq_liftover_37}/altSeqLiftOverPsl.bed.gz",
115+
f"work/annos/grch37/features/ucsc/{DV.ucsc_fix_seq_liftover_37}/fixSeqLiftOverPsl.bed.gz",
117116
# ---- GRCh38
118-
# TODO: "work/annos/grch38/features/cons/ucsc_conservation.tsv",
119-
# TODO: "work/annos/grch38/features/ensembl/ensembl_genes.bed.gz",
120-
# TODO: "work/annos/grch38/features/refseq/refseq_genes.bed.gz",
121-
# TODO: "work/annos/grch38/features/tads/imr90.bed",
122-
# TODO: "work/annos/grch38/features/tads/hesc.bed",
123-
# TODO: "work/annos/grch38/features/ucsc/genomicSuperDups.bed.gz",
124-
# TODO: "work/annos/grch38/features/ucsc/rmsk.bed.gz",
125-
# TODO: "work/annos/grch38/features/ucsc/altSeqLiftOverPsl.bed.gz",
126-
# TODO: "work/annos/grch38/features/ucsc/fixSeqLiftOverPsl.bed.gz",
117+
f"work/annos/grch38/features/cons/{DV.ucsc_cons_38}/ucsc_conservation.tsv",
118+
f"work/annos/grch38/features/ensembl/{DV.ensembl_38}/ensembl_genes.bed.gz",
119+
f"work/annos/grch38/features/refseq/{DV.refseq_38}/refseq_genes.bed.gz",
120+
"work/annos/grch38/features/tads/dixon2015/hesc.bed",
121+
f"work/annos/grch38/features/ucsc/{DV.ucsc_genomic_super_dups_38}/genomicSuperDups.bed.gz",
122+
f"work/annos/grch38/features/ucsc/{DV.ucsc_rmsk_38}/rmsk.bed.gz",
123+
f"work/annos/grch38/features/ucsc/{DV.ucsc_alt_seq_liftover_38}/altSeqLiftOverPsl.bed.gz",
124+
f"work/annos/grch38/features/ucsc/{DV.ucsc_fix_seq_liftover_38}/fixSeqLiftOverPsl.bed.gz",
127125

128126

129127
# ===============================================================================================

download_urls.yml

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
- url: https://ftp.ensembl.org/pub/current_README
22

3-
- comment: dbSNP listing for checking the version.
4-
url: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF
3+
- comment: The UCSC listing is used for checking the versions for GRCh37.
4+
url: https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database
55
excerpt_strategy:
66
strategy: no-excerpt
77
count: null
8-
9-
- comment: The UCSC listing is used for checking the versions.
10-
url: https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database
8+
- comment: The UCSC listing is used for checking the versions for GRCh38.
9+
url: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database
1110
excerpt_strategy:
1211
strategy: no-excerpt
1312
count: null
@@ -26,12 +25,19 @@
2625
count: 10000
2726
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/multiz100way/alignments/knownGene.exonAA.fa.gz
2827

29-
- url: https://compbio.med.harvard.edu/modencode/webpage/hic/IMR90_domains_hg19.bed
30-
- url: https://compbio.med.harvard.edu/modencode/webpage/hic/hESC_domains_hg19.bed
28+
- url: http://3dgenome.fsm.northwestern.edu/downloads/hg19.TADs.zip
29+
excerpt_strategy:
30+
strategy: no-excerpt
31+
count: null
32+
- url: http://3dgenome.fsm.northwestern.edu/downloads/hg38.TADs.zip
33+
excerpt_strategy:
34+
strategy: no-excerpt
35+
count: null
3136

3237
- url: https://helix-research-public.s3.amazonaws.com/mito/HelixMTdb_20200327.tsv
3338

3439
- url: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/00-All.vcf.gz
40+
- url: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz
3541

3642
- url: https://kircherlab.bihealth.org/download/CADD/v1.6/GRCh37/whole_genome_SNVs_inclAnno.tsv.gz
3743
- url: https://kircherlab.bihealth.org/download/CADD/v1.6/GRCh37/whole_genome_SNVs_inclAnno.tsv.gz.tbi
@@ -65,11 +71,25 @@
6571
strategy: head
6672
count: 10000
6773

74+
- url: https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt
75+
excerpt_strategy:
76+
strategy: no-excerpt
77+
count: null
78+
- url: https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gtf.gz
79+
excerpt_strategy:
80+
strategy: head
81+
count: 10000
82+
6883
- url: https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz
6984
excerpt_strategy:
7085
strategy: head
7186
count: 10000
7287

88+
- url: https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz
89+
excerpt_strategy:
90+
strategy: head
91+
count: 10000
92+
7393
- url: 'https://ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Attribute name = "ensembl_gene_id" /><Attribute name = "ensembl_transcript_id" /><Attribute name = "entrezgene_id" /><Attribute name = "external_gene_name" /></Dataset></Query>'
7494
- url: 'https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json'
7595
skip_upstream_check: true # does not work reliably in tests
@@ -100,6 +120,10 @@
100120
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/rmsk.txt.gz
101121
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/altSeqLiftOverPsl.txt.gz
102122
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/fixSeqLiftOverPsl.txt.gz
123+
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/genomicSuperDups.txt.gz
124+
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/rmsk.txt.gz
125+
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/altSeqLiftOverPsl.txt.gz
126+
- url: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/fixSeqLiftOverPsl.txt.gz
103127

104128
- comment: The dbVar release notes index is pulled for checking for latest release.
105129
url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/release_notes
@@ -109,8 +133,13 @@
109133
- url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/deletions/GRCh37.nr_deletions.tsv.gz
110134
- url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/duplications/GRCh37.nr_duplications.tsv.gz
111135
- url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/insertions/GRCh37.nr_insertions.tsv.gz
136+
- url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/deletions/GRCh38.nr_deletions.tsv.gz
137+
- url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/duplications/GRCh38.nr_duplications.tsv.gz
138+
- url: https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/insertions/GRCh38.nr_insertions.tsv.gz
112139
- url: http://dgv.tcag.ca/dgv/docs/GRCh37_hg19_variants_2020-02-25.txt
140+
- url: http://dgv.tcag.ca/dgv/docs/GRCh38_hg38_variants_2020-02-25.txt
113141
- url: http://dgv.tcag.ca/dgv/docs/DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3
142+
- url: http://dgv.tcag.ca/dgv/docs/DGV.GS.hg38.gff3
114143
- url: ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/cnv/exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed
115144
- url: https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/integrated_sv_map/ALL.wgs.integrated_sv_map_v2.20130502.svs.genotypes.vcf.gz
116145
- url: https://storage.googleapis.com/gcp-public-data--gnomad/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:80255d58ff142d5fdb2376e2af242af9ec61a8c46c792cdf32255f753e2ac683
3+
size 435707

excerpt-data/17f0d5f9c4671d95/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f0926751798217d37145c4902acfda3faebac9a3545605e6751ad6d36609903a
3+
size 61

excerpt-data/1a5f86b027a18653/IMR90_domains_hg19.bed

Lines changed: 0 additions & 3 deletions
This file was deleted.

excerpt-data/1a5f86b027a18653/url.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f9e1b1fd2f685f294ba08eb29d3b8ab3f3173601345e68feb574d09a42354cb1
3+
size 80454

excerpt-data/3a9b37b0518ee213/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:ba7faa26ba4ff7d478408256659502ae661f6ec80207c967dcd5a9b263dea85b
3+
size 176
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:8a92ff0a04acf8b1e29273a79c10929c623a35e31d81868c2371d44de19dd47c
3+
size 38785

excerpt-data/3b4ab5feb892b2b1/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:41054fa686d5102a56d0145b5caf89ec020add570a7b0d9cbd940392135e328d
3+
size 64

0 commit comments

Comments
 (0)