Skip to content

Commit 8d1542f

Browse files
authored
feat: binary conversion of sequence dbs with annonars and worker (#35) (#40)
1 parent 2f6deac commit 8d1542f

File tree

156 files changed

+7374
-284
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

156 files changed

+7374
-284
lines changed

.gitignore

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Ignore the workflow directories.
2-
work/
3-
output/
2+
/work/
3+
/output/
44

55
# Python
66
__pycache__

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ check-black:
5050
.PHONY: check-snakefmt
5151
check-snakefmt:
5252
snakefmt --check --diff --line-length 100 Snakefile
53-
snakefmt --check --diff --line-length 100 rules/*/*.smk rules/*/*/*.smk
53+
snakefmt --check --diff --line-length 100 rules/*/*/*.smk rules/*/*/*/*.smk
5454

5555
# Run Python linting with flake8.
5656
.PHONY: flake8
@@ -86,4 +86,4 @@ black:
8686
.PHONY: run-snakefmt
8787
run-snakefmt:
8888
snakefmt --line-length 100 Snakefile
89-
snakefmt --line-length 100 rules/*/*.smk rules/*/*/*.smk
89+
snakefmt --line-length 100 rules/*/*/*.smk rules/*/*/*/*.smk

Snakefile

Lines changed: 85 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,17 @@
66
# ``varfish-server-worker`` and is used in the backend for filtering and/or exposed to the
77
# user via a REST API.
88

9-
from varfish_db_downloader.data_versions import DATA_VERSIONS as DV
9+
from varfish_db_downloader.versions import DATA_VERSIONS as DV, PACKAGE_VERSIONS as PV
1010

1111
# The prefix to use for all shell commands.
1212
SHELL_PREFIX = "export LC_ALL=C; set -x -euo pipefail;"
1313
# Setup the shell prefix by default.
1414
shell.prefix(SHELL_PREFIX)
1515

16+
# Regular expression for genome release.
17+
RE_GENOME = r"grch(37|38)"
18+
# Regular expression for versions.
19+
RE_VERSION = r"\w+(\.\w+)*"
1620

1721
# ===============================================================================================
1822
# Test Mode
@@ -58,6 +62,8 @@ rule help:
5862
## all -- run all rules
5963
rule all:
6064
input:
65+
# == work directory =====================================================================
66+
#
6167
# genes
6268
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
6369
f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv",
@@ -76,20 +82,20 @@ rule all:
7682
f"work/download/annos/grch37/seqvars/dbnsfp/{DV.dbnsfp}c/LICENSE.txt",
7783
f"work/download/annos/grch37/seqvars/dbscsnv/{DV.dbscsnv}/dbscSNV{DV.dbscsnv}.chr1",
7884
f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz",
79-
"work/annos/grch37/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz",
85+
f"work/annos/grch37/seqvars/helixmtdb/{DV.helixmtdb}/helixmtdb.vcf.gz",
8086
f"work/annos/grch37/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz",
81-
f"work/annos/grch37/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
82-
f"work/annos/grch37/seqvars/gnomad_genomes/{DV.gnomad_v2}/.done",
87+
f"work/download/annos/grch37/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
88+
f"work/download/annos/grch37/seqvars/gnomad_genomes/{DV.gnomad_v2}/.done",
8389
# ---- GRCh38
8490
f"work/download/annos/grch38/seqvars/cadd/{DV.cadd}/whole_genome_SNVs_inclAnno.tsv.gz",
8591
f"work/download/annos/grch38/seqvars/cadd/{DV.cadd}/gnomad.genomes.r3.0.indel_inclAnno.tsv.gz",
8692
# NB: dbNSFP is dual reference (for download)
8793
# NB: dbscSNV is dual reference (for download)
8894
f"work/download/annos/grch38/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz",
89-
"work/annos/grch38/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz",
95+
f"work/annos/grch38/seqvars/helixmtdb/{DV.helixmtdb}/helixmtdb.vcf.gz",
9096
f"work/annos/grch38/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz",
91-
f"work/annos/grch38/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
92-
f"work/annos/grch38/seqvars/gnomad_genomes/{DV.gnomad_v3}/.done",
97+
f"work/download/annos/grch38/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
98+
f"work/download/annos/grch38/seqvars/gnomad_genomes/{DV.gnomad_v3}/.done",
9399
# -- background/population structural variants and annotations thereof
94100
# ---- GRCh37
95101
f"work/annos/grch37/strucvars/dbvar/{DV.dbvar}/dbvar.bed.gz",
@@ -122,38 +128,87 @@ rule all:
122128
f"work/annos/grch38/features/ucsc/{DV.ucsc_rmsk_38}/rmsk.bed.gz",
123129
f"work/annos/grch38/features/ucsc/{DV.ucsc_alt_seq_liftover_38}/altSeqLiftOverPsl.bed.gz",
124130
f"work/annos/grch38/features/ucsc/{DV.ucsc_fix_seq_liftover_38}/fixSeqLiftOverPsl.bed.gz",
131+
#
132+
# == output directory ===================================================================
133+
#
134+
# -- mehari data
135+
# ---- frequencies (via annonars)
136+
f"output/mehari/freqs-grch37-{DV.gnomad_v2}+{DV.gnomad_v2}+{DV.gnomad_mtdna}+{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
137+
f"output/mehari/freqs-grch38-{DV.gnomad_v3}+{DV.gnomad_v2}+{DV.gnomad_mtdna}+{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
138+
# -- varfish-server-worker data
139+
# ---- CADD
140+
f"output/worker/annos/seqvars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
141+
f"output/worker/annos/seqvars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
142+
# ---- dbSNP
143+
f"output/worker/annos/seqvars/dbsnp-grch37-{DV.dbsnp}+{PV.annonars}/rocksdb/IDENTITY",
144+
f"output/worker/annos/seqvars/dbsnp-grch38-{DV.dbsnp}+{PV.annonars}/rocksdb/IDENTITY",
145+
# ---- dbNSFP
146+
f"output/worker/annos/seqvars/dbnsfp-grch37-{DV.dbnsfp}a+{PV.annonars}/rocksdb/IDENTITY",
147+
f"output/worker/annos/seqvars/dbnsfp-grch38-{DV.dbnsfp}a+{PV.annonars}/rocksdb/IDENTITY",
148+
f"output/worker/annos/seqvars/dbnsfp-grch37-{DV.dbnsfp}c+{PV.annonars}/rocksdb/IDENTITY",
149+
f"output/worker/annos/seqvars/dbnsfp-grch38-{DV.dbnsfp}c+{PV.annonars}/rocksdb/IDENTITY",
150+
# ---- dbscSNV
151+
f"output/worker/annos/seqvars/dbscsnv-grch37-{DV.dbscsnv}+{PV.annonars}/rocksdb/IDENTITY",
152+
f"output/worker/annos/seqvars/dbscsnv-grch38-{DV.dbscsnv}+{PV.annonars}/rocksdb/IDENTITY",
153+
# ---- gnomAD mtDNA
154+
f"output/worker/annos/seqvars/gnomad-mtdna-grch37-{DV.gnomad_mtdna}+{PV.annonars}/rocksdb/IDENTITY",
155+
f"output/worker/annos/seqvars/gnomad-mtdna-grch38-{DV.gnomad_mtdna}+{PV.annonars}/rocksdb/IDENTITY",
156+
# ---- gnomAD exomes
157+
f"output/worker/annos/seqvars/gnomad-exomes-grch37-{DV.gnomad_v2}+{PV.annonars}/rocksdb/IDENTITY",
158+
f"output/worker/annos/seqvars/gnomad-exomes-grch38-{DV.gnomad_v2}+{PV.annonars}/rocksdb/IDENTITY",
159+
# ---- gnomAD genomes
160+
f"output/worker/annos/seqvars/gnomad-genomes-grch37-{DV.gnomad_v2}+{PV.annonars}/rocksdb/IDENTITY",
161+
f"output/worker/annos/seqvars/gnomad-genomes-grch38-{DV.gnomad_v3}+{PV.annonars}/rocksdb/IDENTITY",
162+
# ---- HelixMtDb
163+
f"output/worker/annos/seqvars/helixmtdb-grch37-{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
164+
f"output/worker/annos/seqvars/helixmtdb-grch38-{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
165+
# ---- UCSC conservation
166+
f"output/worker/annos/seqvars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
167+
f"output/worker/annos/seqvars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
125168

126169

127170
# ===============================================================================================
128171
# Modular Snakefile Includes
129172
# ===============================================================================================
130173

131174

175+
# -- work directory -----------------------------------------------------------------------------
132176
# Gene-related rules.
133-
include: "rules/genes/dbnsfp.smk"
134-
include: "rules/genes/ensembl.smk"
135-
include: "rules/genes/gnomad.smk"
136-
include: "rules/genes/hgnc.smk"
137-
include: "rules/genes/ncbi.smk"
177+
include: "rules/work/genes/dbnsfp.smk"
178+
include: "rules/work/genes/ensembl.smk"
179+
include: "rules/work/genes/gnomad.smk"
180+
include: "rules/work/genes/hgnc.smk"
181+
include: "rules/work/genes/ncbi.smk"
138182
# Reference sequence--related rules.
139-
include: "rules/reference/human.smk"
183+
include: "rules/work/reference/human.smk"
140184
# Features (position and not variant specific).
141-
include: "rules/annos/features/cons.smk"
142-
include: "rules/annos/features/ensembl.smk"
143-
include: "rules/annos/features/refseq.smk"
144-
include: "rules/annos/features/tads.smk"
145-
include: "rules/annos/features/ucsc.smk"
185+
include: "rules/work/annos/features/cons.smk"
186+
include: "rules/work/annos/features/ensembl.smk"
187+
include: "rules/work/annos/features/refseq.smk"
188+
include: "rules/work/annos/features/tads.smk"
189+
include: "rules/work/annos/features/ucsc.smk"
146190
# Sequence variants and annotations.
147-
include: "rules/annos/seqvars/cadd.smk"
148-
include: "rules/annos/seqvars/dbnsfp.smk"
149-
include: "rules/annos/seqvars/dbscsnv.smk"
150-
include: "rules/annos/seqvars/dbsnp.smk"
151-
include: "rules/annos/seqvars/gnomad_mtdna.smk"
152-
include: "rules/annos/seqvars/gnomad_nuclear.smk"
153-
include: "rules/annos/seqvars/helix.smk"
191+
include: "rules/work/annos/seqvars/cadd.smk"
192+
include: "rules/work/annos/seqvars/dbnsfp.smk"
193+
include: "rules/work/annos/seqvars/dbscsnv.smk"
194+
include: "rules/work/annos/seqvars/dbsnp.smk"
195+
include: "rules/work/annos/seqvars/gnomad_mtdna.smk"
196+
include: "rules/work/annos/seqvars/gnomad_nuclear.smk"
197+
include: "rules/work/annos/seqvars/helix.smk"
154198
# Structural variant related.
155-
include: "rules/annos/strucvars/dbvar.smk"
156-
include: "rules/annos/strucvars/dgv.smk"
157-
include: "rules/annos/strucvars/exac.smk"
158-
include: "rules/annos/strucvars/g1k.smk"
159-
include: "rules/annos/strucvars/gnomad.smk"
199+
include: "rules/work/annos/strucvars/dbvar.smk"
200+
include: "rules/work/annos/strucvars/dgv.smk"
201+
include: "rules/work/annos/strucvars/exac.smk"
202+
include: "rules/work/annos/strucvars/g1k.smk"
203+
include: "rules/work/annos/strucvars/gnomad.smk"
204+
# -- output directory ---------------------------------------------------------------------------
205+
include: "rules/output/mehari/freqs.smk"
206+
include: "rules/output/worker/cadd.smk"
207+
include: "rules/output/worker/dbsnp.smk"
208+
include: "rules/output/worker/dbnsfp.smk"
209+
include: "rules/output/worker/dbscsnv.smk"
210+
include: "rules/output/worker/gnomad_mtdna.smk"
211+
include: "rules/output/worker/gnomad_exomes.smk"
212+
include: "rules/output/worker/gnomad_genomes.smk"
213+
include: "rules/output/worker/helix.smk"
214+
include: "rules/output/worker/cons.smk"

environment.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,6 @@ dependencies:
3939
- samtools =1.16
4040
# Parallel (de)compression.
4141
- pigz
42+
# Varfish/Mehari/Annonars related
43+
- annonars =0.7.0
44+
- varfish-server-worker
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:1c2de33ed9b4a0a8733ec18c7626583fb888fced90555001ad86db1e578f2deb
3-
size 89527
2+
oid sha256:3f8010eef34130590d7aa24055fcf7910e2182a06ae87daa2b0cf950a45c5c19
3+
size 152
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:50001e32b9c77a7bad932c8aff89d3adc99419a338fc10e04a97d5e44a87e61d
3-
size 169578
2+
oid sha256:6660483ea6f7e90eb11af1cdf113b8520fa49e1006cc1d9e1c518eb135845eca
3+
size 115
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:e334ca666a53dbe51cdf3df5e9239a94485712797d622b6d6084afcff2ff5736
3-
size 83137
2+
oid sha256:09d83646240ed8a3957e3a34621164bcc176879dd85de9952676be55decec157
3+
size 149
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:4203f3ef9596b1cf06af46cd7d0afe828de8bf2fd9bed39a857125de40d4d13d
3-
size 90933
2+
oid sha256:fe9a926b7c12e334c3febb6197319e8181935b855778aa9ec413059674ceb6c9
3+
size 115
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:5e2861d6931677098408d2d78d85bcd99470b5647b05b1993a46b888e3f20613
3-
size 226696
2+
oid sha256:94b0b74f3e2d977f140523432b5b888ceec9997d80b5b7cf0624acd83dde93f7
3+
size 115
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:0061fc2d3d16fd4f27d6c02853b6b8244f59b26b619740cba0725e318332f587
3-
size 2237
2+
oid sha256:db2ac0adfea9cfd8b0595e3a153a7933adc66e830662e80fcefd56e3dee47361
3+
size 2175

0 commit comments

Comments
 (0)