Skip to content

Commit 3f70b57

Browse files
authored
feat: introduce comprehensive spec YAML files (#33) (#43)
1 parent 4b01fe9 commit 3f70b57

37 files changed

+622
-100
lines changed

Snakefile

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@
66
# ``varfish-server-worker`` and is used in the backend for filtering and/or exposed to the
77
# user via a REST API.
88

9-
from varfish_db_downloader.versions import DATA_VERSIONS as DV, PACKAGE_VERSIONS as PV
9+
from varfish_db_downloader.versions import (
10+
DATA_VERSIONS as DV,
11+
PACKAGE_VERSIONS as PV,
12+
TODAY,
13+
RUNS_IN_CI,
14+
)
1015

1116
# The prefix to use for all shell commands.
1217
SHELL_PREFIX = "export LC_ALL=C; set -x -euo pipefail;"
@@ -22,16 +27,11 @@ RE_VERSION = r"\w+(\.\w+)*"
2227
# Test Mode
2328
# ===============================================================================================
2429

25-
import os
26-
2730
# Activate test mode by prepending the path to the "test-mode-bin" directory to the PATH.
28-
if os.environ.get("CI", "false").lower() == "true":
31+
if RUNS_IN_CI:
2932
cwd = os.getcwd()
3033
old_path = os.environ["PATH"]
3134
os.environ["PATH"] = f"{cwd}/test-mode-bin:{old_path}"
32-
RUNS_IN_CI = True
33-
else:
34-
RUNS_IN_CI = False
3535

3636

3737
# ===============================================================================================
@@ -115,7 +115,8 @@ rule all:
115115
# ---- frequencies (via annonars)
116116
f"output/mehari/freqs-grch37-{DV.gnomad_v2}+{DV.gnomad_v2}+{DV.gnomad_mtdna}+{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
117117
f"output/mehari/freqs-grch38-{DV.gnomad_v3}+{DV.gnomad_v2}+{DV.gnomad_mtdna}+{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
118-
# ---- annonars data
118+
# -- annonars data
119+
# ----- sequence variant annotations
119120
f"output/annonars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
120121
f"output/annonars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY",
121122
f"output/annonars/dbsnp-grch37-{DV.dbsnp}+{PV.annonars}/rocksdb/IDENTITY",
@@ -134,10 +135,13 @@ rule all:
134135
f"output/annonars/gnomad-genomes-grch38-{DV.gnomad_v3}+{PV.annonars}/rocksdb/IDENTITY",
135136
f"output/annonars/helixmtdb-grch37-{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
136137
f"output/annonars/helixmtdb-grch38-{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY",
138+
# ----- conservation
137139
f"output/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
138140
f"output/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
139-
# ----- Genes
141+
# ----- genes
140142
f"output/worker/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.today}+{PV.worker}/rocksdb/IDENTITY",
143+
# -- worker data
144+
# ----- genes
141145
f"output/worker/genes-xlink-{DV.today}/genes-xlink.tsv",
142146
f"output/worker/genes-txs-grch37-{DV.mehari_tx}/mehari-data-txs-grch37-{DV.mehari_tx}.bin.zst",
143147
f"output/worker/genes-txs-grch38-{DV.mehari_tx}/mehari-data-txs-grch38-{DV.mehari_tx}.bin.zst",

data/acmg/3.1/acmg.spec.json

Lines changed: 0 additions & 21 deletions
This file was deleted.

data/acmg/3.1/acmg.spec.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
dc.format: text/tsv
2+
dc.identifier: genes/acmg/sf:3.1
3+
dc.title: ACMG Secondary Findings (SF) Gene List (v3.1)
4+
dc.description: >
5+
This is version 3.1 of the ACMG gene list for reporting incidental
6+
findings. The file was curated from PMID:35802134 as gene symbols
7+
and then translated to ENSEMBL and Entrez/NCBI gene ID with the
8+
HGNC BioMart.
9+
dc.date: 2022-02-03
10+
dc.creator: American College of Medical Genetics and Genomics
11+
dc.contributor:
12+
- VarFish Developer Team
13+
dc.source:
14+
- PMID:35802134
15+
- https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/
16+
- https://biomart.genenames.org/
17+
18+
tsv.columns:
19+
- name: hgnc_id
20+
description: HGNC gene ID.
21+
- name: ensembl_gene_id
22+
description: ENSEMBL gene ID.
23+
- name: ncbi_gene_id
24+
description: NCBI Gene ID.
25+
- name: gene_symbol
26+
description: HGNC approved gene symbol.
27+
- name: mim_gene_id
28+
description: OMIM gene ID.
29+
- name: disease_phenotype
30+
description: Name of the relevant disorder(s).
31+
- name: disorder_mim
32+
description: MIM code of the relevant disorder(s).
33+
- name: phenotype_category
34+
description: Phenotype category.
35+
- name: inheritance
36+
description: Mode(s) of inheritance.
37+
- name: sf_list_version
38+
description: ACMG SF list version that this gene first appeared in.
39+
- name: variants_to_report
40+
description: Comment on which variants are to be reported.

data/patho-mms/20220730/patho-mms-grch37.bed.spec.json

Lines changed: 0 additions & 21 deletions
This file was deleted.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
dc.format: text/tsv
2+
dc.identifier: features/patho-mms:wetzel-darbro-2022/grch37
3+
dc.title: >
4+
A comprehensive list of human microdeletion and microduplication syndromes
5+
(Wetzel & Darbro, 2022) for GRCh37.
6+
dc.description: >
7+
This TSV file contains regions with microdeletion and microduplication
8+
syndromes as described by Wetzel & Darbro (2022).
9+
dc.date: 2022-07-30
10+
dc.creator: Wetzel & Darbro (2022)
11+
dc.contributor:
12+
- VarFish Developer Team
13+
dc.source:
14+
- PMID:36435749
15+
- https://github.com/aswetzel/MMS
16+
17+
tsv.columns:
18+
- name: chrom
19+
description: Chromosome name without chr prefix.
20+
- name: begin
21+
description: 0-based start position.
22+
- name: end
23+
description: 0-based end position.
24+
- name: name
25+
description: Name of the syndrome.

data/patho-mms/20220730/patho-mms-grch38.bed.spec.json

Lines changed: 0 additions & 21 deletions
This file was deleted.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
dc.format: text/tsv
2+
dc.identifier: features/patho-mms:wetzel-darbro-2022/grch38
3+
dc.title: >
4+
A comprehensive list of human microdeletion and microduplication syndromes
5+
(Wetzel & Darbro, 2022) for GRCh38.
6+
dc.description: >
7+
This TSV file contains regions with microdeletion and microduplication
8+
syndromes as described by Wetzel & Darbro (2022).
9+
dc.date: 2022-07-30
10+
dc.creator: Wetzel & Darbro (2022)
11+
dc.contributor:
12+
- VarFish Developer Team
13+
dc.source:
14+
- PMID:36435749
15+
- https://github.com/aswetzel/MMS
16+
17+
tsv.columns:
18+
- name: chrom
19+
description: Chromosome name with chr prefix.
20+
- name: begin
21+
description: 0-based start position.
22+
- name: end
23+
description: 0-based end position.
24+
- name: name
25+
description: Name of the syndrome.

data/spec-tpl.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
dc.format: THE__FORMAT
2+
dc.identifier: THE__IDENTIFIER
3+
dc.title: >
4+
THE__TITLE
5+
dc.description: >
6+
THE__DESCRIPTION
7+
dc.date: THE__DATE
8+
dc.creator: THE__CREATOR
9+
dc.contributor:
10+
- VarFish Developer Team
11+
dc.source:
12+
- THE__SOURCE
13+
- THE__SOURCE
14+
15+
tsv.columns:
16+
- name: THE__NAME
17+
description: THE__DESCRIPTION

rules/output/annonars/cadd.smk

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@ rule output_annonars_cadd: # -- build CADD RocksDB with annonars
4949
input:
5050
unpack(input_output_annonars_cadd),
5151
output:
52-
"output/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/rocksdb/IDENTITY",
52+
rocksdb_identity=(
53+
"output/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/rocksdb/IDENTITY"
54+
),
55+
spec_yaml=("output/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/spec.yaml"),
5356
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
5457
resources:
5558
runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
@@ -63,7 +66,7 @@ rule output_annonars_cadd: # -- build CADD RocksDB with annonars
6366
annonars tsv import \
6467
--path-in-tsv {input.indels} \
6568
--path-in-tsv {input.snvs} \
66-
--path-out-rocksdb $(dirname {output}) \
69+
--path-out-rocksdb $(dirname {output.rocksdb_identity}) \
6770
\
6871
--col-chrom Chrom \
6972
--col-start Pos \
@@ -78,4 +81,16 @@ rule output_annonars_cadd: # -- build CADD RocksDB with annonars
7881
--skip-row-count 1 \
7982
--add-default-null-values \
8083
--path-schema-json rules/output/annonars/cadd-schema-{wildcards.genome_release}.json
84+
85+
varfish-db-downloader tpl \
86+
--template rules/output/annonars/cadd.spec.yaml \
87+
--value today={TODAY} \
88+
--value genome_release={wildcards.genome_release} \
89+
\
90+
--value version={wildcards.v_cadd}+{wildcards.v_annonars} \
91+
--value v_cadd={wildcards.v_cadd} \
92+
\
93+
--value v_annonars={wildcards.v_annonars} \
94+
--value v_downloader={PV.downloader} \
95+
> {output.spec_yaml}
8196
"""

rules/output/annonars/cadd.spec.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
dc.identifier: annonars/seqvars/cadd:{{ version }}-{{ genome_release }}
2+
dc.title: annona-rs CADD RocksDB Database
3+
dc.creator: Kircher Lab
4+
dc.contributor:
5+
- VarFish Developer Team
6+
dc.format: application/x-rocksdb
7+
dc.date: {{ today }}
8+
x-version: {{ version }}
9+
x-genome-release: {{ genome_release }}
10+
dc.description: |
11+
RocksDB with the information from the CADD score TSV files in their "incl. all annotations"
12+
variant, built using the annonars package v{{ v_annonars }} in varfish-db-downloader v{{ v_downloader }}.
13+
dc.source:
14+
- PMID:33618777
15+
- PMID:30371827
16+
- https://cadd.gs.washington.edu/
17+
x-created-from:
18+
- name: CADD
19+
version: {{ v_cadd }}

0 commit comments

Comments
 (0)