Skip to content

Commit 664295d

Browse files
authored
feat: build annonars regions (clingen dosage) (#67)
1 parent 3fd72dd commit 664295d

File tree

11 files changed

+183
-0
lines changed

11 files changed

+183
-0
lines changed

Snakefile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ rule all:
165165
f"output/full/annonars/gnomad-sv-exomes-grch38-{DV.gnomad_cnv4}+{PV.annonars}/rocksdb/IDENTITY",
166166
f"output/full/annonars/gnomad-sv-genomes-grch37-{DV.gnomad_sv}+{PV.annonars}/rocksdb/IDENTITY",
167167
f"output/full/annonars/gnomad-sv-genomes-grch38-{DV.gnomad_sv4}+{PV.annonars}/rocksdb/IDENTITY",
168+
# ----- sequence annotation
169+
f"output/full/annonars/functional-grch37-{DV.refseq_fe_37}+{PV.annonars}/rocksdb/IDENTITY",
170+
f"output/full/annonars/functional-grch38-{DV.refseq_fe_38}+{PV.annonars}/rocksdb/IDENTITY",
171+
f"output/full/annonars/regions-grch37-{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
172+
f"output/full/annonars/regions-grch38-{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
168173
# ----- conservation
169174
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
170175
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
@@ -386,6 +391,8 @@ include: "rules/output/annonars/gnomad_mtdna.smk"
386391
include: "rules/output/annonars/gnomad_sv.smk"
387392
include: "rules/output/annonars/helix.smk"
388393
include: "rules/output/annonars/genes.smk"
394+
include: "rules/output/annonars/functional.smk"
395+
include: "rules/output/annonars/regions.smk"
389396
# ---- worker
390397
include: "rules/output/worker/patho_mms.smk"
391398
include: "rules/output/worker/clinvar.smk"

download_urls.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
2+
excerpt_strategy:
3+
strategy: gz-head
4+
count: 1000
5+
- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz
6+
excerpt_strategy:
7+
strategy: gz-head
8+
count: 1000
9+
110
- url: https://storage.googleapis.com/gcp-public-data--gnomad/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz
211
excerpt_strategy:
312
strategy: gz-head
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:6c9fcd2bed4045002e27b5f8b36be6d798f21e63f277d133df49c1466b0c0680
3+
size 20157

excerpt-data/98935d27cc8f0dc0/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:113a71bbc89339505e4d34c739662ce4e11ce40c7773607aea55719aa095e49e
3+
size 141
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:66c6578cae062849e5aa1d59c8a9a0e3b1bbd0173b7d0700eb7f22486154b7f7
3+
size 18765

excerpt-data/f0ed4b0862f1b46b/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa3523b242c4f92e346384bf1855bf53417f48d0b9814d9a8bf7891cbbcfebe7
3+
size 150

rules/output/annonars/functional.smk

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
## Rules to build the annonars functional annotation database.
2+
3+
4+
# Fetch a RefSeq genomic GFF file for GRCh37 from the NCBI annotation
# releases FTP area.  The `version` and `assembly` wildcards taken from the
# requested output path are substituted verbatim into the download URL.
rule work_annonars_functional_download_37:  # -- download functional data for GRCh37
    output:
        "work/download/refseq/grch37/{version}/{assembly}_genomic.gff.gz",
    shell:
        r"""
        wget -O {output} \
            https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
        """
12+
13+
14+
# Fetch a RefSeq genomic GFF file for GRCh38 from the NCBI annotation
# releases FTP area.  The `version` and `assembly` wildcards taken from the
# requested output path are substituted verbatim into the download URL.
rule work_annonars_functional_download_38:  # -- download functional data for GRCh38
    output:
        "work/download/refseq/grch38/{version}/{assembly}_genomic.gff.gz",
    shell:
        r"""
        wget -O {output} \
            https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
        """
22+
23+
24+
def output_annonars_functional_input(wildcards):
    """Input function: path of the downloaded RefSeq GFF for the requested release.

    Returns the GRCh37 download path when ``wildcards.genome_release`` is
    ``"grch37"``; any other value yields the GRCh38 download path.  The
    RefSeq annotation version in the path comes from ``DV.refseq_fe_37`` or
    ``DV.refseq_fe_38`` respectively.
    """
    if wildcards.genome_release != "grch37":
        return f"work/download/refseq/grch38/{DV.refseq_fe_38}/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
    return f"work/download/refseq/grch37/{DV.refseq_fe_37}/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
29+
30+
31+
# Build the annonars "functional" RocksDB from the RefSeq GFF download and
# render the accompanying spec.yaml.
#
# Wildcards:
#   genome_release -- passed to `annonars functional import` and to the spec
#                     template
#   v_refseq       -- RefSeq annotation version (constrained by RE_VERSION)
#   v_annonars     -- annonars version (constrained by RE_VERSION)
rule output_annonars_functional:  # -- build annonars functional RocksDB file
    input:
        output_annonars_functional_input,
    output:
        rocksdb_identity=(
            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/"
            "rocksdb/IDENTITY"
        ),
        spec_yaml=(
            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/spec.yaml"
        ),
    wildcard_constraints:
        v_refseq=RE_VERSION,
        v_annonars=RE_VERSION,
    shell:
        r"""
        # Scratch directory for the filtered GFF; removed when the shell exits.
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" EXIT

        # Keep only GFF header/comment lines and lines mentioning RefSeqFE.
        zgrep '^#\|RefSeqFE' {input} > $TMPDIR/tmp.gff

        annonars functional import -vvv \
            --genome-release {wildcards.genome_release} \
            --path-in-gff $TMPDIR/tmp.gff \
            --path-out-rocksdb $(dirname {output.rocksdb_identity})

        # The spec template references genome_release (dc.identifier and
        # x-genome-release), so it has to be supplied alongside the versions.
        varfish-db-downloader tpl \
            --template rules/output/annonars/functional.spec.yaml \
            --value today={TODAY} \
            --value genome_release={wildcards.genome_release} \
            \
            --value version={wildcards.v_refseq}+{wildcards.v_annonars} \
            --value v_refseq={wildcards.v_refseq} \
            \
            --value v_annonars={wildcards.v_annonars} \
            --value v_downloader={PV.downloader} \
        > {output.spec_yaml}
        """
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
dc.identifier: annonars/functional:{{ version }}-{{ genome_release }}
2+
dc.title: annonars functional elements RocksDB
3+
dc.creator: VarFish Developer Teams
4+
dc.format: application/x-rocksdb
5+
dc.date: {{ today }}
6+
x-version: {{ version }}
7+
x-genome-release: {{ genome_release }}
8+
dc.description: |
9+
RocksDB built from RefSeq Functional Elements (and other sources in
10+
the future).
11+
dc.source:
12+
- PMID:34876495
13+
- https://www.ncbi.nlm.nih.gov/refseq/
14+
x-created-from:
15+
- name: RefSeq Functional Elements
16+
version: {{ v_refseq }}

rules/output/annonars/regions.smk

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
## Rules to build the annonars regions annotation database.
2+
3+
4+
# Download the ClinGen region curation list for one genome release.
#
# Wildcards:
#   genome_release -- "grch37" or "grch38"; selects the GRCh37 / GRCh38 file
#                     on the ClinGen FTP server
#   today          -- only used as a path component of the output file
rule work_annonars_regions_download:  # -- download clingen regions
    output:
        "work/download/clingen/{genome_release}/{today}/ClinGen_region_curation_list_{genome_release}.tsv",
    shell:
        r"""
        # Map the lower-case release wildcard to the spelling used in the
        # ClinGen file names.  The original mapping was inverted (grch38
        # fetched the GRCh37 list and vice versa).
        case "{wildcards.genome_release}" in
            grch37)
                GENOME=GRCh37
                ;;
            grch38)
                GENOME=GRCh38
                ;;
            *)
                >&2 echo "unsupported genome release: {wildcards.genome_release}"
                exit 1
                ;;
        esac

        wget -O {output} \
            ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_$GENOME.tsv
        """
18+
19+
20+
# Build the annonars "regions" RocksDB from the ClinGen region curation list
# and render the accompanying spec.yaml.
#
# Wildcards:
#   genome_release -- passed to `annonars regions import` and to the spec
#                     template
#   date           -- download date in YYYYMMDD form; must equal the current
#                     date unless FORCE_TODAY is "True"
#   v_annonars     -- annonars version (constrained by RE_VERSION)
rule output_annonars_regions:  # -- build annonars regions RocksDB file
    input:
        "work/download/clingen/{genome_release}/{date}/ClinGen_region_curation_list_{genome_release}.tsv",
    output:
        rocksdb_identity=(
            "output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/" "rocksdb/IDENTITY"
        ),
        spec_yaml=("output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/spec.yaml"),
    wildcard_constraints:
        # The stale `v_refseq` constraint was dropped: this rule has no such
        # wildcard, so the constraint never applied.
        v_annonars=RE_VERSION,
    shell:
        r"""
        # The ClinGen download is unversioned, so refuse to label it with a
        # date other than today unless explicitly forced.
        if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
            >&2 echo "{wildcards.date} is not today"
            exit 1
        fi

        annonars regions import -vvv \
            --genome-release {wildcards.genome_release} \
            --path-in-clingen {input} \
            --path-out-rocksdb $(dirname {output.rocksdb_identity})

        # The spec template references genome_release (dc.identifier and
        # x-genome-release), so it has to be supplied alongside the versions.
        varfish-db-downloader tpl \
            --template rules/output/annonars/regions.spec.yaml \
            --value today={TODAY} \
            --value genome_release={wildcards.genome_release} \
            \
            --value version={wildcards.date}+{wildcards.v_annonars} \
            \
            --value v_annonars={wildcards.v_annonars} \
            --value v_downloader={PV.downloader} \
        > {output.spec_yaml}
        """
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
dc.identifier: annonars/regions:{{ version }}-{{ genome_release }}
2+
dc.title: annonars regions annotation RocksDB
3+
dc.creator: VarFish Developer Teams
4+
dc.format: application/x-rocksdb
5+
dc.date: {{ today }}
6+
x-version: {{ version }}
7+
x-genome-release: {{ genome_release }}
8+
dc.description: |
9+
RocksDB with region annotation.
10+
dc.source:
11+
- https://search.clinicalgenome.org/kb/gene-dosage
12+
x-created-from:
13+
- name: ClinGen Region Dosage Pathogenicity
14+
version: {{ today }}

0 commit comments

Comments
 (0)