Skip to content

Commit a2e63f0

Browse files
authored
feat: importing GTex data into annonars genes database (#59)
1 parent 7d43865 commit a2e63f0

File tree

10 files changed

+149
-2
lines changed

10 files changed

+149
-2
lines changed

Snakefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,7 @@ include: "rules/work/genes/dbnsfp.smk"
327327
include: "rules/work/genes/clingen.smk"
328328
include: "rules/work/genes/ensembl.smk"
329329
include: "rules/work/genes/gnomad.smk"
330+
include: "rules/work/genes/gtex.smk"
330331
include: "rules/work/genes/hgnc.smk"
331332
include: "rules/work/genes/mehari_data_tx.smk"
332333
include: "rules/work/genes/ncbi.smk"

download_urls.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
- url: https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
2+
excerpt_strategy:
3+
strategy: no-excerpt
4+
count: null
5+
6+
- url: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
7+
excerpt_strategy:
8+
strategy: manual
9+
count: null
10+
111
- url: https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz
212
excerpt_strategy:
313
strategy: no-excerpt

environment.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies:
1111
- cattrs
1212
- click
1313
- loguru
14+
- numpy
1415
- pyyaml
1516
- requests
1617
- requests-ftp
@@ -40,7 +41,7 @@ dependencies:
4041
# Parallel (de)compression.
4142
- pigz
4243
# Varfish related
43-
- annonars =0.15.0
44+
- annonars =0.18.0
4445
- viguno =0.1.6
4546
- mehari =0.6.2
4647
- varfish-server-worker =0.10.1
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:4106fe3619eba6b628686936bf2b4b542bd4e7b8ff627e93642bd6b6c9dc548d
3+
size 1842014

excerpt-data/2295c2a0487d0dab/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:ce3752f6604f5e7c8b219777d805a4b911dfc3ff88260db85a50064b05bfa68e
3+
size 119
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:74f6ab4c34ed2648d708a0ae6e6dff324f6c86ea723ae7d1c37d76f5221148f0
3+
size 11512258

excerpt-data/9e484a896c7516d6/url.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:bbd6d3ca32a3e18599513f06b88bac0c8896685a79af7e84c4f723a9ef2900b8
3+
size 112

rules/output/annonars/genes.smk

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
1313
orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv",
1414
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
1515
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
16+
gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
1617
output:
1718
rocksdb_identity=(
1819
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/"
@@ -46,7 +47,8 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
4647
--path-in-orpha {input.orpha} \
4748
--path-in-ncbi {input.ncbi} \
4849
--path-in-rcnv {input.rcnv} \
49-
--path-in-shet {input.shet}
50+
--path-in-shet {input.shet} \
51+
--path-in-gtex {input.gtex}
5052
5153
varfish-db-downloader tpl \
5254
--template rules/output/annonars/genes.spec.yaml \

rules/output/annonars/genes.spec.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,5 @@ x-created-from:
4040
version: 2022-Collins-et-al
4141
- name: sHet scores
4242
version: 2019-Weghorn-et-al.
43+
- name: GTex data
44+
version: v8

rules/work/genes/gtex.smk

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import json
2+
import csv
3+
import gzip
4+
import sys
5+
import typing
6+
7+
import attrs
8+
import cattrs
9+
import numpy as np
10+
11+
12+
@attrs.frozen(auto_attribs=True)
class GtexTissueRecord:
    """Expression values of one gene within a single GTEx tissue.

    The instance is frozen, i.e. its attributes cannot be rebound after
    construction; the ``tpms`` list object itself remains a regular
    mutable list.
    """

    # Tissue name (the mapping rule fills this from the ``SMTS`` column).
    tissue: str
    # Detailed tissue name (filled from the ``SMTSD`` column).
    tissue_detailed: str
    # TPM values for this gene/tissue; in the records written for annonars
    # these are the quantiles at 0, 0.25, 0.5, 0.75 and 1.0.
    tpms: typing.List[float] = attrs.Factory(list)
17+
18+
19+
@attrs.frozen(auto_attribs=True)
class GtexGeneRecord:
    """All per-tissue expression records of one gene.

    One instance is serialized as one JSON line of the annonars input.
    """

    # HGNC identifier of the gene.
    hgnc_id: str
    # ENSEMBL gene identifier without the version suffix.
    ensembl_gene_id: str
    # Version suffix of the ENSEMBL gene identifier (the text after the
    # first "." of the ``Name`` column).
    ensembl_gene_version: str
    # One record per detailed tissue.
    records: typing.List[GtexTissueRecord]
25+
26+
27+
rule genes_gtex_v8_download:  # -- download GTex v8 gene expression data
    # Fetches the GTEx v8 sample attribute table and the per-sample gene TPM
    # matrix and writes an MD5 checksum file next to each download.
    output:
        attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        attributes_md5="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt.md5",
        genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
        genes_tpm_md5="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz.md5",
    params:
        # Upstream locations are kept as params (rather than buried in the
        # shell text) so they are visible in one place and tracked by
        # Snakemake as rule parameters.
        attributes_url="https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        genes_tpm_url="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
    # NOTE(review): --no-check-certificate disables TLS certificate
    # verification, and the MD5 files below are computed from whatever was
    # downloaded (they are not compared against a known checksum). The flag
    # is kept as-is here; confirm whether it is still required.
    shell:
        r"""
        wget --no-check-certificate \
            -O {output.attributes} \
            {params.attributes_url}
        md5sum {output.attributes} > {output.attributes_md5}

        wget --no-check-certificate \
            -O {output.genes_tpm} \
            {params.genes_tpm_url}
        md5sum {output.genes_tpm} > {output.genes_tpm_md5}
        """
46+
47+
48+
rule genes_gtex_v8_map:  # -- map GTex v8 gene files for annonars
    # Converts the GTEx v8 per-sample TPM matrix into one JSON line per gene.
    #
    # For every gene that can be mapped from its ENSEMBL ID to an HGNC ID, the
    # per-sample TPM values are grouped by detailed tissue (SMTSD) and reduced
    # to the quantiles at 0, 0.25, 0.5, 0.75 and 1.0.  Genes without an HGNC
    # mapping are reported on stderr and skipped.
    input:
        attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
        genes_xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
    output:
        genes_tpm="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
    run:
        from collections import Counter

        # Load mapping from sample ID to (tissue, detailed tissue) and count
        # the samples per detailed tissue for the log output.
        sampid_to_tissue = {}
        smtsd_count = Counter()
        with open(input.attributes, "rt") as inputf:
            for row in csv.DictReader(inputf, delimiter="\t"):
                sampid_to_tissue[row["SAMPID"]] = (row["SMTS"], row["SMTSD"])
                smtsd_count[row["SMTSD"]] += 1
        print("Sample counts per tissue:", file=sys.stderr)
        # most_common() orders by descending count, ties in first-seen order.
        for smtsd, count in smtsd_count.most_common():
            print(f"{smtsd}: {count}", file=sys.stderr)

        # Load mapping from ENSEMBL to HGNC gene ID
        with open(input.genes_xlink, "rt") as inputf:
            reader = csv.DictReader(inputf, delimiter="\t")
            ensembl_to_hgnc = {row["ensembl_gene_id"]: row["hgnc_id"] for row in reader}

        # Quantiles stored per tissue: minimum, quartiles and maximum.
        quantile_points = [0.0, 0.25, 0.5, 0.75, 1.0]

        # Map GTEx v8 gene expression data to counts JSONL data for annonars
        print("Transmogrifying expression data...", file=sys.stderr)
        with gzip.open(input.genes_tpm, "rt") as inputf, gzip.open(
            output.genes_tpm, "wt"
        ) as outputf:
            # Skip the two lines that precede the tab-separated header row.
            for _ in range(2):
                next(inputf)
            reader = csv.DictReader(inputf, delimiter="\t")

            # The sample columns and their tissues are the same for every
            # gene row, so resolve and order them once from the header instead
            # of re-checking every column of every row.  A sample column that
            # is missing from the attributes file raises KeyError here.
            columns_by_smtsd = {}
            for sampid in reader.fieldnames or []:
                if not sampid.startswith("GTEX-"):
                    continue
                smts, smtsd = sampid_to_tissue[sampid]
                columns_by_smtsd.setdefault(smtsd, (smts, []))[1].append(sampid)
            tissue_columns = sorted(
                (
                    (smts, smtsd, sampids)
                    for smtsd, (smts, sampids) in columns_by_smtsd.items()
                ),
                key=lambda entry: entry[:2],
            )

            for row in reader:
                ensembl_gene_id, ensembl_gene_version = row["Name"].split(".", 1)
                hgnc_id = ensembl_to_hgnc.get(ensembl_gene_id)
                if hgnc_id is None:
                    print(f"Skipping {ensembl_gene_id}.{ensembl_gene_version}", file=sys.stderr)
                    continue

                # One record per detailed tissue, already in (tissue,
                # detailed tissue) order, holding only the quantiles.
                records = [
                    GtexTissueRecord(
                        tissue=smts,
                        tissue_detailed=smtsd,
                        tpms=np.quantile(
                            np.array([float(row[sampid]) for sampid in sampids]),
                            quantile_points,
                        ).tolist(),
                    )
                    for smts, smtsd, sampids in tissue_columns
                ]

                gene_record = GtexGeneRecord(
                    hgnc_id=hgnc_id,
                    ensembl_gene_id=ensembl_gene_id,
                    ensembl_gene_version=ensembl_gene_version,
                    records=records,
                )
                print(
                    json.dumps(cattrs.unstructure(gene_record)),
                    file=outputf,
                )
        print("... done transmogrifying GTex data", file=sys.stderr)

0 commit comments

Comments
 (0)