Skip to content

Commit 375dd36

Browse files
authored
feat: add DECIPHER HI predictions v3 (#63)
1 parent b6bc6c5 commit 375dd36

File tree

5 files changed

+67
-0
lines changed

5 files changed

+67
-0
lines changed

Snakefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ rule all:
8989
f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz",
9090
f"work/download/genes/orphapacket/{DV.orphapacket}/orphapacket.tar.gz",
9191
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
92+
"work/genes/decipher/v3/decipher_hi_prediction.tsv.gz",
9293
f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv",
9394
f"work/genes/enst_ensg/grch37/{DV.ensembl_37}/enst_ensg.tsv",
9495
f"work/genes/entrez/{DV.today}/gene_info.jsonl",
@@ -329,6 +330,7 @@ include: "rules/work/misc/hpo.smk"
329330
# Gene-related rules.
330331
include: "rules/work/genes/dbnsfp.smk"
331332
include: "rules/work/genes/clingen.smk"
333+
include: "rules/work/genes/decipher.smk"
332334
include: "rules/work/genes/ensembl.smk"
333335
include: "rules/work/genes/gnomad.smk"
334336
include: "rules/work/genes/gtex.smk"

download_urls.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
- url: https://www.deciphergenomics.org/files/downloads/HI_Predictions_Version3.bed.gz
2+
13
- url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv
24
- url: ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh38.tsv
35

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f2d2ed0ba3247c444a0f55a0659d16b253f6030108e3497785505d11dea72838
3+
size 3072
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:5daa1c09650953bb18b68be21872a95a222b60e1619b3959a1cb8d443e40a055
3+
size 80

rules/work/genes/decipher.smk

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
## Rules related to DECIPHER gene information.
2+
3+
4+
rule genes_decipher_hi_download:  # -- download DECIPHER HI predictions
    # Downloads the DECIPHER HI predictions (version 3) BED file and writes an
    # MD5 checksum file next to it.
    #
    # NOTE(review): `--no-check-certificate` turns off TLS certificate
    # verification, so the download is not authenticated; the MD5 below is
    # computed locally and only documents what was received.  Confirm whether
    # the flag is still required for this host.
    output:
        bed="work/download/genes/decipher/v3/HI_Predictions_Version3.bed.gz",
        bed_md5="work/download/genes/decipher/v3/HI_Predictions_Version3.bed.gz.md5",
    shell:
        r"""
        wget --no-check-certificate \
            -O {output.bed} \
            https://www.deciphergenomics.org/files/downloads/HI_Predictions_Version3.bed.gz

        # Reject truncated transfers or non-gzip payloads (for example an HTML
        # error page) in this rule, so that a corrupt file is never kept as a
        # finished download and picked up by the conversion rule.
        gzip -t {output.bed}

        md5sum {output.bed} > {output.bed_md5}
        """
16+
17+
18+
rule genes_decipher_hi_convert:  # -- convert DECIPHER HI predictions to TSV
    # Joins the downloaded DECIPHER HI predictions with the genes-xlink table
    # on the gene symbol and writes a sorted, gzip-compressed TSV with the
    # header `hgnc_id, hgnc_symbol, p_hi, hi_index`, plus an MD5 checksum file.
    input:
        hgnc=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
        bed="work/download/genes/decipher/v3/HI_Predictions_Version3.bed.gz",
    output:
        tsv="work/genes/decipher/v3/decipher_hi_prediction.tsv.gz",
        tsv_md5="work/genes/decipher/v3/decipher_hi_prediction.tsv.gz.md5",
    shell:
        r"""
        set -x

        # Assign first and export afterwards: `export VAR=$(cmd)` reports the
        # exit status of `export`, which would hide a failing `mktemp` and
        # leave TMPDIR empty.
        TMPDIR=$(mktemp -d)
        export TMPDIR
        # Single quotes defer expansion until the trap fires.
        trap 'rm -rf "$TMPDIR"' EXIT

        echo -e "gene_symbol\tp_hi\thi_index" > "$TMPDIR/tmp.tsv"

        # Drop the first line of the BED file, take its 4th column, split that
        # column on '|' into tab-separated fields and strip a trailing '%'.
        zcat {input.bed} \
        | tail -n +2 \
        | cut -f 4 \
        | tr '|' '\t' \
        | sed -e 's/%$//g' \
        >> "$TMPDIR/tmp.tsv"

        qsv join \
            gene_symbol {input.hgnc} \
            gene_symbol "$TMPDIR/tmp.tsv" \
        > "$TMPDIR/tmp2.tsv"

        # The join result is converted from comma- to tab-separated and reduced
        # to column 1 and columns 5-7 under a fresh header.
        # NOTE(review): this assumes no joined field contains a comma and that
        # the genes-xlink table has four columns -- confirm against its schema.
        ( \
            echo -e "hgnc_id\thgnc_symbol\tp_hi\thi_index"; \
            tail -n +2 "$TMPDIR/tmp2.tsv" \
            | tr ',' '\t' \
            | cut -f 1,5-7 \
            | LC_ALL=C sort \
        ) \
        | gzip -c \
        > {output.tsv}

        md5sum {output.tsv} > {output.tsv_md5}
        """

0 commit comments

Comments
 (0)