|
| 1 | +import json |
| 2 | +import csv |
| 3 | +import gzip |
| 4 | +import sys |
| 5 | +import typing |
| 6 | + |
| 7 | +import attrs |
| 8 | +import cattrs |
| 9 | +import numpy as np |
| 10 | + |
| 11 | + |
@attrs.frozen(auto_attribs=True)
class GtexTissueRecord:
    """Expression values of one gene in one GTEx tissue.

    In the records written by ``genes_gtex_v8_map``, ``tpms`` holds the five
    quantiles (minimum, first quartile, median, third quartile, maximum) of
    the per-sample TPM values of the tissue.
    """

    # Tissue name; filled from the ``SMTS`` column of the sample attributes.
    tissue: str
    # Detailed tissue name; filled from the ``SMTSD`` column.
    tissue_detailed: str
    # TPM values for this tissue; every instance gets its own fresh list.
    tpms: typing.List[float] = attrs.Factory(list)
| 17 | + |
| 18 | + |
@attrs.frozen(auto_attribs=True)
class GtexGeneRecord:
    """All per-tissue expression records of a single gene.

    One instance is serialized per output line by ``genes_gtex_v8_map``.
    """

    # HGNC identifier looked up via the ENSEMBL gene ID.
    hgnc_id: str
    # Part of the GTEx ``Name`` column before the first ".".
    ensembl_gene_id: str
    # Part of the GTEx ``Name`` column after the first ".".
    ensembl_gene_version: str
    # One record per detailed tissue.
    records: typing.List[GtexTissueRecord]
| 25 | + |
| 26 | + |
rule genes_gtex_v8_download:  # -- download GTEx v8 gene expression data
    output:
        attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        attributes_md5="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt.md5",
        genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
        genes_tpm_md5="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz.md5",
    # Fetches the sample attributes table and the gene TPM matrix and stores an
    # MD5 checksum file next to each download.
    #
    # NOTE(review): --no-check-certificate disables TLS certificate
    # verification for these downloads -- confirm that this is still required.
    shell:
        r"""
        # Sample attributes table and its checksum.
        wget --no-check-certificate \
            --output-document={output.attributes} \
            https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
        md5sum {output.attributes} > {output.attributes_md5}

        # Gene-level TPM matrix and its checksum.
        wget --no-check-certificate \
            --output-document={output.genes_tpm} \
            https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
        md5sum {output.genes_tpm} > {output.genes_tpm_md5}
        """
| 46 | + |
| 47 | + |
rule genes_gtex_v8_map:  # -- map GTEx v8 gene files for annonars
    input:
        attributes="work/download/genes/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        genes_tpm="work/download/genes/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
        genes_xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
    output:
        genes_tpm="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
    # Writes one JSON line per gene that has an HGNC ID.  Each line holds, for
    # every detailed tissue, the TPM quantiles over the samples of that tissue.
    # Genes without an HGNC mapping are skipped and reported on stderr.
    run:
        # Quantiles stored per tissue: minimum, quartiles, and maximum.
        quantiles = [0.0, 0.25, 0.5, 0.75, 1.0]

        # Load mapping from sample ID to (tissue, detailed tissue) and count
        # the samples per detailed tissue for the log output.
        sampid_to_tissue = {}
        smtsd_count = {}
        with open(input.attributes, "rt") as inputf:
            for row in csv.DictReader(inputf, delimiter="\t"):
                sampid_to_tissue[row["SAMPID"]] = (row["SMTS"], row["SMTSD"])
                smtsd_count[row["SMTSD"]] = smtsd_count.get(row["SMTSD"], 0) + 1
        print("Sample counts per tissue:", file=sys.stderr)
        for smtsd, count in sorted(smtsd_count.items(), key=lambda x: x[1], reverse=True):
            print(f"{smtsd}: {count}", file=sys.stderr)

        # Load mapping from ENSEMBL to HGNC gene ID
        with open(input.genes_xlink, "rt") as inputf:
            reader = csv.DictReader(inputf, delimiter="\t")
            ensembl_to_hgnc = {row["ensembl_gene_id"]: row["hgnc_id"] for row in reader}

        # Map GTEx v8 gene expression data to quantile JSONL data for annonars
        print("Transmogrifying expression data...", file=sys.stderr)
        with gzip.open(input.genes_tpm, "rt") as inputf, gzip.open(
            output.genes_tpm, "wt"
        ) as outputf:
            # Skip the two lines in front of the column header (presumably the
            # GCT version and dimension lines).
            for _ in range(2):
                next(inputf)
            reader = csv.DictReader(inputf, delimiter="\t")

            # The tissue of a sample column is the same for every gene row, so
            # group the sample columns by detailed tissue once, up front.  A
            # sample column without attributes entry raises KeyError here.
            samples_by_tissue = {}
            for sampid in reader.fieldnames or []:
                if not sampid.startswith("GTEX-"):
                    continue
                smts, smtsd = sampid_to_tissue[sampid]
                samples_by_tissue.setdefault(smtsd, (smts, []))[1].append(sampid)

            for row in reader:
                # NOTE(review): a name with a suffix after the version (for
                # example "<id>.<version>_PAR_Y", if present in the input)
                # keeps the suffix in the version and yields a second record
                # for the same gene ID -- confirm that this is intended.
                ensembl_gene_id, ensembl_gene_version = row["Name"].split(".", 1)
                hgnc_id = ensembl_to_hgnc.get(ensembl_gene_id)
                if hgnc_id is None:
                    print(f"Skipping {ensembl_gene_id}.{ensembl_gene_version}", file=sys.stderr)
                    continue

                # Build each tissue record once, directly with its quantiles.
                records = [
                    GtexTissueRecord(
                        tissue=smts,
                        tissue_detailed=smtsd,
                        tpms=np.quantile(
                            np.array([float(row[sampid]) for sampid in sampids]),
                            quantiles,
                        ).tolist(),
                    )
                    for smtsd, (smts, sampids) in samples_by_tissue.items()
                ]
                records.sort(key=lambda r: (r.tissue, r.tissue_detailed))

                gene_record = GtexGeneRecord(
                    hgnc_id=hgnc_id,
                    ensembl_gene_id=ensembl_gene_id,
                    ensembl_gene_version=ensembl_gene_version,
                    records=records,
                )
                print(
                    json.dumps(cattrs.unstructure(gene_record)),
                    file=outputf,
                )
        print("... done transmogrifying GTex data", file=sys.stderr)
0 commit comments