Skip to content

fix: ensure genes_gnomad_convert_v4 picks mane_select and canonical transcripts #101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,5 @@ dependencies:
- httpx =0.25.0
- httpcore =0.18.0
- trio
# Used in genes_gnomad_convert_v4
- polars >=1.23,<2
86 changes: 2 additions & 84 deletions rules/work/genes/gnomad.smk
Original file line number Diff line number Diff line change
Expand Up @@ -99,94 +99,12 @@ rule genes_gnomad_download_v4: # -- download gnomAD gene constraints v4.1
"""


def run_genes_gnomad_constraints_v4_to_tsv(input, output, wildcards):
"""Extra function because of snakefmt issues.

Note that the names in the output file are taken from the v2.1.1 file.
"""
columns_src_dst = {
"transcript": "ensembl_transcript_id",
"lof_hc_lc.exp": "exp_lof",
"mis.exp": "exp_mis",
"syn.exp": "exp_syn",
"mis.z_score": "mis_z",
"lof_hc_lc.obs": "obs_lof",
"mis.obs": "obs_mis",
"syn.obs": "obs_syn",
"lof.oe": "oe_lof",
"lof.oe_ci.lower": "oe_lof_lower",
"lof.oe_ci.upper": "oe_lof_upper",
"mis.oe": "oe_mis",
"mis.oe_ci.lower": "oe_mis_lower",
"mis.oe_ci.upper": "oe_mis_upper",
"syn.oe": "oe_syn",
"syn.oe_ci.lower": "oe_syn_lower",
"syn.oe_ci.upper": "oe_syn_upper",
"lof.pLI": "pLI",
"syn.z_score": "syn_z",
}
columns_src_str = ",".join(columns_src_dst.keys())
columns_tmp_str = ",".join(columns_src_dst.values())
columns_dst = ["ensembl_gene_id", "entrez_id", "gene_symbol"] + list(columns_src_dst.values())[
1:
]
columns_dst_str = ",".join(
columns_dst
+ [
"exac_pLI",
"exac_obs_lof",
"exac_exp_lof",
"exac_oe_lof",
]
)
shell(
r"""
export TMPDIR=$(mktemp -d)
mkdir -p $TMPDIR
trap "rm -rf $TMPDIR" EXIT

# Extract transcripts that are MANE Select for genes that have one.
head -n 1 {input.tsv} \
> $TMPDIR/tmp.tsv
tail -n +2 {input.tsv} \
| grep ENSG \
| sort -k3,3r \
| sort -k1,1 -u \
| sed -e 's/","/"__COMMA__"/g' \
>> $TMPDIR/tmp.tsv

# Convert to CSV so `qsv` has an easier time
cat $TMPDIR/tmp.tsv \
| tr '\t' ',' \
> $TMPDIR/tmp.txt

# Select columns, rename, pad with the exac_* fields having NA values.
qsv select {columns_src_str} $TMPDIR/tmp.txt \
| qsv rename {columns_tmp_str} \
| perl -p -e 's/$/,exac_pLI,exac_obs_lof,exac_exp_lof,exac_oe_lof/ if $. == 1' \
| perl -p -e 's/$/,NA,NA,NA,NA/ if $. != 1' \
| tr ',' '\t' \
| sed -e 's/__COMMA__/,/g' \
> $TMPDIR/tmp2.tsv

qsv join -d '\t' \
ensembl_transcript_id $TMPDIR/tmp2.tsv \
ensembl_transcript_id {input.xlink_ensembl} \
| qsv select {columns_dst_str} \
| tr ',' '\t' \
> {output.tsv}

md5sum {output.tsv} >{output.tsv_md5}
"""
)


rule genes_gnomad_convert_v4: # -- create gnomAD gene constraints TSV (v4.x)
input:
tsv="work/download/genes/gnomad/{v_gnomad_constraints}/gnomad.v{v_gnomad_constraints}.constraint_metrics.tsv",
xlink_ensembl=f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv",
output:
tsv="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv",
tsv_md5="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv.md5",
run:
run_genes_gnomad_constraints_v4_to_tsv(input, output, wildcards)
script:
"scripts/gnomad_constraints_v4_to_tsv.py"
73 changes: 73 additions & 0 deletions rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import polars as pl
from snakemake.shell import shell

def main(
constraint_tsv_path, ensembl_xlink_tsv_path, output_tsv_path, output_tsv_md5_path
):
columns_src_dst = {
"transcript": "ensembl_transcript_id",
"lof_hc_lc.exp": "exp_lof",
"mis.exp": "exp_mis",
"syn.exp": "exp_syn",
"mis.z_score": "mis_z",
"lof_hc_lc.obs": "obs_lof",
"mis.obs": "obs_mis",
"syn.obs": "obs_syn",
"lof.oe": "oe_lof",
"lof.oe_ci.lower": "oe_lof_lower",
"lof.oe_ci.upper": "oe_lof_upper",
"mis.oe": "oe_mis",
"mis.oe_ci.lower": "oe_mis_lower",
"mis.oe_ci.upper": "oe_mis_upper",
"syn.oe": "oe_syn",
"syn.oe_ci.lower": "oe_syn_lower",
"syn.oe_ci.upper": "oe_syn_upper",
"lof.pLI": "pLI",
"syn.z_score": "syn_z",
}

additional_columns = ["exac_pLI", "exac_obs_lof", "exac_exp_lof", "exac_oe_lof"]

columns_dst = (
["ensembl_gene_id", "entrez_id", "gene_symbol"]
+ list(columns_src_dst.values())[1:]
+ additional_columns
)

df = pl.read_csv(constraint_tsv_path, separator="\t", null_values="NA")
ensembl_xlink = pl.read_csv(
ensembl_xlink_tsv_path, separator="\t", null_values="NA"
)

df = (
df.sort(by=["gene", "mane_select", "canonical", "cds_length", "transcript"])
.filter(pl.col("gene_id").str.starts_with("ENSG"))
.group_by(["gene"])
.last()
.sort(["gene", "mane_select", "canonical"])
.with_columns(mane_or_canonical=pl.col("canonical") | pl.col("mane_select"))
)
discard = df.filter(pl.col("mane_or_canonical") == False).select(
["gene", "gene_id"]
)
print("No mane_select or canonical transcript found for ", discard.select("gene"))
df = (
df.filter(pl.col("mane_or_canonical") == True)
.rename(columns_src_dst)
.with_columns(pl.lit(None).alias(col) for col in additional_columns)
.join(ensembl_xlink, on=["ensembl_transcript_id"])
.select(columns_dst)
.with_columns(pl.exclude(pl.String).cast(str))
.fill_null("NA")
)
df.write_csv(output_tsv_path, separator="\t")
shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}")


if __name__ == "__main__":
main(
snakemake.input.tsv,
snakemake.input.xlink_ensembl,
snakemake.output.tsv,
snakemake.output.tsv_md5,
)