diff --git a/environment.yml b/environment.yml index b8180d7..ef9cde1 100644 --- a/environment.yml +++ b/environment.yml @@ -53,3 +53,5 @@ dependencies: - httpx =0.25.0 - httpcore =0.18.0 - trio + # Used in genes_gnomad_convert_v4 + - polars >=1.23,<2 diff --git a/rules/work/genes/gnomad.smk b/rules/work/genes/gnomad.smk index d13e280..8e59030 100644 --- a/rules/work/genes/gnomad.smk +++ b/rules/work/genes/gnomad.smk @@ -99,88 +99,6 @@ rule genes_gnomad_download_v4: # -- download gnomAD gene constraints v4.1 """ -def run_genes_gnomad_constraints_v4_to_tsv(input, output, wildcards): - """Extra function because of snakefmt issues. - - Note that the names in the output file are taken from the v2.1.1 file. - """ - columns_src_dst = { - "transcript": "ensembl_transcript_id", - "lof_hc_lc.exp": "exp_lof", - "mis.exp": "exp_mis", - "syn.exp": "exp_syn", - "mis.z_score": "mis_z", - "lof_hc_lc.obs": "obs_lof", - "mis.obs": "obs_mis", - "syn.obs": "obs_syn", - "lof.oe": "oe_lof", - "lof.oe_ci.lower": "oe_lof_lower", - "lof.oe_ci.upper": "oe_lof_upper", - "mis.oe": "oe_mis", - "mis.oe_ci.lower": "oe_mis_lower", - "mis.oe_ci.upper": "oe_mis_upper", - "syn.oe": "oe_syn", - "syn.oe_ci.lower": "oe_syn_lower", - "syn.oe_ci.upper": "oe_syn_upper", - "lof.pLI": "pLI", - "syn.z_score": "syn_z", - } - columns_src_str = ",".join(columns_src_dst.keys()) - columns_tmp_str = ",".join(columns_src_dst.values()) - columns_dst = ["ensembl_gene_id", "entrez_id", "gene_symbol"] + list(columns_src_dst.values())[ - 1: - ] - columns_dst_str = ",".join( - columns_dst - + [ - "exac_pLI", - "exac_obs_lof", - "exac_exp_lof", - "exac_oe_lof", - ] - ) - shell( - r""" - export TMPDIR=$(mktemp -d) - mkdir -p $TMPDIR - trap "rm -rf $TMPDIR" EXIT - - # Extract transcripts that are MANE Select for genes that have one. - head -n 1 {input.tsv} \ - > $TMPDIR/tmp.tsv - tail -n +2 {input.tsv} \ - | grep ENSG \ - | sort -k3,3r \ - | sort -k1,1 -u \ - | sed -e 's/","/"__COMMA__"/g' \ - >> $TMPDIR/tmp.tsv - - # Convert to CSV so `qsv` has an easier time - cat $TMPDIR/tmp.tsv \ - | tr '\t' ',' \ - > $TMPDIR/tmp.txt - - # Select columns, rename, pad with the exac_* fields having NA values. - qsv select {columns_src_str} $TMPDIR/tmp.txt \ - | qsv rename {columns_tmp_str} \ - | perl -p -e 's/$/,exac_pLI,exac_obs_lof,exac_exp_lof,exac_oe_lof/ if $. == 1' \ - | perl -p -e 's/$/,NA,NA,NA,NA/ if $. != 1' \ - | tr ',' '\t' \ - | sed -e 's/__COMMA__/,/g' \ - > $TMPDIR/tmp2.tsv - - qsv join -d '\t' \ - ensembl_transcript_id $TMPDIR/tmp2.tsv \ - ensembl_transcript_id {input.xlink_ensembl} \ - | qsv select {columns_dst_str} \ - | tr ',' '\t' \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - ) - - rule genes_gnomad_convert_v4: # -- create gnomAD gene constraints TSV (v4.x) input: tsv="work/download/genes/gnomad/{v_gnomad_constraints}/gnomad.v{v_gnomad_constraints}.constraint_metrics.tsv", @@ -188,5 +106,5 @@ rule genes_gnomad_convert_v4: # -- create gnomAD gene constraints TSV (v4.x) output: tsv="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv", tsv_md5="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv.md5", - run: - run_genes_gnomad_constraints_v4_to_tsv(input, output, wildcards) + script: + "scripts/gnomad_constraints_v4_to_tsv.py" diff --git a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py new file mode 100644 index 0000000..89b7a13 --- /dev/null +++ b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py @@ -0,0 +1,73 @@ +import polars as pl +from snakemake.shell import shell + +def main( + constraint_tsv_path, ensembl_xlink_tsv_path, output_tsv_path, output_tsv_md5_path +): + columns_src_dst = { + "transcript": "ensembl_transcript_id", + "lof_hc_lc.exp": "exp_lof", + "mis.exp": "exp_mis", + "syn.exp": "exp_syn", + "mis.z_score": "mis_z", + "lof_hc_lc.obs": "obs_lof", + "mis.obs": "obs_mis", + "syn.obs": "obs_syn", + "lof.oe": "oe_lof", + "lof.oe_ci.lower": "oe_lof_lower", + "lof.oe_ci.upper": "oe_lof_upper", + "mis.oe": "oe_mis", + "mis.oe_ci.lower": "oe_mis_lower", + "mis.oe_ci.upper": "oe_mis_upper", + "syn.oe": "oe_syn", + "syn.oe_ci.lower": "oe_syn_lower", + "syn.oe_ci.upper": "oe_syn_upper", + "lof.pLI": "pLI", + "syn.z_score": "syn_z", + } + + additional_columns = ["exac_pLI", "exac_obs_lof", "exac_exp_lof", "exac_oe_lof"] + + columns_dst = ( + ["ensembl_gene_id", "entrez_id", "gene_symbol"] + + list(columns_src_dst.values())[1:] + + additional_columns + ) + + df = pl.read_csv(constraint_tsv_path, separator="\t", null_values="NA") + ensembl_xlink = pl.read_csv( + ensembl_xlink_tsv_path, separator="\t", null_values="NA" + ) + + df = ( + df.sort(by=["gene", "mane_select", "canonical", "cds_length", "transcript"]) + .filter(pl.col("gene_id").str.starts_with("ENSG")) + .group_by(["gene"]) + .last() + .sort(["gene", "mane_select", "canonical"]) + .with_columns(mane_or_canonical=pl.col("canonical") | pl.col("mane_select")) + ) + discard = df.filter(pl.col("mane_or_canonical") == False).select( + ["gene", "gene_id"] + ) + print("No mane_select or canonical transcript found for ", discard.select("gene")) + df = ( + df.filter(pl.col("mane_or_canonical") == True) + .rename(columns_src_dst) + .with_columns(pl.lit(None).alias(col) for col in additional_columns) + .join(ensembl_xlink, on=["ensembl_transcript_id"]) + .select(columns_dst) + .with_columns(pl.exclude(pl.String).cast(str)) + .fill_null("NA") + ) + df.write_csv(output_tsv_path, separator="\t") + shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") + + +if __name__ == "__main__": + main( + snakemake.input.tsv, + snakemake.input.xlink_ensembl, + snakemake.output.tsv, + snakemake.output.tsv_md5, + )