From 905e50f685e48bd59bc559b8b61edda5845665fc Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Sat, 8 Mar 2025 11:19:21 +0100 Subject: [PATCH 1/7] fix: ensure genes_gnomad_convert_v4 picks mane_select and canonical if available --- rules/work/genes/envs/polars.yaml | 4 ++ rules/work/genes/gnomad.smk | 6 +- .../scripts/gnomad_constraints_v4_to_tsv.py | 71 +++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 rules/work/genes/envs/polars.yaml create mode 100644 rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py diff --git a/rules/work/genes/envs/polars.yaml b/rules/work/genes/envs/polars.yaml new file mode 100644 index 0000000..8afaf3e --- /dev/null +++ b/rules/work/genes/envs/polars.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - polars==1.24.0 diff --git a/rules/work/genes/gnomad.smk b/rules/work/genes/gnomad.smk index d13e280..a49a6ae 100644 --- a/rules/work/genes/gnomad.smk +++ b/rules/work/genes/gnomad.smk @@ -188,5 +188,7 @@ rule genes_gnomad_convert_v4: # -- create gnomAD gene constraints TSV (v4.x) output: tsv="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv", tsv_md5="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv.md5", - run: - run_genes_gnomad_constraints_v4_to_tsv(input, output, wildcards) + conda: + "envs/polars.yaml" + script: + "scripts/gnomad_constraints_v4_to_tsv.py" diff --git a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py new file mode 100644 index 0000000..d9a0167 --- /dev/null +++ b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py @@ -0,0 +1,71 @@ +import polars as pl + +def main( + constraint_tsv_path, ensembl_xlink_tsv_path, output_tsv_path, output_tsv_md5_path +): + columns_src_dst = { + "transcript": "ensembl_transcript_id", + "lof_hc_lc.exp": "exp_lof", + "mis.exp": "exp_mis", + "syn.exp": "exp_syn", + "mis.z_score": "mis_z", + "lof_hc_lc.obs": "obs_lof", + "mis.obs": "obs_mis", + "syn.obs": "obs_syn", + "lof.oe": "oe_lof", + "lof.oe_ci.lower": "oe_lof_lower", + "lof.oe_ci.upper": "oe_lof_upper", + "mis.oe": "oe_mis", + "mis.oe_ci.lower": "oe_mis_lower", + "mis.oe_ci.upper": "oe_mis_upper", + "syn.oe": "oe_syn", + "syn.oe_ci.lower": "oe_syn_lower", + "syn.oe_ci.upper": "oe_syn_upper", + "lof.pLI": "pLI", + "syn.z_score": "syn_z", + } + + additional_columns = ["exac_pLI", "exac_obs_lof", "exac_exp_lof", "exac_oe_lof"] + + columns_dst = ( + ["ensembl_gene_id", "entrez_id", "gene_symbol"] + + list(columns_src_dst.values())[1:] + + additional_columns + ) + + df = pl.read_csv(constraint_tsv_path, separator="\t", null_values="NA") + ensembl_xlink = pl.read_csv( + ensembl_xlink_tsv_path, separator="\t", null_values="NA" + ) + + df = ( + df.sort(by=["gene", "mane_select", "canonical", "cds_length", "transcript"]) + .filter(pl.col("gene_id").str.starts_with("ENSG")) + .group_by(["gene"]) + .last() + .sort(["gene", "mane_select", "canonical"]) + .with_columns(mane_or_canonical=pl.col("canonical") | pl.col("mane_select")) + ) + discard = df.filter(pl.col("mane_or_canonical") == False).select( + ["gene", "gene_id"] + ) + print("No mane_select or canonical transcript found for ", discard.select("gene")) + df = ( + df.filter(pl.col("mane_or_canonical") == True) + .rename(columns_src_dst) + .with_columns(pl.lit(None).alias(col) for col in additional_columns) + .join(ensembl_xlink, on=["ensembl_transcript_id"]) + .select(columns_dst) + .fill_null("NA") + ) + df.write_csv(output_tsv_path, separator="\t") + shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") + + +if __name__ == "__main__": + main( + snakemake.input.tsv, + snakemake.input.xlink_ensembl, + snakemake.output.tsv, + snakemake.output.tsv_md5, + ) From cf630fc0cda2cff203561e2345b15675305b8a45 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Sat, 8 Mar 2025 11:21:54 +0100 Subject: [PATCH 2/7] move polars dependency from local conda to global conda env --- environment.yml | 2 ++ rules/work/genes/envs/polars.yaml | 4 ---- rules/work/genes/gnomad.smk | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) delete mode 100644 rules/work/genes/envs/polars.yaml diff --git a/environment.yml b/environment.yml index b8180d7..ef9cde1 100644 --- a/environment.yml +++ b/environment.yml @@ -53,3 +53,5 @@ dependencies: - httpx =0.25.0 - httpcore =0.18.0 - trio + # Used in genes_gnomad_convert_v4 + - polars >=1.23,<2 diff --git a/rules/work/genes/envs/polars.yaml b/rules/work/genes/envs/polars.yaml deleted file mode 100644 index 8afaf3e..0000000 --- a/rules/work/genes/envs/polars.yaml +++ /dev/null @@ -1,4 +0,0 @@ -channels: - - conda-forge -dependencies: - - polars==1.24.0 diff --git a/rules/work/genes/gnomad.smk b/rules/work/genes/gnomad.smk index a49a6ae..e738e52 100644 --- a/rules/work/genes/gnomad.smk +++ b/rules/work/genes/gnomad.smk @@ -188,7 +188,5 @@ rule genes_gnomad_convert_v4: # -- create gnomAD gene constraints TSV (v4.x) output: tsv="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv", tsv_md5="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv.md5", - conda: - "envs/polars.yaml" script: "scripts/gnomad_constraints_v4_to_tsv.py" From f16e54a2f0e22d369a9cb996f605ceb3f18dde4d Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Sat, 8 Mar 2025 11:22:47 +0100 Subject: [PATCH 3/7] remove old unused code --- rules/work/genes/gnomad.smk | 82 ------------------------------------- 1 file changed, 82 deletions(-) diff --git a/rules/work/genes/gnomad.smk b/rules/work/genes/gnomad.smk index e738e52..8e59030 100644 --- a/rules/work/genes/gnomad.smk +++ b/rules/work/genes/gnomad.smk @@ -99,88 +99,6 @@ rule genes_gnomad_download_v4: # -- download gnomAD gene constraints v4.1 """ -def run_genes_gnomad_constraints_v4_to_tsv(input, output, wildcards): - """Extra function because of snakefmt issues. - - Note that the names in the output file are taken from the v2.1.1 file. - """ - columns_src_dst = { - "transcript": "ensembl_transcript_id", - "lof_hc_lc.exp": "exp_lof", - "mis.exp": "exp_mis", - "syn.exp": "exp_syn", - "mis.z_score": "mis_z", - "lof_hc_lc.obs": "obs_lof", - "mis.obs": "obs_mis", - "syn.obs": "obs_syn", - "lof.oe": "oe_lof", - "lof.oe_ci.lower": "oe_lof_lower", - "lof.oe_ci.upper": "oe_lof_upper", - "mis.oe": "oe_mis", - "mis.oe_ci.lower": "oe_mis_lower", - "mis.oe_ci.upper": "oe_mis_upper", - "syn.oe": "oe_syn", - "syn.oe_ci.lower": "oe_syn_lower", - "syn.oe_ci.upper": "oe_syn_upper", - "lof.pLI": "pLI", - "syn.z_score": "syn_z", - } - columns_src_str = ",".join(columns_src_dst.keys()) - columns_tmp_str = ",".join(columns_src_dst.values()) - columns_dst = ["ensembl_gene_id", "entrez_id", "gene_symbol"] + list(columns_src_dst.values())[ - 1: - ] - columns_dst_str = ",".join( - columns_dst - + [ - "exac_pLI", - "exac_obs_lof", - "exac_exp_lof", - "exac_oe_lof", - ] - ) - shell( - r""" - export TMPDIR=$(mktemp -d) - mkdir -p $TMPDIR - trap "rm -rf $TMPDIR" EXIT - - # Extract transcripts that are MANE Select for genes that have one. - head -n 1 {input.tsv} \ - > $TMPDIR/tmp.tsv - tail -n +2 {input.tsv} \ - | grep ENSG \ - | sort -k3,3r \ - | sort -k1,1 -u \ - | sed -e 's/","/"__COMMA__"/g' \ - >> $TMPDIR/tmp.tsv - - # Convert to CSV so `qsv` has an easier time - cat $TMPDIR/tmp.tsv \ - | tr '\t' ',' \ - > $TMPDIR/tmp.txt - - # Select columns, rename, pad with the exac_* fields having NA values. - qsv select {columns_src_str} $TMPDIR/tmp.txt \ - | qsv rename {columns_tmp_str} \ - | perl -p -e 's/$/,exac_pLI,exac_obs_lof,exac_exp_lof,exac_oe_lof/ if $. == 1' \ - | perl -p -e 's/$/,NA,NA,NA,NA/ if $. != 1' \ - | tr ',' '\t' \ - | sed -e 's/__COMMA__/,/g' \ - > $TMPDIR/tmp2.tsv - - qsv join -d '\t' \ - ensembl_transcript_id $TMPDIR/tmp2.tsv \ - ensembl_transcript_id {input.xlink_ensembl} \ - | qsv select {columns_dst_str} \ - | tr ',' '\t' \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - ) - - rule genes_gnomad_convert_v4: # -- create gnomAD gene constraints TSV (v4.x) input: tsv="work/download/genes/gnomad/{v_gnomad_constraints}/gnomad.v{v_gnomad_constraints}.constraint_metrics.tsv", From b48d98586f2def54ae369609792ad837bee55ef2 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Mar 2025 10:33:19 +0100 Subject: [PATCH 4/7] snakemake.shell --- rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py index d9a0167..d08c9d1 100644 --- a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py +++ b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py @@ -59,7 +59,7 @@ def main( .fill_null("NA") ) df.write_csv(output_tsv_path, separator="\t") - shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") + snakemake.shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") if __name__ == "__main__": From 8325419825d774eb9a8ca96a8b073fd54bd8a3a5 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Mar 2025 10:38:54 +0100 Subject: [PATCH 5/7] fix snakemake.shell import --- rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py index d08c9d1..2c5cecd 100644 --- a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py +++ b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py @@ -1,4 +1,5 @@ import polars as pl +from snakemake.shell import shell def main( constraint_tsv_path, ensembl_xlink_tsv_path, output_tsv_path, output_tsv_md5_path @@ -59,7 +60,7 @@ def main( .fill_null("NA") ) df.write_csv(output_tsv_path, separator="\t") - snakemake.shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") + shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") if __name__ == "__main__": From ee957379e088053e7793853d6bc3996df3a52f8d Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Mar 2025 10:46:43 +0100 Subject: [PATCH 6/7] do not replace nulls with 'NA' --- rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py index 2c5cecd..2170d24 100644 --- a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py +++ b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py @@ -57,7 +57,6 @@ def main( .with_columns(pl.lit(None).alias(col) for col in additional_columns) .join(ensembl_xlink, on=["ensembl_transcript_id"]) .select(columns_dst) - .fill_null("NA") ) df.write_csv(output_tsv_path, separator="\t") shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}") From f8509f454ff3c52c658f9924800db8f1ce259324 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Mar 2025 11:33:51 +0100 Subject: [PATCH 7/7] cast to str, then fill with 'NA' --- rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py index 2170d24..89b7a13 100644 --- a/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py +++ b/rules/work/genes/scripts/gnomad_constraints_v4_to_tsv.py @@ -57,6 +57,8 @@ def main( .with_columns(pl.lit(None).alias(col) for col in additional_columns) .join(ensembl_xlink, on=["ensembl_transcript_id"]) .select(columns_dst) + .with_columns(pl.exclude(pl.String).cast(str)) + .fill_null("NA") ) df.write_csv(output_tsv_path, separator="\t") shell(f"md5sum {output_tsv_path} > {output_tsv_md5_path}")