From ef2f9dacc43023d1562db9aeda6dd522bc441f55 Mon Sep 17 00:00:00 2001 From: Mackenzie Noon Date: Wed, 3 Jun 2026 16:15:49 -0400 Subject: [PATCH 1/9] example of compiler bug --- test_compiler/minimal_test_input.tsv | 5 +++++ test_compiler/output_pre_fix/dna_annot.tsv.gz | Bin 0 -> 124 bytes test_compiler/output_pre_fix/dna_counts.tsv.gz | Bin 0 -> 111 bytes test_compiler/output_pre_fix/rna_annot.tsv.gz | Bin 0 -> 125 bytes test_compiler/output_pre_fix/rna_counts.tsv.gz | Bin 0 -> 116 bytes test_compiler/test.py | 17 +++++++++++++++++ 6 files changed, 22 insertions(+) create mode 100644 test_compiler/minimal_test_input.tsv create mode 100644 test_compiler/output_pre_fix/dna_annot.tsv.gz create mode 100644 test_compiler/output_pre_fix/dna_counts.tsv.gz create mode 100644 test_compiler/output_pre_fix/rna_annot.tsv.gz create mode 100644 test_compiler/output_pre_fix/rna_counts.tsv.gz create mode 100644 test_compiler/test.py diff --git a/test_compiler/minimal_test_input.tsv b/test_compiler/minimal_test_input.tsv new file mode 100644 index 0000000..1a330ce --- /dev/null +++ b/test_compiler/minimal_test_input.tsv @@ -0,0 +1,5 @@ +label Sequence Barcode DNA(condition X, replicate 1) DNA(condition X, replicate 2) DNA(condition X, replicate 3) RNA(condition X, replicate 1) RNA(condition X, replicate 2) RNA(condition X, replicate 3) +oligoA A BC0 10 20 30 100 200 300 +oligoA A BC1 11 21 31 101 201 301 +oligoB B BC2 12 22 32 102 202 302 +oligoB B BC3 13 23 33 103 203 303 diff --git a/test_compiler/output_pre_fix/dna_annot.tsv.gz b/test_compiler/output_pre_fix/dna_annot.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..46098423c75caee1800046a9283b176a8a36fd56 GIT binary patch literal 124 zcmV-?0E7P@iwFopg&=AI|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~00030{{sNd(SSvu0001)%QbcY literal 0 HcmV?d00001 diff --git a/test_compiler/output_pre_fix/dna_counts.tsv.gz b/test_compiler/output_pre_fix/dna_counts.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..2d54e55f464ae1e9340f8f59763086c2c808198a GIT binary patch literal 111 zcmV-#0FeJ5iwFopg&=AI|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30 zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*u=3RjR2VSZx)1ql{t$gs-` R009600|0U!H`8?h008xgEQ$aC literal 0 HcmV?d00001 diff --git a/test_compiler/output_pre_fix/rna_annot.tsv.gz b/test_compiler/output_pre_fix/rna_annot.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..c5d09e08c5856ddcc0f29d7f7271ce8fb47f3807 GIT binary patch literal 125 zcmV-@0D}J?iwFopg&=AI|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3 zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&Wcks~I?6Fm8! fzzLkd37oOV|NjF3=9s1tpa1{>WJ){# literal 0 HcmV?d00001 diff --git a/test_compiler/output_pre_fix/rna_counts.tsv.gz b/test_compiler/output_pre_fix/rna_counts.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..8314690a10c4db371ba55b23ed6e73bcac607d6a GIT binary patch literal 116 zcmV-)0E_=0iwFopg&=AI|8j0&Ut@1|Zgg`lbaQq9JjpQ%fG`k5(Km2=>t-!2Ma}5R=GQN9Jq7s!T WyX>;p7XSeN{{sL404bP(0000i Date: Wed, 3 Jun 2026 16:47:40 -0400 Subject: [PATCH 2/9] +fix for compiler replicate confounding +example of fix --- test_compiler/output_post_fix/dna_annot.tsv.gz | Bin 0 -> 124 bytes test_compiler/output_post_fix/dna_counts.tsv.gz | Bin 0 -> 111 bytes test_compiler/output_post_fix/rna_annot.tsv.gz | Bin 0 -> 125 bytes test_compiler/output_post_fix/rna_counts.tsv.gz | Bin 0 -> 116 bytes workflow/scripts/count/mpranalyze_compiler.py | 2 +- 5 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 test_compiler/output_post_fix/dna_annot.tsv.gz create mode 100644 test_compiler/output_post_fix/dna_counts.tsv.gz create mode 100644 test_compiler/output_post_fix/rna_annot.tsv.gz create mode 100644 test_compiler/output_post_fix/rna_counts.tsv.gz diff --git a/test_compiler/output_post_fix/dna_annot.tsv.gz b/test_compiler/output_post_fix/dna_annot.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..76f339181a247eefb2e0150634deca01a2c9b99e GIT binary patch literal 124 zcmV-?0E7P@iwFn;k|1gV|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~00030{{sNd(SSvu0001i)-^)_ literal 0 HcmV?d00001 diff --git a/test_compiler/output_post_fix/dna_counts.tsv.gz b/test_compiler/output_post_fix/dna_counts.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..a968e50f99698e16a783ff48540fb148ffe2ab3b GIT binary patch literal 111 zcmV-#0FeJ5iwFn;k|1gV|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30 zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*t#LKPxhVSZx)1sW1$Sg^|r R009600|0+Zm9TXH007IbE7<@5 literal 0 HcmV?d00001 diff --git a/test_compiler/output_post_fix/rna_annot.tsv.gz b/test_compiler/output_post_fix/rna_annot.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..286632b075cd8547a375031622ec8621603468aa GIT binary patch literal 125 zcmV-@0D}J?iwFn;k|1gV|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3 zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&Wcks~I?6Fm8! fzzLkd37oOV|NjF3=9s1tpa1{>Od>nX literal 0 HcmV?d00001 diff --git a/test_compiler/output_post_fix/rna_counts.tsv.gz b/test_compiler/output_post_fix/rna_counts.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..295b79e7a9bb7140caabff229183dfd30cbaa109 GIT binary patch literal 116 zcmV-)0E_=0iwFn;k|1gV|8j0&Ut@1|Zgg`lbaQq9Jc%(1fG`jQ>*O_eyDx|ju&}bb zMv*`ui2nx{lq-f^W}fqoew1x(v4@A}!~c*D>6ngzI!C`|D{dydKnpFkH2?CV1VJST WDnYQm00030{{sL?T8v_W0002LcP}OY literal 0 HcmV?d00001 diff --git a/workflow/scripts/count/mpranalyze_compiler.py b/workflow/scripts/count/mpranalyze_compiler.py index b559c1f..2a77622 100644 --- a/workflow/scripts/count/mpranalyze_compiler.py +++ b/workflow/scripts/count/mpranalyze_compiler.py @@ -79,7 +79,7 @@ def generateAnnotationOutput(data, number_barcodes): ## generate output DNA/RNA count tables ## rows oligo/seq ids,/assignment then per barcode the counts. padding with zeros def generateCountOutput(data,columns): - counts = pd.DataFrame(list(data.groupby('label').apply(lambda x: x.values.flatten()))).fillna(0).astype(np.int64) + counts = pd.DataFrame(list(data.groupby('label').apply(lambda x: x.values.flatten(order='F')))).fillna(0).astype(np.int64) counts.columns = columns counts['seq_id'] = data.index.unique() counts = counts[(['seq_id'] + list(columns))] From a1115ec4b3d197841db32d851e1346ec1badb55f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jun 2026 21:00:07 +0000 Subject: [PATCH 3/9] ci: auto fixes from pre-commit hooks for more information, see https://pre-commit.ci --- test_compiler/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_compiler/test.py b/test_compiler/test.py index 0f2aa85..eb594fb 100644 --- a/test_compiler/test.py +++ b/test_compiler/test.py @@ -1,5 +1,6 @@ -import sys import os +import sys + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "workflow", "scripts", "count")) def main(): From 6b7ffcae4377e4a77e30ca5c8ae0e2dc5209b0e7 Mon Sep 17 00:00:00 2001 From: Mackenzie Noon Date: Fri, 5 Jun 2026 19:24:26 -0400 Subject: [PATCH 4/9] Potential fix for pull request finding This improves the tests substantively, as described. Not actually required for the fix, but a nice addition. Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- test_compiler/test.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/test_compiler/test.py b/test_compiler/test.py index eb594fb..b13b660 100644 --- a/test_compiler/test.py +++ b/test_compiler/test.py @@ -4,15 +4,32 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "workflow", "scripts", "count")) def main(): + import tempfile + + import pandas as pd + import mpranalyze_compiler as c + here = os.path.dirname(__file__) - c.cli.callback( - input_file=os.path.join(here, "minimal_test_input.tsv"), - rna_counts_output_file=os.path.join(here, "rna_counts.tsv.gz"), - dna_counts_output_file=os.path.join(here, "dna_counts.tsv.gz"), - rna_annotation_output_file=os.path.join(here, "rna_annot.tsv.gz"), - dna_annotation_output_file=os.path.join(here, "dna_annot.tsv.gz"), - ) + with tempfile.TemporaryDirectory() as tmp: + rna_counts_output = os.path.join(tmp, "rna_counts.tsv.gz") + dna_counts_output = os.path.join(tmp, "dna_counts.tsv.gz") + rna_annot_output = os.path.join(tmp, "rna_annot.tsv.gz") + dna_annot_output = os.path.join(tmp, "dna_annot.tsv.gz") + + c.cli.callback( + input_file=os.path.join(here, "minimal_test_input.tsv"), + rna_counts_output_file=rna_counts_output, + dna_counts_output_file=dna_counts_output, + rna_annotation_output_file=rna_annot_output, + dna_annotation_output_file=dna_annot_output, + ) + + rna = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id") + dna = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id") + + assert list(rna.loc["oligoA", ["RNA_X_1_1", "RNA_X_1_2", "RNA_X_2_1", "RNA_X_2_2", "RNA_X_3_1", "RNA_X_3_2"]]) == [100, 101, 200, 201, 300, 301] + assert list(dna.loc["oligoA", ["DNA_X_1_1", "DNA_X_1_2", "DNA_X_2_1", "DNA_X_2_2", "DNA_X_3_1", "DNA_X_3_2"]]) == [10, 11, 20, 21, 30, 31] if __name__=="__main__": main() From 2a93422c6d2d2476c871312fa596f7e20f8e0c42 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:25:28 +0000 Subject: [PATCH 5/9] ci: auto fixes from pre-commit hooks for more information, see https://pre-commit.ci --- test_compiler/test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_compiler/test.py b/test_compiler/test.py index b13b660..d411935 100644 --- a/test_compiler/test.py +++ b/test_compiler/test.py @@ -6,9 +6,8 @@ def main(): import tempfile - import pandas as pd - import mpranalyze_compiler as c + import pandas as pd here = os.path.dirname(__file__) with tempfile.TemporaryDirectory() as tmp: From 68f49a36b06e5f6ddb40b49fed8e9330204b7d0f Mon Sep 17 00:00:00 2001 From: Max Schubach Date: Mon, 8 Jun 2026 13:29:05 +0200 Subject: [PATCH 6/9] tests: using general pytests for python scripts --- .github/workflows/main.yml | 17 ++++ pyproject.toml | 5 + requirements-test.txt | 4 + .../output_post_fix/dna_annot.tsv.gz | Bin 124 -> 0 bytes .../output_post_fix/dna_counts.tsv.gz | Bin 111 -> 0 bytes .../output_post_fix/rna_annot.tsv.gz | Bin 125 -> 0 bytes .../output_post_fix/rna_counts.tsv.gz | Bin 116 -> 0 bytes test_compiler/output_pre_fix/dna_annot.tsv.gz | Bin 124 -> 0 bytes .../output_pre_fix/dna_counts.tsv.gz | Bin 111 -> 0 bytes test_compiler/output_pre_fix/rna_annot.tsv.gz | Bin 125 -> 0 bytes .../output_pre_fix/rna_counts.tsv.gz | Bin 116 -> 0 bytes test_compiler/test.py | 34 ------- tests/README.md | 20 ++++ tests/conftest.py | 37 ++++++++ tests/count/test_mpranalyze_compiler.py | 88 ++++++++++++++++++ .../fixtures/count}/minimal_test_input.tsv | 0 workflow/scripts/count/mpranalyze_compiler.py | 82 +++++++++------- 17 files changed, 219 insertions(+), 68 deletions(-) create mode 100644 requirements-test.txt delete mode 100644 test_compiler/output_post_fix/dna_annot.tsv.gz delete mode 100644 test_compiler/output_post_fix/dna_counts.tsv.gz delete mode 100644 test_compiler/output_post_fix/rna_annot.tsv.gz delete mode 100644 test_compiler/output_post_fix/rna_counts.tsv.gz delete mode 100644 test_compiler/output_pre_fix/dna_annot.tsv.gz delete mode 100644 test_compiler/output_pre_fix/dna_counts.tsv.gz delete mode 100644 test_compiler/output_pre_fix/rna_annot.tsv.gz delete mode 100644 test_compiler/output_pre_fix/rna_counts.tsv.gz delete mode 100644 test_compiler/test.py create mode 100644 tests/README.md create mode 100644 tests/conftest.py create mode 100644 tests/count/test_mpranalyze_compiler.py rename {test_compiler => tests/fixtures/count}/minimal_test_input.tsv (100%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 39c6e37..10679ab 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -54,6 +54,23 @@ jobs: directory: . snakefile: workflow/Snakefile args: "--lint --configfile config/example_config.yaml --config skip_version_check=True" + + Pytest: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install test dependencies + run: python -m pip install --upgrade pip -r requirements-test.txt + - name: Run pytest suite + run: python -m pytest -q tests # Testing: # runs-on: ubuntu-latest # needs: diff --git a/pyproject.toml b/pyproject.toml index 52c96bc..1050e93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,7 @@ [tool.snakefmt] line_length = 127 + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py", "*_test.py"] +addopts = "-ra" diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..42b555e --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,4 @@ +pytest +click +numpy +pandas diff --git a/test_compiler/output_post_fix/dna_annot.tsv.gz b/test_compiler/output_post_fix/dna_annot.tsv.gz deleted file mode 100644 index 76f339181a247eefb2e0150634deca01a2c9b99e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 124 zcmV-?0E7P@iwFn;k|1gV|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~00030{{sNd(SSvu0001i)-^)_ diff --git a/test_compiler/output_post_fix/dna_counts.tsv.gz b/test_compiler/output_post_fix/dna_counts.tsv.gz deleted file mode 100644 index a968e50f99698e16a783ff48540fb148ffe2ab3b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 111 zcmV-#0FeJ5iwFn;k|1gV|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30 zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*t#LKPxhVSZx)1sW1$Sg^|r R009600|0+Zm9TXH007IbE7<@5 diff --git a/test_compiler/output_post_fix/rna_annot.tsv.gz b/test_compiler/output_post_fix/rna_annot.tsv.gz deleted file mode 100644 index 286632b075cd8547a375031622ec8621603468aa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 125 zcmV-@0D}J?iwFn;k|1gV|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3 zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&Wcks~I?6Fm8! fzzLkd37oOV|NjF3=9s1tpa1{>Od>nX diff --git a/test_compiler/output_post_fix/rna_counts.tsv.gz b/test_compiler/output_post_fix/rna_counts.tsv.gz deleted file mode 100644 index 295b79e7a9bb7140caabff229183dfd30cbaa109..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 116 zcmV-)0E_=0iwFn;k|1gV|8j0&Ut@1|Zgg`lbaQq9Jc%(1fG`jQ>*O_eyDx|ju&}bb zMv*`ui2nx{lq-f^W}fqoew1x(v4@A}!~c*D>6ngzI!C`|D{dydKnpFkH2?CV1VJST WDnYQm00030{{sL?T8v_W0002LcP}OY diff --git a/test_compiler/output_pre_fix/dna_annot.tsv.gz b/test_compiler/output_pre_fix/dna_annot.tsv.gz deleted file mode 100644 index 46098423c75caee1800046a9283b176a8a36fd56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 124 zcmV-?0E7P@iwFopg&=AI|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~00030{{sNd(SSvu0001)%QbcY diff --git a/test_compiler/output_pre_fix/dna_counts.tsv.gz b/test_compiler/output_pre_fix/dna_counts.tsv.gz deleted file mode 100644 index 2d54e55f464ae1e9340f8f59763086c2c808198a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 111 zcmV-#0FeJ5iwFopg&=AI|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30 zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*u=3RjR2VSZx)1ql{t$gs-` R009600|0U!H`8?h008xgEQ$aC diff --git a/test_compiler/output_pre_fix/rna_annot.tsv.gz b/test_compiler/output_pre_fix/rna_annot.tsv.gz deleted file mode 100644 index c5d09e08c5856ddcc0f29d7f7271ce8fb47f3807..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 125 zcmV-@0D}J?iwFopg&=AI|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3 zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&Wcks~I?6Fm8! fzzLkd37oOV|NjF3=9s1tpa1{>WJ){# diff --git a/test_compiler/output_pre_fix/rna_counts.tsv.gz b/test_compiler/output_pre_fix/rna_counts.tsv.gz deleted file mode 100644 index 8314690a10c4db371ba55b23ed6e73bcac607d6a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 116 zcmV-)0E_=0iwFopg&=AI|8j0&Ut@1|Zgg`lbaQq9JjpQ%fG`k5(Km2=>t-!2Ma}5R=GQN9Jq7s!T WyX>;p7XSeN{{sL404bP(0000i/test_*.py` holds the actual tests, grouped by script area such as `count`. +- `tests/fixtures//` stores reusable input data for those tests. + +## Adding a new script test + +1. Put the new test in the matching area folder, for example `tests/count/test_new_script.py`. +2. Add any reusable inputs under `tests/fixtures//`. +3. Prefer `click.testing.CliRunner` for Click commands and pytest fixtures like `tmp_path` for temporary outputs. +4. Run a focused check with `conda run -n mpralib python -m pytest -q tests//test_new_script.py`. + +## Current example + +The MPRAnalyze compiler test lives in `tests/count/test_mpranalyze_compiler.py` and uses the fixture input at `tests/fixtures/count/minimal_test_input.tsv`. diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8da7f62 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,37 @@ +import sys +from pathlib import Path + +import pytest +from click.testing import CliRunner + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +PROJECT_ROOT_STR = str(PROJECT_ROOT) +TESTS_ROOT = Path(__file__).resolve().parent + +if PROJECT_ROOT_STR not in sys.path: + sys.path.insert(0, PROJECT_ROOT_STR) + + +@pytest.fixture(scope="session") +def tests_root() -> Path: + return TESTS_ROOT + + +@pytest.fixture(scope="session") +def fixtures_root(tests_root: Path) -> Path: + return tests_root / "fixtures" + + +@pytest.fixture(scope="session") +def count_fixtures_root(fixtures_root: Path) -> Path: + return fixtures_root / "count" + + +@pytest.fixture +def minimal_count_input(count_fixtures_root: Path) -> Path: + return count_fixtures_root / "minimal_test_input.tsv" + + +@pytest.fixture +def cli_runner() -> CliRunner: + return CliRunner() diff --git a/tests/count/test_mpranalyze_compiler.py b/tests/count/test_mpranalyze_compiler.py new file mode 100644 index 0000000..082d0ae --- /dev/null +++ b/tests/count/test_mpranalyze_compiler.py @@ -0,0 +1,88 @@ +import pandas as pd + +from workflow.scripts.count import mpranalyze_compiler as compiler + + +class TestMpranalyzeCompiler: + def test_get_annot_parses_dna_and_rna_headers(self): + assert compiler.get_annot("DNA(condition X, replicate 1)") == ("DNA", "X", "1") + assert compiler.get_annot("RNA(condition Y, replicate 2)") == ("RNA", "Y", "2") + + def test_get_annot_returns_none_for_unmatched_headers(self): + assert compiler.get_annot("Sequence") == (None, None, None) + assert compiler.get_annot("label") == (None, None, None) + + def test_generate_annotation_output_repeats_and_numbers_barcodes(self): + input_frame = pd.DataFrame( + [ + {"type": "DNA", "condition": "X", "replicate": "1"}, + {"type": "RNA", "condition": "X", "replicate": "1"}, + ] + ) + + output = compiler.generate_annotation_output(input_frame, number_barcodes=2) + + assert list(output["sample"]) == ["DNA_X_1_1", "DNA_X_1_2", "RNA_X_1_1", "RNA_X_1_2"] + assert list(output["barcode"]) == ["1", "2", "1", "2"] + + def test_generate_count_output_pads_and_flattens_by_barcode(self): + input_frame = pd.DataFrame( + [ + {"label": "oligoA", "bc1": 10, "bc2": 20}, + {"label": "oligoA", "bc1": 11, "bc2": 21}, + {"label": "oligoB", "bc1": 12, "bc2": 22}, + ] + ).set_index("label") + + output = compiler.generate_count_output(input_frame, ["sample_1", "sample_2", "sample_3", "sample_4"], number_barcodes=2) + + assert list(output["seq_id"]) == ["oligoA", "oligoB"] + assert list(output.loc[output["seq_id"] == "oligoA", ["sample_1", "sample_2", "sample_3", "sample_4"]].iloc[0]) == [10, 11, 20, 21] + assert list(output.loc[output["seq_id"] == "oligoB", ["sample_1", "sample_2", "sample_3", "sample_4"]].iloc[0]) == [12, 0, 22, 0] + + def test_cli_generates_expected_count_tables(self, cli_runner, minimal_count_input, tmp_path): + rna_counts_output = tmp_path / "rna_counts.tsv.gz" + dna_counts_output = tmp_path / "dna_counts.tsv.gz" + rna_annotation_output = tmp_path / "rna_annot.tsv.gz" + dna_annotation_output = tmp_path / "dna_annot.tsv.gz" + + result = cli_runner.invoke( + compiler.cli, + [ + "--input", + str(minimal_count_input), + "--rna-counts-output", + str(rna_counts_output), + "--dna-counts-output", + str(dna_counts_output), + "--rna-annotation-output", + str(rna_annotation_output), + "--dna-annotation-output", + str(dna_annotation_output), + ], + ) + + assert result.exit_code == 0, result.output + + rna: pd.DataFrame = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id") + dna: pd.DataFrame = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id") + cols= ["RNA_X_1_1", "RNA_X_1_2", "RNA_X_2_1", "RNA_X_2_2", "RNA_X_3_1", "RNA_X_3_2"] + row = rna.loc["oligoA"] + assert list(row.loc[cols]) == [ + 100, + 101, + 200, + 201, + 300, + 301, + ] + cols= ["DNA_X_1_1", "DNA_X_1_2", "DNA_X_2_1", "DNA_X_2_2", "DNA_X_3_1", "DNA_X_3_2"] + row = dna.loc["oligoA"] + assert list(row.loc[cols]) == [ + 10, + 11, + 20, + 21, + 30, + 31, + ] diff --git a/test_compiler/minimal_test_input.tsv b/tests/fixtures/count/minimal_test_input.tsv similarity index 100% rename from test_compiler/minimal_test_input.tsv rename to tests/fixtures/count/minimal_test_input.tsv diff --git a/workflow/scripts/count/mpranalyze_compiler.py b/workflow/scripts/count/mpranalyze_compiler.py index 2a77622..4805bb2 100644 --- a/workflow/scripts/count/mpranalyze_compiler.py +++ b/workflow/scripts/count/mpranalyze_compiler.py @@ -6,6 +6,44 @@ import numpy as np import pandas as pd +ANNOT_PATTERN = re.compile(r"^([DR]NA).*\(condition (.*), replicate (.*)\)$") + + +def get_annot(head: str) -> tuple[str|None, str|None, str|None]: + match = ANNOT_PATTERN.match(head) + if match is not None: + group1 = match.group(1) + group2 = match.group(2) + group3 = match.group(3) + return (group1, group2, group3) + return (None, None, None) + + +def generate_annotation_output(data, number_barcodes): + data = data.loc[data.index.repeat(number_barcodes)].copy() + data['barcode'] = data.groupby(['type', 'condition', 'replicate']).cumcount() + 1 + data['barcode'] = data['barcode'].astype(str) + data['sample'] = data[['type', 'condition', 'replicate', 'barcode']].agg('_'.join, axis=1) + return data[["sample", "type", "condition", "replicate", "barcode"]] + + +def generate_count_output(data, columns, number_barcodes): + rows = [] + seq_ids = [] + for label, group in data.groupby('label', sort=False): + padded = np.zeros((number_barcodes, data.shape[1]), dtype=np.int64) + vals = group.values[:number_barcodes].astype(np.int64) + padded[:len(vals)] = vals + rows.append(padded.flatten(order='F')) + seq_ids.append(label) + counts = pd.DataFrame(rows, columns=columns) + counts.insert(0, 'seq_id', seq_ids) + return counts + + +def write_table(data, file): + data.to_csv(file, index=False, sep='\t', compression='gzip') + # options @click.command() @@ -39,12 +77,6 @@ def cli(input_file, rna_counts_output_file, dna_counts_output_file, rna_annotation_output_file, dna_annotation_output_file): - annot_pattern = re.compile(r"^([DR]NA).*\(condition (.*), replicate (.*)\)$") - def get_annot(head): - m = annot_pattern.match(head) - if m is not None: - return m.group(1,2,3) - # read input df = pd.read_csv(input_file,sep="\t", header='infer') @@ -61,44 +93,26 @@ def get_annot(head): # counts for observation - dna_df = df.iloc[:,2:(2+n_dna_obs)].applymap(np.int64) - rna_df = df.iloc[:,(2+n_dna_obs):].applymap(np.int64) + dna_df = df.iloc[:,2:(2+n_dna_obs)].astype(np.int16) + rna_df = df.iloc[:,(2+n_dna_obs):].astype(np.int16) ## generate output DNA/RNA annotations (type_condition_replicate_barcode) n_bc = df.groupby('label').Barcode.agg(len).max() - def generateAnnotationOutput(data, number_barcodes): - data = data.loc[data.index.repeat(number_barcodes)] - data['barcode'] = data.groupby(['type','condition','replicate']).cumcount() +1 - data['barcode'] = data['barcode'].astype(str) - data['sample'] = data[['type','condition','replicate','barcode']].agg('_'.join,axis=1) - data = data[["sample", "type", "condition", "replicate", "barcode"]] - return(data) - dna_annot = generateAnnotationOutput(dna_annot, n_bc) - rna_annot = generateAnnotationOutput(rna_annot, n_bc) + dna_annot = generate_annotation_output(dna_annot, n_bc) + rna_annot = generate_annotation_output(rna_annot, n_bc) ## generate output DNA/RNA count tables ## rows oligo/seq ids,/assignment then per barcode the counts. padding with zeros - def generateCountOutput(data,columns): - counts = pd.DataFrame(list(data.groupby('label').apply(lambda x: x.values.flatten(order='F')))).fillna(0).astype(np.int64) - counts.columns = columns - counts['seq_id'] = data.index.unique() - counts = counts[(['seq_id'] + list(columns))] - return(counts) - - dna_counts = generateCountOutput(dna_df,dna_annot['sample']) - rna_counts = generateCountOutput(rna_df,rna_annot['sample']) - - ## write table function - def write(data,file): - data.to_csv(file, index=False,sep='\t', compression='gzip') + dna_counts = generate_count_output(dna_df, dna_annot['sample'], n_bc) + rna_counts = generate_count_output(rna_df, rna_annot['sample'], n_bc) ## write output DNA/RNA annotations - write(dna_annot,dna_annotation_output_file) - write(rna_annot,rna_annotation_output_file) + write_table(dna_annot, dna_annotation_output_file) + write_table(rna_annot, rna_annotation_output_file) ## write output DNA/RNA annotations - write(dna_counts,dna_counts_output_file) - write(rna_counts,rna_counts_output_file) + write_table(dna_counts, dna_counts_output_file) + write_table(rna_counts, rna_counts_output_file) if __name__ == '__main__': cli() From 2d86b82b8c92d663c0bf4ab76bb5fcf40aa8f2b8 Mon Sep 17 00:00:00 2001 From: Mackenzie Noon Date: Mon, 15 Jun 2026 16:14:10 -0400 Subject: [PATCH 7/9] quick additional test, making sure ragged ends dont goof --- tests/conftest.py | 5 ++ tests/count/test_mpranalyze_compiler.py | 52 +++++++++++++++++++ tests/fixtures/count/ragged_missing_input.tsv | 5 ++ 3 files changed, 62 insertions(+) create mode 100644 tests/fixtures/count/ragged_missing_input.tsv diff --git a/tests/conftest.py b/tests/conftest.py index 8da7f62..0496f1d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,6 +32,11 @@ def minimal_count_input(count_fixtures_root: Path) -> Path: return count_fixtures_root / "minimal_test_input.tsv" +@pytest.fixture +def ragged_missing_input(count_fixtures_root: Path) -> Path: + return count_fixtures_root / "ragged_missing_input.tsv" + + @pytest.fixture def cli_runner() -> CliRunner: return CliRunner() diff --git a/tests/count/test_mpranalyze_compiler.py b/tests/count/test_mpranalyze_compiler.py index 082d0ae..c8f0f8d 100644 --- a/tests/count/test_mpranalyze_compiler.py +++ b/tests/count/test_mpranalyze_compiler.py @@ -86,3 +86,55 @@ def test_cli_generates_expected_count_tables(self, cli_runner, minimal_count_inp 30, 31, ] + + def test_cli_missing_count_does_not_shift_replicates(self, cli_runner, ragged_missing_input, tmp_path): + # Regression guard for the MPRAflow-style "ragged within an oligo" bug + # (shendurelab/MPRAflow#87): a missing trailing count (empty cell / trailing + # tab) must stay a zero in its own replicate/barcode slot and must NOT shift + # later counts into the wrong replicate. The fixture's oligoA is missing its + # RNA replicate-3 / barcode-2 value, and oligoB has a single barcode (so it + # also exercises the across-oligo padding case). + rna_counts_output = tmp_path / "rna_counts.tsv.gz" + dna_counts_output = tmp_path / "dna_counts.tsv.gz" + rna_annotation_output = tmp_path / "rna_annot.tsv.gz" + dna_annotation_output = tmp_path / "dna_annot.tsv.gz" + + result = cli_runner.invoke( + compiler.cli, + [ + "--input", + str(ragged_missing_input), + "--rna-counts-output", + str(rna_counts_output), + "--dna-counts-output", + str(dna_counts_output), + "--rna-annotation-output", + str(rna_annotation_output), + "--dna-annotation-output", + str(dna_annotation_output), + ], + ) + + assert result.exit_code == 0, result.output + + rna: pd.DataFrame = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id") + dna: pd.DataFrame = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id") + + rna_cols = [ + "RNA_X_1_1", "RNA_X_1_2", "RNA_X_1_3", + "RNA_X_2_1", "RNA_X_2_2", "RNA_X_2_3", + "RNA_X_3_1", "RNA_X_3_2", "RNA_X_3_3", + ] + # oligoA: the hole is at replicate 3 / barcode 2 -> must be 0, and the real + # barcode-3 value (302) must stay in barcode 3, not shift up to barcode 2. + assert list(rna.loc["oligoA", rna_cols]) == [100, 101, 102, 200, 201, 202, 300, 0, 302] + # oligoB has one barcode; remaining barcode slots pad with zeros per replicate. + assert list(rna.loc["oligoB", rna_cols]) == [103, 0, 0, 203, 0, 0, 303, 0, 0] + + # DNA has no missing values; confirm nothing shifted there either. + dna_cols = [ + "DNA_X_1_1", "DNA_X_1_2", "DNA_X_1_3", + "DNA_X_2_1", "DNA_X_2_2", "DNA_X_2_3", + "DNA_X_3_1", "DNA_X_3_2", "DNA_X_3_3", + ] + assert list(dna.loc["oligoA", dna_cols]) == [10, 11, 12, 20, 21, 22, 30, 31, 32] diff --git a/tests/fixtures/count/ragged_missing_input.tsv b/tests/fixtures/count/ragged_missing_input.tsv new file mode 100644 index 0000000..bc6ee66 --- /dev/null +++ b/tests/fixtures/count/ragged_missing_input.tsv @@ -0,0 +1,5 @@ +label Sequence Barcode DNA(condition X, replicate 1) DNA(condition X, replicate 2) DNA(condition X, replicate 3) RNA(condition X, replicate 1) RNA(condition X, replicate 2) RNA(condition X, replicate 3) +oligoA A BC0 10 20 30 100 200 300 +oligoA A BC1 11 21 31 101 201 +oligoA A BC2 12 22 32 102 202 302 +oligoB B BC3 13 23 33 103 203 303 From 580f840e5abbdf1ef7236176c538b5a66c20001d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jun 2026 20:14:39 +0000 Subject: [PATCH 8/9] ci: auto fixes from pre-commit hooks for more information, see https://pre-commit.ci --- tests/fixtures/count/ragged_missing_input.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures/count/ragged_missing_input.tsv b/tests/fixtures/count/ragged_missing_input.tsv index bc6ee66..cd53b62 100644 --- a/tests/fixtures/count/ragged_missing_input.tsv +++ b/tests/fixtures/count/ragged_missing_input.tsv @@ -1,5 +1,5 @@ label Sequence Barcode DNA(condition X, replicate 1) DNA(condition X, replicate 2) DNA(condition X, replicate 3) RNA(condition X, replicate 1) RNA(condition X, replicate 2) RNA(condition X, replicate 3) oligoA A BC0 10 20 30 100 200 300 -oligoA A BC1 11 21 31 101 201 +oligoA A BC1 11 21 31 101 201 oligoA A BC2 12 22 32 102 202 302 oligoB B BC3 13 23 33 103 203 303 From 708bd4a63d7c47031fd40879154ef77c9b46fbde Mon Sep 17 00:00:00 2001 From: Mackenzie Noon Date: Mon, 15 Jun 2026 16:15:11 -0400 Subject: [PATCH 9/9] +switched type to prevent overflow --- workflow/scripts/count/mpranalyze_compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/count/mpranalyze_compiler.py b/workflow/scripts/count/mpranalyze_compiler.py index 4805bb2..e758108 100644 --- a/workflow/scripts/count/mpranalyze_compiler.py +++ b/workflow/scripts/count/mpranalyze_compiler.py @@ -93,8 +93,8 @@ def cli(input_file, rna_counts_output_file, dna_counts_output_file, rna_annotati # counts for observation - dna_df = df.iloc[:,2:(2+n_dna_obs)].astype(np.int16) - rna_df = df.iloc[:,(2+n_dna_obs):].astype(np.int16) + dna_df = df.iloc[:,2:(2+n_dna_obs)].astype(np.int64) + rna_df = df.iloc[:,(2+n_dna_obs):].astype(np.int64) ## generate output DNA/RNA annotations (type_condition_replicate_barcode) n_bc = df.groupby('label').Barcode.agg(len).max()