From ef2f9dacc43023d1562db9aeda6dd522bc441f55 Mon Sep 17 00:00:00 2001
From: Mackenzie Noon <mackenziecnoon@gmail.com>
Date: Wed, 3 Jun 2026 16:15:49 -0400
Subject: [PATCH 1/9] example of compiler bug

---
 test_compiler/minimal_test_input.tsv           |   5 +++++
 test_compiler/output_pre_fix/dna_annot.tsv.gz  | Bin 0 -> 124 bytes
 test_compiler/output_pre_fix/dna_counts.tsv.gz | Bin 0 -> 111 bytes
 test_compiler/output_pre_fix/rna_annot.tsv.gz  | Bin 0 -> 125 bytes
 test_compiler/output_pre_fix/rna_counts.tsv.gz | Bin 0 -> 116 bytes
 test_compiler/test.py                          |  17 +++++++++++++++++
 6 files changed, 22 insertions(+)
 create mode 100644 test_compiler/minimal_test_input.tsv
 create mode 100644 test_compiler/output_pre_fix/dna_annot.tsv.gz
 create mode 100644 test_compiler/output_pre_fix/dna_counts.tsv.gz
 create mode 100644 test_compiler/output_pre_fix/rna_annot.tsv.gz
 create mode 100644 test_compiler/output_pre_fix/rna_counts.tsv.gz
 create mode 100644 test_compiler/test.py

diff --git a/test_compiler/minimal_test_input.tsv b/test_compiler/minimal_test_input.tsv
new file mode 100644
index 0000000..1a330ce
--- /dev/null
+++ b/test_compiler/minimal_test_input.tsv
@@ -0,0 +1,5 @@
+label	Sequence	Barcode	DNA(condition X, replicate 1)	DNA(condition X, replicate 2)	DNA(condition X, replicate 3)	RNA(condition X, replicate 1)	RNA(condition X, replicate 2)	RNA(condition X, replicate 3)
+oligoA	A	BC0	10	20	30	100	200	300
+oligoA	A	BC1	11	21	31	101	201	301
+oligoB	B	BC2	12	22	32	102	202	302
+oligoB	B	BC3	13	23	33	103	203	303
diff --git a/test_compiler/output_pre_fix/dna_annot.tsv.gz b/test_compiler/output_pre_fix/dna_annot.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..46098423c75caee1800046a9283b176a8a36fd56
GIT binary patch
literal 124
zcmV-?0E7P@iwFopg&=AI|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h
z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~<yXV0{X95FGT;K~04
ePT&Mi;1o{b6i%1>00030{{sNd(SSvu0001)%QbcY

literal 0
HcmV?d00001

diff --git a/test_compiler/output_pre_fix/dna_counts.tsv.gz b/test_compiler/output_pre_fix/dna_counts.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..2d54e55f464ae1e9340f8f59763086c2c808198a
GIT binary patch
literal 111
zcmV-#0FeJ5iwFopg&=AI|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30
zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*u=3RjR2VSZx)1ql{t$gs-`
R009600|0U!H`8?h008xgEQ$aC

literal 0
HcmV?d00001

diff --git a/test_compiler/output_pre_fix/rna_annot.tsv.gz b/test_compiler/output_pre_fix/rna_annot.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..c5d09e08c5856ddcc0f29d7f7271ce8fb47f3807
GIT binary patch
literal 125
zcmV-@0D}J?iwFopg&=AI|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3
zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&W<QS&W6-I2yw)5D%^X!>cks~I?6Fm8!
fzzLkd37o<yoWkjH9{>OV|NjF3=9s1tpa1{>WJ){#

literal 0
HcmV?d00001

diff --git a/test_compiler/output_pre_fix/rna_counts.tsv.gz b/test_compiler/output_pre_fix/rna_counts.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..8314690a10c4db371ba55b23ed6e73bcac607d6a
GIT binary patch
literal 116
zcmV-)0E_=0iwFopg&=AI|8j0&Ut@1|Zgg`lbaQq9JjpQ%fG`k5(K<QJ&g==|0W7TS
zwoxPy2;%+00cDGU$DilCqaS5kTkL^AiockR**u#E>m2=>t-!2Ma}5R=GQN9Jq7s!T
WyX>;p7XSeN{{sL404bP(0000i<S_mK

literal 0
HcmV?d00001

diff --git a/test_compiler/test.py b/test_compiler/test.py
new file mode 100644
index 0000000..0f2aa85
--- /dev/null
+++ b/test_compiler/test.py
@@ -0,0 +1,17 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "workflow", "scripts", "count"))
+
+def main():
+    import mpranalyze_compiler as c
+    here = os.path.dirname(__file__)
+    c.cli.callback(
+        input_file=os.path.join(here, "minimal_test_input.tsv"),
+        rna_counts_output_file=os.path.join(here, "rna_counts.tsv.gz"),
+        dna_counts_output_file=os.path.join(here, "dna_counts.tsv.gz"),
+        rna_annotation_output_file=os.path.join(here, "rna_annot.tsv.gz"),
+        dna_annotation_output_file=os.path.join(here, "dna_annot.tsv.gz"),
+    )
+
+if __name__=="__main__":
+    main()

From f06133526799ab00d3e6ef053874ffee0d60595a Mon Sep 17 00:00:00 2001
From: Mackenzie Noon <mackenziecnoon@gmail.com>
Date: Wed, 3 Jun 2026 16:47:40 -0400
Subject: [PATCH 2/9] +fix for compiler replicate confounding +example of fix

---
 test_compiler/output_post_fix/dna_annot.tsv.gz  | Bin 0 -> 124 bytes
 test_compiler/output_post_fix/dna_counts.tsv.gz | Bin 0 -> 111 bytes
 test_compiler/output_post_fix/rna_annot.tsv.gz  | Bin 0 -> 125 bytes
 test_compiler/output_post_fix/rna_counts.tsv.gz | Bin 0 -> 116 bytes
 workflow/scripts/count/mpranalyze_compiler.py   |   2 +-
 5 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 test_compiler/output_post_fix/dna_annot.tsv.gz
 create mode 100644 test_compiler/output_post_fix/dna_counts.tsv.gz
 create mode 100644 test_compiler/output_post_fix/rna_annot.tsv.gz
 create mode 100644 test_compiler/output_post_fix/rna_counts.tsv.gz

diff --git a/test_compiler/output_post_fix/dna_annot.tsv.gz b/test_compiler/output_post_fix/dna_annot.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..76f339181a247eefb2e0150634deca01a2c9b99e
GIT binary patch
literal 124
zcmV-?0E7P@iwFn;k|1gV|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h
z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~<yXV0{X95FGT;K~04
ePT&Mi;1o{b6i%1>00030{{sNd(SSvu0001i)-^)_

literal 0
HcmV?d00001

diff --git a/test_compiler/output_post_fix/dna_counts.tsv.gz b/test_compiler/output_post_fix/dna_counts.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..a968e50f99698e16a783ff48540fb148ffe2ab3b
GIT binary patch
literal 111
zcmV-#0FeJ5iwFn;k|1gV|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30
zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*t#LKPxhVSZx)1sW1$Sg^|r
R009600|0+Zm9TXH007IbE7<@5

literal 0
HcmV?d00001

diff --git a/test_compiler/output_post_fix/rna_annot.tsv.gz b/test_compiler/output_post_fix/rna_annot.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..286632b075cd8547a375031622ec8621603468aa
GIT binary patch
literal 125
zcmV-@0D}J?iwFn;k|1gV|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3
zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&W<QS&W6-I2yw)5D%^X!>cks~I?6Fm8!
fzzLkd37o<yoWkjH9{>OV|NjF3=9s1tpa1{>Od>nX

literal 0
HcmV?d00001

diff --git a/test_compiler/output_post_fix/rna_counts.tsv.gz b/test_compiler/output_post_fix/rna_counts.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..295b79e7a9bb7140caabff229183dfd30cbaa109
GIT binary patch
literal 116
zcmV-)0E_=0iwFn;k|1gV|8j0&Ut@1|Zgg`lbaQq9Jc%(1fG`jQ>*O_eyDx|ju&}bb
zMv*`ui2nx{lq-f^W}fqoew1x(v4@A}!~c*D>6ngzI!C`|D{dydKnpFkH2?CV1VJST
WDnYQm00030{{sL?T8v_W0002LcP}OY

literal 0
HcmV?d00001

diff --git a/workflow/scripts/count/mpranalyze_compiler.py b/workflow/scripts/count/mpranalyze_compiler.py
index b559c1f..2a77622 100644
--- a/workflow/scripts/count/mpranalyze_compiler.py
+++ b/workflow/scripts/count/mpranalyze_compiler.py
@@ -79,7 +79,7 @@ def generateAnnotationOutput(data, number_barcodes):
     ## generate output DNA/RNA count tables
     ## rows oligo/seq ids,/assignment then per barcode the counts. padding with zeros
     def generateCountOutput(data,columns):
-        counts = pd.DataFrame(list(data.groupby('label').apply(lambda x: x.values.flatten()))).fillna(0).astype(np.int64)
+        counts = pd.DataFrame(list(data.groupby('label').apply(lambda x: x.values.flatten(order='F')))).fillna(0).astype(np.int64)
         counts.columns = columns
         counts['seq_id'] = data.index.unique()
         counts = counts[(['seq_id'] + list(columns))]

From a1115ec4b3d197841db32d851e1346ec1badb55f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 3 Jun 2026 21:00:07 +0000
Subject: [PATCH 3/9] ci: auto fixes from pre-commit hooks

for more information, see https://pre-commit.ci
---
 test_compiler/test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test_compiler/test.py b/test_compiler/test.py
index 0f2aa85..eb594fb 100644
--- a/test_compiler/test.py
+++ b/test_compiler/test.py
@@ -1,5 +1,6 @@
-import sys
 import os
+import sys
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "workflow", "scripts", "count"))
 
 def main():

From 6b7ffcae4377e4a77e30ca5c8ae0e2dc5209b0e7 Mon Sep 17 00:00:00 2001
From: Mackenzie Noon <me@mackenzienoon.com>
Date: Fri, 5 Jun 2026 19:24:26 -0400
Subject: [PATCH 4/9] Potential fix for pull request finding

This improves the tests substantively, as described. Not actually required for the fix, but a nice addition.

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 test_compiler/test.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/test_compiler/test.py b/test_compiler/test.py
index eb594fb..b13b660 100644
--- a/test_compiler/test.py
+++ b/test_compiler/test.py
@@ -4,15 +4,32 @@
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "workflow", "scripts", "count"))
 
 def main():
+    import tempfile
+
+    import pandas as pd
+
     import mpranalyze_compiler as c
+
     here = os.path.dirname(__file__)
-    c.cli.callback(
-        input_file=os.path.join(here, "minimal_test_input.tsv"),
-        rna_counts_output_file=os.path.join(here, "rna_counts.tsv.gz"),
-        dna_counts_output_file=os.path.join(here, "dna_counts.tsv.gz"),
-        rna_annotation_output_file=os.path.join(here, "rna_annot.tsv.gz"),
-        dna_annotation_output_file=os.path.join(here, "dna_annot.tsv.gz"),
-    )
+    with tempfile.TemporaryDirectory() as tmp:
+        rna_counts_output = os.path.join(tmp, "rna_counts.tsv.gz")
+        dna_counts_output = os.path.join(tmp, "dna_counts.tsv.gz")
+        rna_annot_output = os.path.join(tmp, "rna_annot.tsv.gz")
+        dna_annot_output = os.path.join(tmp, "dna_annot.tsv.gz")
+
+        c.cli.callback(
+            input_file=os.path.join(here, "minimal_test_input.tsv"),
+            rna_counts_output_file=rna_counts_output,
+            dna_counts_output_file=dna_counts_output,
+            rna_annotation_output_file=rna_annot_output,
+            dna_annotation_output_file=dna_annot_output,
+        )
+
+        rna = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id")
+        dna = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id")
+
+        assert list(rna.loc["oligoA", ["RNA_X_1_1", "RNA_X_1_2", "RNA_X_2_1", "RNA_X_2_2", "RNA_X_3_1", "RNA_X_3_2"]]) == [100, 101, 200, 201, 300, 301]
+        assert list(dna.loc["oligoA", ["DNA_X_1_1", "DNA_X_1_2", "DNA_X_2_1", "DNA_X_2_2", "DNA_X_3_1", "DNA_X_3_2"]]) == [10, 11, 20, 21, 30, 31]
 
 if __name__=="__main__":
     main()

From 2a93422c6d2d2476c871312fa596f7e20f8e0c42 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:25:28 +0000
Subject: [PATCH 5/9] ci: auto fixes from pre-commit hooks

for more information, see https://pre-commit.ci
---
 test_compiler/test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test_compiler/test.py b/test_compiler/test.py
index b13b660..d411935 100644
--- a/test_compiler/test.py
+++ b/test_compiler/test.py
@@ -6,9 +6,8 @@
 def main():
     import tempfile
 
-    import pandas as pd
-
     import mpranalyze_compiler as c
+    import pandas as pd
 
     here = os.path.dirname(__file__)
     with tempfile.TemporaryDirectory() as tmp:

From 68f49a36b06e5f6ddb40b49fed8e9330204b7d0f Mon Sep 17 00:00:00 2001
From: Max Schubach <max.schubach@bih-charite.de>
Date: Mon, 8 Jun 2026 13:29:05 +0200
Subject: [PATCH 6/9] tests: using general pytests for python scripts

---
 .github/workflows/main.yml                    |  17 ++++
 pyproject.toml                                |   5 +
 requirements-test.txt                         |   4 +
 .../output_post_fix/dna_annot.tsv.gz          | Bin 124 -> 0 bytes
 .../output_post_fix/dna_counts.tsv.gz         | Bin 111 -> 0 bytes
 .../output_post_fix/rna_annot.tsv.gz          | Bin 125 -> 0 bytes
 .../output_post_fix/rna_counts.tsv.gz         | Bin 116 -> 0 bytes
 test_compiler/output_pre_fix/dna_annot.tsv.gz | Bin 124 -> 0 bytes
 .../output_pre_fix/dna_counts.tsv.gz          | Bin 111 -> 0 bytes
 test_compiler/output_pre_fix/rna_annot.tsv.gz | Bin 125 -> 0 bytes
 .../output_pre_fix/rna_counts.tsv.gz          | Bin 116 -> 0 bytes
 test_compiler/test.py                         |  34 -------
 tests/README.md                               |  20 ++++
 tests/conftest.py                             |  37 ++++++++
 tests/count/test_mpranalyze_compiler.py       |  88 ++++++++++++++++++
 .../fixtures/count}/minimal_test_input.tsv    |   0
 workflow/scripts/count/mpranalyze_compiler.py |  82 +++++++++-------
 17 files changed, 219 insertions(+), 68 deletions(-)
 create mode 100644 requirements-test.txt
 delete mode 100644 test_compiler/output_post_fix/dna_annot.tsv.gz
 delete mode 100644 test_compiler/output_post_fix/dna_counts.tsv.gz
 delete mode 100644 test_compiler/output_post_fix/rna_annot.tsv.gz
 delete mode 100644 test_compiler/output_post_fix/rna_counts.tsv.gz
 delete mode 100644 test_compiler/output_pre_fix/dna_annot.tsv.gz
 delete mode 100644 test_compiler/output_pre_fix/dna_counts.tsv.gz
 delete mode 100644 test_compiler/output_pre_fix/rna_annot.tsv.gz
 delete mode 100644 test_compiler/output_pre_fix/rna_counts.tsv.gz
 delete mode 100644 test_compiler/test.py
 create mode 100644 tests/README.md
 create mode 100644 tests/conftest.py
 create mode 100644 tests/count/test_mpranalyze_compiler.py
 rename {test_compiler => tests/fixtures/count}/minimal_test_input.tsv (100%)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 39c6e37..10679ab 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -54,6 +54,23 @@ jobs:
           directory: .
           snakefile: workflow/Snakefile
           args: "--lint --configfile config/example_config.yaml --config skip_version_check=True"
+
+  Pytest:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.10', '3.11', '3.12', '3.13', '3.14']
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install test dependencies
+        run: python -m pip install --upgrade pip -r requirements-test.txt
+      - name: Run pytest suite
+        run: python -m pytest -q tests
 # Testing:
 #   runs-on: ubuntu-latest
 #   needs:
diff --git a/pyproject.toml b/pyproject.toml
index 52c96bc..1050e93 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,7 @@
 [tool.snakefmt]
 line_length = 127
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+addopts = "-ra"
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..42b555e
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,4 @@
+pytest
+click
+numpy
+pandas
diff --git a/test_compiler/output_post_fix/dna_annot.tsv.gz b/test_compiler/output_post_fix/dna_annot.tsv.gz
deleted file mode 100644
index 76f339181a247eefb2e0150634deca01a2c9b99e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 124
zcmV-?0E7P@iwFn;k|1gV|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h
z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~<yXV0{X95FGT;K~04
ePT&Mi;1o{b6i%1>00030{{sNd(SSvu0001i)-^)_

diff --git a/test_compiler/output_post_fix/dna_counts.tsv.gz b/test_compiler/output_post_fix/dna_counts.tsv.gz
deleted file mode 100644
index a968e50f99698e16a783ff48540fb148ffe2ab3b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 111
zcmV-#0FeJ5iwFn;k|1gV|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30
zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*t#LKPxhVSZx)1sW1$Sg^|r
R009600|0+Zm9TXH007IbE7<@5

diff --git a/test_compiler/output_post_fix/rna_annot.tsv.gz b/test_compiler/output_post_fix/rna_annot.tsv.gz
deleted file mode 100644
index 286632b075cd8547a375031622ec8621603468aa..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 125
zcmV-@0D}J?iwFn;k|1gV|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3
zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&W<QS&W6-I2yw)5D%^X!>cks~I?6Fm8!
fzzLkd37o<yoWkjH9{>OV|NjF3=9s1tpa1{>Od>nX

diff --git a/test_compiler/output_post_fix/rna_counts.tsv.gz b/test_compiler/output_post_fix/rna_counts.tsv.gz
deleted file mode 100644
index 295b79e7a9bb7140caabff229183dfd30cbaa109..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 116
zcmV-)0E_=0iwFn;k|1gV|8j0&Ut@1|Zgg`lbaQq9Jc%(1fG`jQ>*O_eyDx|ju&}bb
zMv*`ui2nx{lq-f^W}fqoew1x(v4@A}!~c*D>6ngzI!C`|D{dydKnpFkH2?CV1VJST
WDnYQm00030{{sL?T8v_W0002LcP}OY

diff --git a/test_compiler/output_pre_fix/dna_annot.tsv.gz b/test_compiler/output_pre_fix/dna_annot.tsv.gz
deleted file mode 100644
index 46098423c75caee1800046a9283b176a8a36fd56..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 124
zcmV-?0E7P@iwFopg&=AI|731qUtw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj2~3=w*$6@h
z5(I)Q-d+atZ!a(36YgG&=G%>8t0~XiDh=x;$1shqFk(x#9p~<yXV0{X95FGT;K~04
ePT&Mi;1o{b6i%1>00030{{sNd(SSvu0001)%QbcY

diff --git a/test_compiler/output_pre_fix/dna_counts.tsv.gz b/test_compiler/output_pre_fix/dna_counts.tsv.gz
deleted file mode 100644
index 2d54e55f464ae1e9340f8f59763086c2c808198a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 111
zcmV-#0FeJ5iwFopg&=AI|731qUt@1|Zgg`lbaQq9JjpQ%fG`k5(Ry;4%*+WQ*4B30
zC=v(+@%~_;Z1;KpIq&EP)~&@J3})#sr76v64wX6jH5*u=3RjR2VSZx)1ql{t$gs-`
R009600|0U!H`8?h008xgEQ$aC

diff --git a/test_compiler/output_pre_fix/rna_annot.tsv.gz b/test_compiler/output_pre_fix/rna_annot.tsv.gz
deleted file mode 100644
index c5d09e08c5856ddcc0f29d7f7271ce8fb47f3807..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 125
zcmV-@0D}J?iwFopg&=AI|8j0&Utw-;Z*(qnb9Mkc$}tXrAPh#~x!$Jj3A})V6B|M3
zK!QM!#oNnZ{_W-Ed&1p|(R{m6Y&GSXTcu&W<QS&W6-I2yw)5D%^X!>cks~I?6Fm8!
fzzLkd37o<yoWkjH9{>OV|NjF3=9s1tpa1{>WJ){#

diff --git a/test_compiler/output_pre_fix/rna_counts.tsv.gz b/test_compiler/output_pre_fix/rna_counts.tsv.gz
deleted file mode 100644
index 8314690a10c4db371ba55b23ed6e73bcac607d6a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 116
zcmV-)0E_=0iwFopg&=AI|8j0&Ut@1|Zgg`lbaQq9JjpQ%fG`k5(K<QJ&g==|0W7TS
zwoxPy2;%+00cDGU$DilCqaS5kTkL^AiockR**u#E>m2=>t-!2Ma}5R=GQN9Jq7s!T
WyX>;p7XSeN{{sL404bP(0000i<S_mK

diff --git a/test_compiler/test.py b/test_compiler/test.py
deleted file mode 100644
index d411935..0000000
--- a/test_compiler/test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-import sys
-
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "workflow", "scripts", "count"))
-
-def main():
-    import tempfile
-
-    import mpranalyze_compiler as c
-    import pandas as pd
-
-    here = os.path.dirname(__file__)
-    with tempfile.TemporaryDirectory() as tmp:
-        rna_counts_output = os.path.join(tmp, "rna_counts.tsv.gz")
-        dna_counts_output = os.path.join(tmp, "dna_counts.tsv.gz")
-        rna_annot_output = os.path.join(tmp, "rna_annot.tsv.gz")
-        dna_annot_output = os.path.join(tmp, "dna_annot.tsv.gz")
-
-        c.cli.callback(
-            input_file=os.path.join(here, "minimal_test_input.tsv"),
-            rna_counts_output_file=rna_counts_output,
-            dna_counts_output_file=dna_counts_output,
-            rna_annotation_output_file=rna_annot_output,
-            dna_annotation_output_file=dna_annot_output,
-        )
-
-        rna = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id")
-        dna = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id")
-
-        assert list(rna.loc["oligoA", ["RNA_X_1_1", "RNA_X_1_2", "RNA_X_2_1", "RNA_X_2_2", "RNA_X_3_1", "RNA_X_3_2"]]) == [100, 101, 200, 201, 300, 301]
-        assert list(dna.loc["oligoA", ["DNA_X_1_1", "DNA_X_1_2", "DNA_X_2_1", "DNA_X_2_2", "DNA_X_3_1", "DNA_X_3_2"]]) == [10, 11, 20, 21, 30, 31]
-
-if __name__=="__main__":
-    main()
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..161fdb3
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,20 @@
+# Tests
+
+This directory contains the pytest suite for workflow scripts and supporting fixtures.
+
+## Layout
+
+- `tests/conftest.py` keeps shared pytest fixtures and makes the repo root importable.
+- `tests/<area>/test_*.py` holds the actual tests, grouped by script area such as `count`.
+- `tests/fixtures/<area>/` stores reusable input data for those tests.
+
+## Adding a new script test
+
+1. Put the new test in the matching area folder, for example `tests/count/test_new_script.py`.
+2. Add any reusable inputs under `tests/fixtures/<area>/`.
+3. Prefer `click.testing.CliRunner` for Click commands and pytest fixtures like `tmp_path` for temporary outputs.
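+4. Run a focused check with `python -m pytest -q tests/<area>/test_new_script.py` after installing `requirements-test.txt` (prefix the command with `conda run -n <env>` when working inside a conda environment).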
+
+## Current example
+
+The MPRAnalyze compiler test lives in `tests/count/test_mpranalyze_compiler.py` and uses the fixture input at `tests/fixtures/count/minimal_test_input.tsv`.
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..8da7f62
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,37 @@
+import sys
+from pathlib import Path
+
+import pytest
+from click.testing import CliRunner
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+PROJECT_ROOT_STR = str(PROJECT_ROOT)
+TESTS_ROOT = Path(__file__).resolve().parent
+
+if PROJECT_ROOT_STR not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT_STR)
+
+
+@pytest.fixture(scope="session")
+def tests_root() -> Path:
+    return TESTS_ROOT
+
+
+@pytest.fixture(scope="session")
+def fixtures_root(tests_root: Path) -> Path:
+    return tests_root / "fixtures"
+
+
+@pytest.fixture(scope="session")
+def count_fixtures_root(fixtures_root: Path) -> Path:
+    return fixtures_root / "count"
+
+
+@pytest.fixture
+def minimal_count_input(count_fixtures_root: Path) -> Path:
+    return count_fixtures_root / "minimal_test_input.tsv"
+
+
+@pytest.fixture
+def cli_runner() -> CliRunner:
+    return CliRunner()
diff --git a/tests/count/test_mpranalyze_compiler.py b/tests/count/test_mpranalyze_compiler.py
new file mode 100644
index 0000000..082d0ae
--- /dev/null
+++ b/tests/count/test_mpranalyze_compiler.py
@@ -0,0 +1,88 @@
+import pandas as pd
+
+from workflow.scripts.count import mpranalyze_compiler as compiler
+
+
+class TestMpranalyzeCompiler:
+    def test_get_annot_parses_dna_and_rna_headers(self):
+        assert compiler.get_annot("DNA(condition X, replicate 1)") == ("DNA", "X", "1")
+        assert compiler.get_annot("RNA(condition Y, replicate 2)") == ("RNA", "Y", "2")
+
+    def test_get_annot_returns_none_for_unmatched_headers(self):
+        assert compiler.get_annot("Sequence") == (None, None, None)
+        assert compiler.get_annot("label") == (None, None, None)
+
+    def test_generate_annotation_output_repeats_and_numbers_barcodes(self):
+        input_frame = pd.DataFrame(
+            [
+                {"type": "DNA", "condition": "X", "replicate": "1"},
+                {"type": "RNA", "condition": "X", "replicate": "1"},
+            ]
+        )
+
+        output = compiler.generate_annotation_output(input_frame, number_barcodes=2)
+
+        assert list(output["sample"]) == ["DNA_X_1_1", "DNA_X_1_2", "RNA_X_1_1", "RNA_X_1_2"]
+        assert list(output["barcode"]) == ["1", "2", "1", "2"]
+
+    def test_generate_count_output_pads_and_flattens_by_barcode(self):
+        input_frame = pd.DataFrame(
+            [
+                {"label": "oligoA", "bc1": 10, "bc2": 20},
+                {"label": "oligoA", "bc1": 11, "bc2": 21},
+                {"label": "oligoB", "bc1": 12, "bc2": 22},
+            ]
+        ).set_index("label")
+
+        output = compiler.generate_count_output(input_frame, ["sample_1", "sample_2", "sample_3", "sample_4"], number_barcodes=2)
+
+        assert list(output["seq_id"]) == ["oligoA", "oligoB"]
+        assert list(output.loc[output["seq_id"] == "oligoA", ["sample_1", "sample_2", "sample_3", "sample_4"]].iloc[0]) == [10, 11, 20, 21]
+        assert list(output.loc[output["seq_id"] == "oligoB", ["sample_1", "sample_2", "sample_3", "sample_4"]].iloc[0]) == [12, 0, 22, 0]
+
+    def test_cli_generates_expected_count_tables(self, cli_runner, minimal_count_input, tmp_path):
+        rna_counts_output = tmp_path / "rna_counts.tsv.gz"
+        dna_counts_output = tmp_path / "dna_counts.tsv.gz"
+        rna_annotation_output = tmp_path / "rna_annot.tsv.gz"
+        dna_annotation_output = tmp_path / "dna_annot.tsv.gz"
+
+        result = cli_runner.invoke(
+            compiler.cli,
+            [
+                "--input",
+                str(minimal_count_input),
+                "--rna-counts-output",
+                str(rna_counts_output),
+                "--dna-counts-output",
+                str(dna_counts_output),
+                "--rna-annotation-output",
+                str(rna_annotation_output),
+                "--dna-annotation-output",
+                str(dna_annotation_output),
+            ],
+        )
+
+        assert result.exit_code == 0, result.output
+
+        rna: pd.DataFrame = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id")
+        dna: pd.DataFrame = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id")
+        cols= ["RNA_X_1_1", "RNA_X_1_2", "RNA_X_2_1", "RNA_X_2_2", "RNA_X_3_1", "RNA_X_3_2"]
+        row = rna.loc["oligoA"]
+        assert list(row.loc[cols]) == [
+            100,
+            101,
+            200,
+            201,
+            300,
+            301,
+        ]
+        cols= ["DNA_X_1_1", "DNA_X_1_2", "DNA_X_2_1", "DNA_X_2_2", "DNA_X_3_1", "DNA_X_3_2"]
+        row = dna.loc["oligoA"]
+        assert list(row.loc[cols]) == [
+            10,
+            11,
+            20,
+            21,
+            30,
+            31,
+        ]
diff --git a/test_compiler/minimal_test_input.tsv b/tests/fixtures/count/minimal_test_input.tsv
similarity index 100%
rename from test_compiler/minimal_test_input.tsv
rename to tests/fixtures/count/minimal_test_input.tsv
diff --git a/workflow/scripts/count/mpranalyze_compiler.py b/workflow/scripts/count/mpranalyze_compiler.py
index 2a77622..4805bb2 100644
--- a/workflow/scripts/count/mpranalyze_compiler.py
+++ b/workflow/scripts/count/mpranalyze_compiler.py
@@ -6,6 +6,44 @@
 import numpy as np
 import pandas as pd
 
+ANNOT_PATTERN = re.compile(r"^([DR]NA).*\(condition (.*), replicate (.*)\)$")
+
+
+def get_annot(head: str) -> tuple[str|None, str|None, str|None]:
+    match = ANNOT_PATTERN.match(head)
+    if match is not None:
+        group1 = match.group(1)
+        group2 = match.group(2)
+        group3 = match.group(3)
+        return (group1, group2, group3)
+    return (None, None, None)
+
+
+def generate_annotation_output(data, number_barcodes):
+    data = data.loc[data.index.repeat(number_barcodes)].copy()
+    data['barcode'] = data.groupby(['type', 'condition', 'replicate']).cumcount() + 1
+    data['barcode'] = data['barcode'].astype(str)
+    data['sample'] = data[['type', 'condition', 'replicate', 'barcode']].agg('_'.join, axis=1)
+    return data[["sample", "type", "condition", "replicate", "barcode"]]
+
+
+def generate_count_output(data, columns, number_barcodes):
+    rows = []
+    seq_ids = []
+    for label, group in data.groupby('label', sort=False):
+        padded = np.zeros((number_barcodes, data.shape[1]), dtype=np.int64)
+        vals = group.values[:number_barcodes].astype(np.int64)
+        padded[:len(vals)] = vals
+        rows.append(padded.flatten(order='F'))
+        seq_ids.append(label)
+    counts = pd.DataFrame(rows, columns=columns)
+    counts.insert(0, 'seq_id', seq_ids)
+    return counts
+
+
+def write_table(data, file):
+    data.to_csv(file, index=False, sep='\t', compression='gzip')
+
 
 # options
 @click.command()
@@ -39,12 +77,6 @@
 
 def cli(input_file, rna_counts_output_file, dna_counts_output_file, rna_annotation_output_file, dna_annotation_output_file):
 
-    annot_pattern = re.compile(r"^([DR]NA).*\(condition (.*), replicate (.*)\)$")
-    def get_annot(head):
-        m = annot_pattern.match(head)
-        if m is not None:
-            return m.group(1,2,3)
-
     # read input
     df = pd.read_csv(input_file,sep="\t", header='infer')
 
@@ -61,44 +93,26 @@ def get_annot(head):
 
     # counts for observation
 
-    dna_df = df.iloc[:,2:(2+n_dna_obs)].applymap(np.int64)
-    rna_df = df.iloc[:,(2+n_dna_obs):].applymap(np.int64)
+    dna_df = df.iloc[:,2:(2+n_dna_obs)].fillna(0).astype(np.int64)  # missing counts -> 0; int64 avoids int16 wrap-around
+    rna_df = df.iloc[:,(2+n_dna_obs):].fillna(0).astype(np.int64)  # same handling as the DNA columns above
 
     ## generate output DNA/RNA annotations (type_condition_replicate_barcode)
     n_bc = df.groupby('label').Barcode.agg(len).max()
-    def generateAnnotationOutput(data, number_barcodes):
-        data = data.loc[data.index.repeat(number_barcodes)]
-        data['barcode'] = data.groupby(['type','condition','replicate']).cumcount() +1
-        data['barcode'] = data['barcode'].astype(str)
-        data['sample'] = data[['type','condition','replicate','barcode']].agg('_'.join,axis=1)
-        data = data[["sample", "type", "condition", "replicate", "barcode"]]
-        return(data)
-    dna_annot = generateAnnotationOutput(dna_annot, n_bc)
-    rna_annot = generateAnnotationOutput(rna_annot, n_bc)
+    dna_annot = generate_annotation_output(dna_annot, n_bc)
+    rna_annot = generate_annotation_output(rna_annot, n_bc)
 
     ## generate output DNA/RNA count tables
     ## rows oligo/seq ids,/assignment then per barcode the counts. padding with zeros
-    def generateCountOutput(data,columns):
-        counts = pd.DataFrame(list(data.groupby('label').apply(lambda x: x.values.flatten(order='F')))).fillna(0).astype(np.int64)
-        counts.columns = columns
-        counts['seq_id'] = data.index.unique()
-        counts = counts[(['seq_id'] + list(columns))]
-        return(counts)
-
-    dna_counts = generateCountOutput(dna_df,dna_annot['sample'])
-    rna_counts = generateCountOutput(rna_df,rna_annot['sample'])
-
-    ## write table function
-    def write(data,file):
-        data.to_csv(file, index=False,sep='\t', compression='gzip')
+    dna_counts = generate_count_output(dna_df, dna_annot['sample'], n_bc)
+    rna_counts = generate_count_output(rna_df, rna_annot['sample'], n_bc)
 
     ## write output DNA/RNA annotations
-    write(dna_annot,dna_annotation_output_file)
-    write(rna_annot,rna_annotation_output_file)
+    write_table(dna_annot, dna_annotation_output_file)
+    write_table(rna_annot, rna_annotation_output_file)
 
     ## write output DNA/RNA annotations
-    write(dna_counts,dna_counts_output_file)
-    write(rna_counts,rna_counts_output_file)
+    write_table(dna_counts, dna_counts_output_file)
+    write_table(rna_counts, rna_counts_output_file)
 
 if __name__ == '__main__':
     cli()

From 2d86b82b8c92d663c0bf4ab76bb5fcf40aa8f2b8 Mon Sep 17 00:00:00 2001
From: Mackenzie Noon <mackenziecnoon@gmail.com>
Date: Mon, 15 Jun 2026 16:14:10 -0400
Subject: [PATCH 7/9] quick additional test, making sure ragged ends dont goof

---
 tests/conftest.py                             |  5 ++
 tests/count/test_mpranalyze_compiler.py       | 52 +++++++++++++++++++
 tests/fixtures/count/ragged_missing_input.tsv |  5 ++
 3 files changed, 62 insertions(+)
 create mode 100644 tests/fixtures/count/ragged_missing_input.tsv

diff --git a/tests/conftest.py b/tests/conftest.py
index 8da7f62..0496f1d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -32,6 +32,11 @@ def minimal_count_input(count_fixtures_root: Path) -> Path:
     return count_fixtures_root / "minimal_test_input.tsv"
 
 
+@pytest.fixture
+def ragged_missing_input(count_fixtures_root: Path) -> Path:
+    return count_fixtures_root / "ragged_missing_input.tsv"
+
+
 @pytest.fixture
 def cli_runner() -> CliRunner:
     return CliRunner()
diff --git a/tests/count/test_mpranalyze_compiler.py b/tests/count/test_mpranalyze_compiler.py
index 082d0ae..c8f0f8d 100644
--- a/tests/count/test_mpranalyze_compiler.py
+++ b/tests/count/test_mpranalyze_compiler.py
@@ -86,3 +86,55 @@ def test_cli_generates_expected_count_tables(self, cli_runner, minimal_count_inp
             30,
             31,
         ]
+
+    def test_cli_missing_count_does_not_shift_replicates(self, cli_runner, ragged_missing_input, tmp_path):
+        # Regression guard for the MPRAflow-style "ragged within an oligo" bug
+        # (shendurelab/MPRAflow#87): a missing trailing count (empty cell / trailing
+        # tab) must stay a zero in its own replicate/barcode slot and must NOT shift
+        # later counts into the wrong replicate. The fixture's oligoA is missing its
+        # RNA replicate-3 / barcode-2 value, and oligoB has a single barcode (so it
+        # also exercises the across-oligo padding case).
+        rna_counts_output = tmp_path / "rna_counts.tsv.gz"
+        dna_counts_output = tmp_path / "dna_counts.tsv.gz"
+        rna_annotation_output = tmp_path / "rna_annot.tsv.gz"
+        dna_annotation_output = tmp_path / "dna_annot.tsv.gz"
+
+        result = cli_runner.invoke(
+            compiler.cli,
+            [
+                "--input",
+                str(ragged_missing_input),
+                "--rna-counts-output",
+                str(rna_counts_output),
+                "--dna-counts-output",
+                str(dna_counts_output),
+                "--rna-annotation-output",
+                str(rna_annotation_output),
+                "--dna-annotation-output",
+                str(dna_annotation_output),
+            ],
+        )
+
+        assert result.exit_code == 0, result.output
+
+        rna: pd.DataFrame = pd.read_csv(rna_counts_output, sep="\t").set_index("seq_id")
+        dna: pd.DataFrame = pd.read_csv(dna_counts_output, sep="\t").set_index("seq_id")
+
+        rna_cols = [
+            "RNA_X_1_1", "RNA_X_1_2", "RNA_X_1_3",
+            "RNA_X_2_1", "RNA_X_2_2", "RNA_X_2_3",
+            "RNA_X_3_1", "RNA_X_3_2", "RNA_X_3_3",
+        ]
+        # oligoA: the hole is at replicate 3 / barcode 2 -> must be 0, and the real
+        # barcode-3 value (302) must stay in barcode 3, not shift up to barcode 2.
+        assert list(rna.loc["oligoA", rna_cols]) == [100, 101, 102, 200, 201, 202, 300, 0, 302]
+        # oligoB has one barcode; remaining barcode slots pad with zeros per replicate.
+        assert list(rna.loc["oligoB", rna_cols]) == [103, 0, 0, 203, 0, 0, 303, 0, 0]
+
+        # DNA has no missing values; confirm nothing shifted there either.
+        dna_cols = [
+            "DNA_X_1_1", "DNA_X_1_2", "DNA_X_1_3",
+            "DNA_X_2_1", "DNA_X_2_2", "DNA_X_2_3",
+            "DNA_X_3_1", "DNA_X_3_2", "DNA_X_3_3",
+        ]
+        assert list(dna.loc["oligoA", dna_cols]) == [10, 11, 12, 20, 21, 22, 30, 31, 32]
diff --git a/tests/fixtures/count/ragged_missing_input.tsv b/tests/fixtures/count/ragged_missing_input.tsv
new file mode 100644
index 0000000..bc6ee66
--- /dev/null
+++ b/tests/fixtures/count/ragged_missing_input.tsv
@@ -0,0 +1,5 @@
+label	Sequence	Barcode	DNA(condition X, replicate 1)	DNA(condition X, replicate 2)	DNA(condition X, replicate 3)	RNA(condition X, replicate 1)	RNA(condition X, replicate 2)	RNA(condition X, replicate 3)
+oligoA	A	BC0	10	20	30	100	200	300
+oligoA	A	BC1	11	21	31	101	201	
+oligoA	A	BC2	12	22	32	102	202	302
+oligoB	B	BC3	13	23	33	103	203	303

From 580f840e5abbdf1ef7236176c538b5a66c20001d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 15 Jun 2026 20:14:39 +0000
Subject: [PATCH 8/9] ci: auto fixes from pre-commit hooks

for more information, see https://pre-commit.ci
---
 tests/fixtures/count/ragged_missing_input.tsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fixtures/count/ragged_missing_input.tsv b/tests/fixtures/count/ragged_missing_input.tsv
index bc6ee66..cd53b62 100644
--- a/tests/fixtures/count/ragged_missing_input.tsv
+++ b/tests/fixtures/count/ragged_missing_input.tsv
@@ -1,5 +1,5 @@
 label	Sequence	Barcode	DNA(condition X, replicate 1)	DNA(condition X, replicate 2)	DNA(condition X, replicate 3)	RNA(condition X, replicate 1)	RNA(condition X, replicate 2)	RNA(condition X, replicate 3)
 oligoA	A	BC0	10	20	30	100	200	300
-oligoA	A	BC1	11	21	31	101	201	
+oligoA	A	BC1	11	21	31	101	201
 oligoA	A	BC2	12	22	32	102	202	302
 oligoB	B	BC3	13	23	33	103	203	303

From 708bd4a63d7c47031fd40879154ef77c9b46fbde Mon Sep 17 00:00:00 2001
From: Mackenzie Noon <mackenziecnoon@gmail.com>
Date: Mon, 15 Jun 2026 16:15:11 -0400
Subject: [PATCH 9/9] switched count dtype from int16 to int64 to prevent overflow

---
 workflow/scripts/count/mpranalyze_compiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/count/mpranalyze_compiler.py b/workflow/scripts/count/mpranalyze_compiler.py
index 4805bb2..e758108 100644
--- a/workflow/scripts/count/mpranalyze_compiler.py
+++ b/workflow/scripts/count/mpranalyze_compiler.py
@@ -93,8 +93,8 @@ def cli(input_file, rna_counts_output_file, dna_counts_output_file, rna_annotati
 
     # counts for observation
 
-    dna_df = df.iloc[:,2:(2+n_dna_obs)].astype(np.int16)
-    rna_df = df.iloc[:,(2+n_dna_obs):].astype(np.int16)
+    dna_df = df.iloc[:,2:(2+n_dna_obs)].astype(np.int64)
+    rna_df = df.iloc[:,(2+n_dna_obs):].astype(np.int64)
 
     ## generate output DNA/RNA annotations (type_condition_replicate_barcode)
     n_bc = df.groupby('label').Barcode.agg(len).max()