Daylily-Informatics · iamh2o · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025
diff --git a/.test/ngs-test-data/reads/b.chr21.1.fq b/.test/ngs-test-data/reads/b.chr21.1.fq
diff --git a/.test/ngs-test-data/reads/b.chr21.2.fq b/.test/ngs-test-data/reads/b.chr21.2.fq
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ snakemake --use-conda --use-singularity   \
 --executor pcluster-slurm \
 --default-resources slurm_partition=i192,i128 runtime=86400 mem_mb=36900 tmpdir=/fsx/scratch \
 --cache -p \
---verbose -k \
+ -k \
 --max-threads 20000 \
 --restart-times 2 \
 --cores 20000 -j 14 -n   \
@@ -122,7 +122,7 @@ snakemake --use-conda --use-singularity   \
 --executor pcluster-slurm \
 --default-resources slurm_partition=i192,i128 runtime=86400 mem_mb=36900 tmpdir=/fsx/scratch \
 --cache -p \
---verbose -k \
+ -k \
 --restart-times 2 \
 --max-threads 20000 \
 --cores 20000 -j 14 

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -1,2 +1,19 @@
-sample_name	condition
-a	untreated
+sample_name	condition	patient
+R0c_H37002-BG002179-ILMN-rnaseq-tissue-tumor_S4_0	normal	H37002
+R0c_H37002-BG004281-ILMN-rnaseq-tissue-tumor_S5_0	normal	H37002
+R0c_H37004-2022-Resection-ILMN-rnaseq-tissue-tumor_S8_0	normal	H37004
+R0c_H37006-Mofit-2023-10-15-ILMN-rnaseq-tissue-organoid_S14_0	normal	H37006
+R0c_H37001-BG006103-ILMN-rnaseq-tissue-tumor_S14_0	normal	H37001
+R0c_H37001-BG-05182023-ILMN-rnaseq-tissue-tumor_S17_0	normal	H37001
+R0c_H37001-BG-duodenal-042023-ILMN-rnaseq-tissue-tumor_S20_0	normal	H37001
+R0c_H37011-BG005019-ILMN-rnaseq-tissue-tumor_S25_0	normal	H37011
+R0c_H37016-DAU27710-ILMN-rnaseq-tissue-tumor_S30_0	normal	H37016
+R0c_H37016-DAU28135-ILMN-rnaseq-tissue-tumor_S33_0	normal	H37016
+R0c_H37009-Tempus-TL-24-S3U64GWWOR-ILMN-rnaseq-tissue-tumor_S34_0	normal	H37009
+R0c_H37009-Tempus-TL-23-49B6RVQK-ILMN-rnaseq-tissue-tumor_S36_0	normal	H37009
+R1c_H37016-BG013656-ILMN-rnaseq-tissue-tumor_S1_0	normal	H37016
+R1c_H37019-BG013543-ILMN-rnaseq-tissue-tumor_S4_0	normal	H37019
+R1c_H37020-BG013759-ILMN-rnaseq-tissue-tumor_S7_0	normal	H37020
+R3c_H37021-BG012177-ILMN-rnaseq-tissue-tumor_S1_0	normal	H37021
+R3c_H37021-CeGaT-Liver-2023-ILMN-rnaseq-tissue-tumor_S2_0	normal	H37021
+R3c_H37021-15054597-2023-11-ILMN-rnaseq-tissue-tumor_S3_0	normal	H37021
diff --git a/config/units.tsv.template b/config/units.tsv.template
@@ -1,2 +1,3 @@
 sample_name	unit_name	fq1	fq2	sra	adapters	strandedness
 a	1	REGSUB_PWD/.test/ngs-test-data/reads/a.chr21.1.fq	REGSUB_PWD/.test/ngs-test-data/reads/a.chr21.2.fq			
+b	1	REGSUB_PWD/.test/ngs-test-data/reads/b.chr21.1.fq	REGSUB_PWD/.test/ngs-test-data/reads/b.chr21.2.fq
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
@@ -11,6 +11,8 @@ rule align:
     benchmark:
         "logs/star/{sample}_{unit}.bench.tsv",
     threads: 127
+    resources:
+        tmpdir="/dev/shm"
     params:
         idx=lambda wc, input: input.index,
         extra=lambda wc, input: f'--outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --sjdbGTFfile {input.gtf} {config["params"]["star"]} ',

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -36,8 +36,20 @@ def get_final_output():
 
 validate(samples, schema="../schemas/samples.schema.yaml")
 
+#units = (
+#    pd.read_csv(config["units"], sep="\t", dtype={"sample_name": str, "unit_name": str})
+#    .set_index(["sample_name", "unit_name"], drop=False)
+#    .sort_index()
+#)
 units = (
-    pd.read_csv(config["units"], sep="\t", dtype={"sample_name": str, "unit_name": str})
+    pd.read_csv(
+        config["units"],
+        sep="\t",
+        dtype=str,               # ← all cols strings
+        keep_default_na=False,   # ← "" stays "", no NaN
+        na_filter=False,
+    )
+    .applymap(lambda x: x.strip() if isinstance(x, str) else x)
     .set_index(["sample_name", "unit_name"], drop=False)
     .sort_index()
 )

diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -1,27 +1,41 @@
 rule count_matrix:
     input:
-        expand(
+        reads = expand(
             "results/star/{unit.sample_name}_{unit.unit_name}/ReadsPerGene.out.tab",
             unit=units.itertuples(),
         ),
+        # OPTIONAL: provide per-sample infer_experiment outputs when available
+        infer = expand(
+            "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.infer_experiment.txt",
+            unit=units.itertuples(),
+        )
     output:
-        "results/counts/all.tsv",
+        "results/counts/all.tsv"
     log:
-        "logs/count-matrix.log",
+        "logs/count-matrix.log"
     params:
-        samples=units["sample_name"].tolist(),
-        strand=get_strandedness(units),
+        samples = ",".join(units["sample_name"].tolist()),
+        strands = ",".join(["none"] * len(units))   # or your real per-sample value
     conda:
         "../envs/pandas.yaml"
-    script:
-        "../scripts/count-matrix.py"
+    threads: 1
+    shell:
+        """
+        python workflow/scripts/count-matrix.py \
+            --output {output} \
+            --samples "{params.samples}" \
+            --strands "{params.strands}" \
+            {input.reads} \
+            > {log} 2>&1
+        """
 
 
 rule gene_2_symbol:
     input:
         counts="{prefix}.tsv",
     output:
         symbol="{prefix}.symbol.tsv",
+        nodata="{prefix}.symbol.tsv.NODATA",
     params:
         species=get_bioc_species_name(),
     log:
@@ -30,7 +44,20 @@ rule gene_2_symbol:
         "../envs/biomart.yaml"
     shell:
         """
-        touch {output}
+        line_count=$(awk 'END {{print NR}}' {input.counts})
+        if [ "$line_count" -le 1 ]; then
+            echo "Input {input.counts} has $line_count line(s); skipping gene symbol annotation." > {log}
+            touch {output.symbol}
+            touch {output.nodata}
+        else
+            rm -f {output.nodata}
+            echo "Annotating gene symbols for {input.counts}." > {log}
+            Rscript workflow/scripts/gene2symbol.R \
+                --counts {input.counts} \
+                --output {output.symbol} \
+                --species {params.species} \
+                >> {log} 2>&1
+        fi
         """
 
 
@@ -66,18 +93,6 @@ rule pca:
     shell:
         "touch {output}"
 
-#rule pca:
-#    input:
-#        "results/deseq2/all.rds",
-#    output:
-#        report("results/pca.{variable}.svg", "../report/pca.rst"),
-#    conda:
-#        "../envs/deseq2.yaml"
-#    log:
-#        "logs/pca.{variable}.log",
-#    script:
-#        "../scripts/plot-pca.R"
-
 
 rule deseq2:
     input:

diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
@@ -106,8 +106,15 @@ rule rseqc_gtf2bed:
         "logs/rseqc_gtf2bed.log",
     conda:
         "../envs/gffutils.yaml"
-    script:
-        "../scripts/gtf2bed.py"
+    shell:
+        """
+        set -euo pipefail
+        python workflow/scripts/gtf2bed.py \
+            --input {input} \
+            --bed {output.bed} \
+            --db {output.db} \
+            > {log} 2>&1
+        """
 
 
 rule rseqc_junction_annotation:

diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml
@@ -5,6 +5,16 @@ properties:
   sample_name:
     type: string
     description: sample name/identifier
+  condition:
+    type: string
+    description: biological condition label (tumor or normal)
+    enum:
+      - tumor
+      - normal
+  patient:
+    type: string
+    description: optional patient identifier for paired analyses
 
 required:
   - sample_name
+  - condition
diff --git a/workflow/scripts/count-matrix.py b/workflow/scripts/count-matrix.py
@@ -1,40 +1,91 @@
-import sys
+#!/usr/bin/env python3
+"""Build a count matrix from STAR gene counts outputs."""
 
-# logging
-sys.stderr = open(snakemake.log[0], "w")
+import argparse
+from typing import List
 
 import pandas as pd
 
 
-def get_column(strandedness):
-    if pd.isnull(strandedness) or strandedness == "none":
+def parse_comma_separated(value: str, name: str) -> List[str]:
+    items = [item.strip() for item in value.split(",") if item.strip()]
+    if not items:
+        raise argparse.ArgumentTypeError(f"{name} list cannot be empty")
+    return items
+
+
+def get_column(strandedness: str) -> int:
+    if strandedness in {"", "none", None}:
         return 1  # non stranded protocol
-    elif strandedness == "yes":
+    if strandedness == "yes":
         return 2  # 3rd column
-    elif strandedness == "reverse":
+    if strandedness == "reverse":
         return 3  # 4th column, usually for Illumina truseq
-    else:
-        raise ValueError(
-            (
-                "'strandedness' column should be empty or have the "
-                "value 'none', 'yes' or 'reverse', instead has the "
-                "value {}"
-            ).format(repr(strandedness))
+    raise ValueError(
+        (
+            "'strandedness' column should be empty or have the value "
+            "'none', 'yes' or 'reverse', instead has the value {}"
+        ).format(repr(strandedness))
+    )
+
+
+def build_matrix(inputs: List[str], strands: List[str], samples: List[str]) -> pd.DataFrame:
+    counts = []
+    for path, strand, sample in zip(inputs, strands, samples):
+        table = pd.read_table(
+            path, index_col=0, usecols=[0, get_column(strand)], header=None, skiprows=4
         )
+        table.columns = [sample]
+        counts.append(table)
+
+    matrix = pd.concat(counts, axis=1)
+    matrix.index.name = "gene"
+    # collapse technical replicates
+    matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum()
+    return matrix
 
 
-counts = [
-    pd.read_table(
-        f, index_col=0, usecols=[0, get_column(strandedness)], header=None, skiprows=4
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--samples",
+        required=True,
+        help="Comma-separated list of sample names matching the input files.",
+    )
+    parser.add_argument(
+        "--strands",
+        required=True,
+        help="Comma-separated list of strandedness values matching the input files.",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        help="Path to write the combined count matrix to.",
     )
-    for f, strandedness in zip(snakemake.input, snakemake.params.strand)
-]
+    parser.add_argument(
+        "inputs",
+        nargs="+",
+        help="STAR ReadsPerGene.out.tab files to combine.",
+    )
+
+    args = parser.parse_args()
+
+    samples = parse_comma_separated(args.samples, "Sample")
+    strands = parse_comma_separated(args.strands, "Strand")
+    inputs = args.inputs
+
+    if not inputs:
+        parser.error("At least one input file must be provided.")
+
+    if not (len(inputs) == len(samples) == len(strands)):
+        parser.error(
+            "The number of inputs, samples, and strands must match: "
+            f"got {len(inputs)} inputs, {len(samples)} samples and {len(strands)} strands."
+        )
+
+    matrix = build_matrix(inputs, strands, samples)
+    matrix.to_csv(args.output, sep="\t")
 
-for t, sample in zip(counts, snakemake.params.samples):
-    t.columns = [sample]
 
-matrix = pd.concat(counts, axis=1)
-matrix.index.name = "gene"
-# collapse technical replicates
-matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum()
-matrix.to_csv(snakemake.output[0], sep="\t")
+if __name__ == "__main__":
+    main()