Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21,202 changes: 3,233 additions & 17,969 deletions .test/ngs-test-data/reads/b.chr21.1.fq

Large diffs are not rendered by default.

21,216 changes: 3,240 additions & 17,976 deletions .test/ngs-test-data/reads/b.chr21.2.fq

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ snakemake --use-conda --use-singularity \
--executor pcluster-slurm \
--default-resources slurm_partition=i192,i128 runtime=86400 mem_mb=36900 tmpdir=/fsx/scratch \
--cache -p \
--verbose -k \
-k \
--max-threads 20000 \
--restart-times 2 \
--cores 20000 -j 14 -n \
Expand Down Expand Up @@ -122,7 +122,7 @@ snakemake --use-conda --use-singularity \
--executor pcluster-slurm \
--default-resources slurm_partition=i192,i128 runtime=86400 mem_mb=36900 tmpdir=/fsx/scratch \
--cache -p \
--verbose -k \
-k \
--restart-times 2 \
--max-threads 20000 \
--cores 20000 -j 14
Expand Down
21 changes: 19 additions & 2 deletions config/samples.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,19 @@
sample_name condition
a untreated
sample_name condition patient
R0c_H37002-BG002179-ILMN-rnaseq-tissue-tumor_S4_0 normal H37002
R0c_H37002-BG004281-ILMN-rnaseq-tissue-tumor_S5_0 normal H37002
R0c_H37004-2022-Resection-ILMN-rnaseq-tissue-tumor_S8_0 normal H37004
R0c_H37006-Mofit-2023-10-15-ILMN-rnaseq-tissue-organoid_S14_0 normal H37006
R0c_H37001-BG006103-ILMN-rnaseq-tissue-tumor_S14_0 normal H37001
R0c_H37001-BG-05182023-ILMN-rnaseq-tissue-tumor_S17_0 normal H37001
R0c_H37001-BG-duodenal-042023-ILMN-rnaseq-tissue-tumor_S20_0 normal H37001
R0c_H37011-BG005019-ILMN-rnaseq-tissue-tumor_S25_0 normal H37011
R0c_H37016-DAU27710-ILMN-rnaseq-tissue-tumor_S30_0 normal H37016
R0c_H37016-DAU28135-ILMN-rnaseq-tissue-tumor_S33_0 normal H37016
R0c_H37009-Tempus-TL-24-S3U64GWWOR-ILMN-rnaseq-tissue-tumor_S34_0 normal H37009
R0c_H37009-Tempus-TL-23-49B6RVQK-ILMN-rnaseq-tissue-tumor_S36_0 normal H37009
R1c_H37016-BG013656-ILMN-rnaseq-tissue-tumor_S1_0 normal H37016
R1c_H37019-BG013543-ILMN-rnaseq-tissue-tumor_S4_0 normal H37019
R1c_H37020-BG013759-ILMN-rnaseq-tissue-tumor_S7_0 normal H37020
R3c_H37021-BG012177-ILMN-rnaseq-tissue-tumor_S1_0 normal H37021
R3c_H37021-CeGaT-Liver-2023-ILMN-rnaseq-tissue-tumor_S2_0 normal H37021
R3c_H37021-15054597-2023-11-ILMN-rnaseq-tissue-tumor_S3_0 normal H37021
1 change: 1 addition & 0 deletions config/units.tsv.template
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sample_name unit_name fq1 fq2 sra adapters strandedness
a 1 REGSUB_PWD/.test/ngs-test-data/reads/a.chr21.1.fq REGSUB_PWD/.test/ngs-test-data/reads/a.chr21.2.fq
b 1 REGSUB_PWD/.test/ngs-test-data/reads/b.chr21.1.fq REGSUB_PWD/.test/ngs-test-data/reads/b.chr21.2.fq
2 changes: 2 additions & 0 deletions workflow/rules/align.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ rule align:
benchmark:
"logs/star/{sample}_{unit}.bench.tsv",
threads: 127
resources:
tmpdir="/dev/shm"
params:
idx=lambda wc, input: input.index,
extra=lambda wc, input: f'--outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --sjdbGTFfile {input.gtf} {config["params"]["star"]} ',
Expand Down
14 changes: 13 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,20 @@ def get_final_output():

validate(samples, schema="../schemas/samples.schema.yaml")

#units = (
# pd.read_csv(config["units"], sep="\t", dtype={"sample_name": str, "unit_name": str})
# .set_index(["sample_name", "unit_name"], drop=False)
# .sort_index()
#)
units = (
pd.read_csv(config["units"], sep="\t", dtype={"sample_name": str, "unit_name": str})
pd.read_csv(
config["units"],
sep="\t",
dtype=str, # ← all cols strings
keep_default_na=False, # ← "" stays "", no NaN
na_filter=False,
)
.applymap(lambda x: x.strip() if isinstance(x, str) else x)
.set_index(["sample_name", "unit_name"], drop=False)
.sort_index()
)
Expand Down
55 changes: 35 additions & 20 deletions workflow/rules/diffexp.smk
Original file line number Diff line number Diff line change
@@ -1,27 +1,41 @@
rule count_matrix:
input:
expand(
reads = expand(
"results/star/{unit.sample_name}_{unit.unit_name}/ReadsPerGene.out.tab",
unit=units.itertuples(),
),
# OPTIONAL: provide per-sample infer_experiment outputs when available
infer = expand(
"results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.infer_experiment.txt",
unit=units.itertuples(),
)
output:
"results/counts/all.tsv",
"results/counts/all.tsv"
log:
"logs/count-matrix.log",
"logs/count-matrix.log"
params:
samples=units["sample_name"].tolist(),
strand=get_strandedness(units),
samples = ",".join(units["sample_name"].tolist()),
strands = ",".join(["none"] * len(units)) # or your real per-sample value
conda:
"../envs/pandas.yaml"
script:
"../scripts/count-matrix.py"
threads: 1
shell:
"""
python workflow/scripts/count-matrix.py \
--output {output} \
--samples "{params.samples}" \
--strands "{params.strands}" \
{input.reads} \
> {log} 2>&1
"""


rule gene_2_symbol:
input:
counts="{prefix}.tsv",
output:
symbol="{prefix}.symbol.tsv",
nodata="{prefix}.symbol.tsv.NODATA",
params:
species=get_bioc_species_name(),
log:
Expand All @@ -30,7 +44,20 @@ rule gene_2_symbol:
"../envs/biomart.yaml"
shell:
"""
touch {output}
line_count=$(awk 'END {{print NR}}' {input.counts})
if [ "$line_count" -le 1 ]; then
echo "Input {input.counts} has $line_count line(s); skipping gene symbol annotation." > {log}
touch {output.symbol}
touch {output.nodata}
else
rm -f {output.nodata}
echo "Annotating gene symbols for {input.counts}." > {log}
Rscript workflow/scripts/gene2symbol.R \
--counts {input.counts} \
--output {output.symbol} \
--species {params.species} \
>> {log} 2>&1
fi
"""


Expand Down Expand Up @@ -66,18 +93,6 @@ rule pca:
shell:
"touch {output}"

#rule pca:
# input:
# "results/deseq2/all.rds",
# output:
# report("results/pca.{variable}.svg", "../report/pca.rst"),
# conda:
# "../envs/deseq2.yaml"
# log:
# "logs/pca.{variable}.log",
# script:
# "../scripts/plot-pca.R"


rule deseq2:
input:
Expand Down
11 changes: 9 additions & 2 deletions workflow/rules/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,15 @@ rule rseqc_gtf2bed:
"logs/rseqc_gtf2bed.log",
conda:
"../envs/gffutils.yaml"
script:
"../scripts/gtf2bed.py"
shell:
"""
set -euo pipefail
python workflow/scripts/gtf2bed.py \
--input {input} \
--bed {output.bed} \
--db {output.db} \
> {log} 2>&1
"""


rule rseqc_junction_annotation:
Expand Down
10 changes: 10 additions & 0 deletions workflow/schemas/samples.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ properties:
sample_name:
type: string
description: sample name/identifier
condition:
type: string
description: biological condition label (tumor or normal)
enum:
- tumor
- normal
patient:
type: string
description: optional patient identifier for paired analyses

required:
- sample_name
- condition
103 changes: 77 additions & 26 deletions workflow/scripts/count-matrix.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,91 @@
import sys
#!/usr/bin/env python3
"""Build a count matrix from STAR gene counts outputs."""

# logging
sys.stderr = open(snakemake.log[0], "w")
import argparse
from typing import List

import pandas as pd


def get_column(strandedness):
if pd.isnull(strandedness) or strandedness == "none":
def parse_comma_separated(value: str, name: str) -> List[str]:
items = [item.strip() for item in value.split(",") if item.strip()]
if not items:
raise argparse.ArgumentTypeError(f"{name} list cannot be empty")
return items


def get_column(strandedness: str) -> int:
if strandedness in {"", "none", None}:
return 1 # non stranded protocol
elif strandedness == "yes":
if strandedness == "yes":
return 2 # 3rd column
elif strandedness == "reverse":
if strandedness == "reverse":
return 3 # 4th column, usually for Illumina truseq
else:
raise ValueError(
(
"'strandedness' column should be empty or have the "
"value 'none', 'yes' or 'reverse', instead has the "
"value {}"
).format(repr(strandedness))
raise ValueError(
(
"'strandedness' column should be empty or have the value "
"'none', 'yes' or 'reverse', instead has the value {}"
).format(repr(strandedness))
)


def build_matrix(inputs: List[str], strands: List[str], samples: List[str]) -> pd.DataFrame:
counts = []
for path, strand, sample in zip(inputs, strands, samples):
table = pd.read_table(
path, index_col=0, usecols=[0, get_column(strand)], header=None, skiprows=4
)
table.columns = [sample]
counts.append(table)

matrix = pd.concat(counts, axis=1)
matrix.index.name = "gene"
# collapse technical replicates
matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum()
return matrix


counts = [
pd.read_table(
f, index_col=0, usecols=[0, get_column(strandedness)], header=None, skiprows=4
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--samples",
required=True,
help="Comma-separated list of sample names matching the input files.",
)
parser.add_argument(
"--strands",
required=True,
help="Comma-separated list of strandedness values matching the input files.",
)
parser.add_argument(
"--output",
required=True,
help="Path to write the combined count matrix to.",
)
for f, strandedness in zip(snakemake.input, snakemake.params.strand)
]
parser.add_argument(
"inputs",
nargs="+",
help="STAR ReadsPerGene.out.tab files to combine.",
)

args = parser.parse_args()

samples = parse_comma_separated(args.samples, "Sample")
strands = parse_comma_separated(args.strands, "Strand")
inputs = args.inputs

if not inputs:
parser.error("At least one input file must be provided.")

if not (len(inputs) == len(samples) == len(strands)):
parser.error(
"The number of inputs, samples, and strands must match: "
f"got {len(inputs)} inputs, {len(samples)} samples and {len(strands)} strands."
)

matrix = build_matrix(inputs, strands, samples)
matrix.to_csv(args.output, sep="\t")

for t, sample in zip(counts, snakemake.params.samples):
t.columns = [sample]

matrix = pd.concat(counts, axis=1)
matrix.index.name = "gene"
# collapse technical replicates
matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum()
matrix.to_csv(snakemake.output[0], sep="\t")
if __name__ == "__main__":
main()
Loading