diff --git a/config/relatedness.yaml b/config/relatedness.yaml
new file mode 100644
index 0000000..895f36c
--- /dev/null
+++ b/config/relatedness.yaml
@@ -0,0 +1,45 @@
+# REQUIRED: reference + resources
+ref_fasta: /path/to/GRCh38.fa
+
+somalier:
+  sites_vcf: /path/to/somalier-sites-hg38.vcf.bgz  # common 0.5-MAF SNP panel (bgzipped + .tbi)
+  # If you only have BAM/CRAMs, Somalier will extract from BAM with --fasta {ref_fasta}
+  # If you have VCFs with germline GTs, Somalier will extract from VCF.
+  genome_build: GRCh38
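+  # For reference, these settings typically feed Somalier commands of this shape
+  # (a sketch; the exact invocations live in the workflow rules):
+  #   somalier extract -d <outdir> --sites <sites_vcf> -f <ref_fasta> <sample.bam or sample.vcf.gz>
+  #   somalier relate -o <prefix> <outdir>/*.somalier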
+
+picard:
+  # HAPLOTYPE_MAP from GATK resource bundle (haplotype map for fingerprinting)
+  haplotype_map: /path/to/haplotype_map_hg38.vcf.gz
+
+conpair:
+  # Conpair site lists (use hg38 bundle)
+  snp_positions_bed: /path/to/Conpair/snp_positions_hg38.bed
+  common_sites_vcf: /path/to/Conpair/common_snps_hg38.vcf.gz
+  min_baseq: 20
+  min_mapq: 10
+
+# INPUTS (either BAM/CRAM or VCF per sample; you can mix)
+samples:
+  # sample_id: {bam: "..."} OR {vcf: "..."}
+  HG001_N1: {bam: /data/HG001/normal1.bam}
+  HG001_N2: {bam: /data/HG001/normal2.bam}
+  PT1_T: {bam: /data/PT1/tumor.bam}
+  PT1_N: {bam: /data/PT1/normal.bam}
+  FATHER: {vcf: /data/family/joint.vcf.gz}  # joint or singleton VCF allowed
+  MOTHER: {vcf: /data/family/joint.vcf.gz}
+  CHILD: {vcf: /data/family/joint.vcf.gz}
+
+# EXPECTED RELATIONSHIPS (used by final report for assertions)
+# relationship ∈ {identical, duplicate, tumor_normal, parent_child, siblings, unrelated}
+expected:
+  - {relationship: identical, samples: [HG001_N1, HG001_N2]}
+  - {relationship: tumor_normal, samples: [PT1_T, PT1_N]}
+  - {relationship: parent_child, samples: [FATHER, CHILD]}
+  - {relationship: parent_child, samples: [MOTHER, CHILD]}
+  - {relationship: siblings, samples: [FATHER, MOTHER], note: "should NOT be siblings -> expect fail"}  # example negative check
+
+# OPTIONAL: run peddy on a joint VCF (pedigree checks)
+peddy:
+  enabled: false
+  joint_vcf: /data/family/joint.vcf.gz
+  ped: /data/family/family.ped
diff --git a/workflow/envs/conpair_v0.1.yaml b/workflow/envs/conpair_v0.1.yaml
new file mode 100644
index 0000000..eccfda4
--- /dev/null
+++ b/workflow/envs/conpair_v0.1.yaml
@@ -0,0 +1,11 @@
+name: conpair
+channels: [conda-forge, bioconda]
+dependencies:
+  - python=3.9
+  - samtools
+  - pysam
+  - pandas
+  - numpy
+  - pip
+  - pip:
+      - git+https://github.com/nygenome/Conpair.git
diff --git a/workflow/envs/peddy_relatedness_v0.1.yaml b/workflow/envs/peddy_relatedness_v0.1.yaml
new file mode 100644
index 0000000..d9828f6
--- /dev/null
+++ b/workflow/envs/peddy_relatedness_v0.1.yaml
@@ -0,0 +1,7 @@
+name: peddy
+channels: [conda-forge, bioconda]
+dependencies:
+  - peddy
+  - cython
+  - pandas
+  - python>=3.9
diff --git a/workflow/envs/picard_relatedness_v0.1.yaml b/workflow/envs/picard_relatedness_v0.1.yaml
new file mode 100644
index 0000000..ac7cf76
--- /dev/null
+++ b/workflow/envs/picard_relatedness_v0.1.yaml
@@ -0,0 +1,5 @@
+name: picard
+channels: [conda-forge, bioconda]
+dependencies:
+  - picard=3.*
+  - openjdk=17
diff --git a/workflow/envs/relatedness_report_v0.1.yaml b/workflow/envs/relatedness_report_v0.1.yaml
new file mode 100644
index 0000000..4c4ca83
--- /dev/null
+++ b/workflow/envs/relatedness_report_v0.1.yaml
@@ -0,0 +1,8 @@
+name: relatedness-report
+channels: [conda-forge, bioconda]
+dependencies:
+  - python>=3.9
+  - pandas
+  - numpy
+  - jinja2
+  - pyyaml
diff --git a/workflow/envs/somalier_v0.1.yaml b/workflow/envs/somalier_v0.1.yaml
new file mode 100644
index 0000000..a3b924b
--- /dev/null
+++ b/workflow/envs/somalier_v0.1.yaml
@@ -0,0 +1,8 @@
+name: somalier
+channels: [conda-forge, bioconda]
+dependencies:
+  - somalier=0.2*
+  - htslib
+  - bcftools
+  - samtools
+  - python>=3.9
diff --git a/workflow/rules/relatedness_test_day.smk b/workflow/rules/relatedness_test_day.smk
index 903898f..bd6c634 100644
--- a/workflow/rules/relatedness_test_day.smk
+++ b/workflow/rules/relatedness_test_day.smk
@@ -199,51 +199,24 @@ rule conpair_compare:
         cp results/conpair/{wildcards.t}__{wildcards.n}/res_concordance_details.txt {output.conctsv}
         fi
         """
-
-
-
-
 #######################################################################
 # PEDDY (optional; requires a joint VCF)
 #######################################################################
-rule conpair_mpileup_all:
-    input:
-        expand(
-            "results/conpair/{t}__{n}/tumor.mpileup",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        ),
-        expand(
-            "results/conpair/{t}__{n}/normal.mpileup",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        )
-rule conpair_parse_all:
-    input:
-        expand(
-            "results/conpair/{t}__{n}/tumor.parsed",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        ),
-        expand(
-            "results/conpair/{t}__{n}/normal.parsed",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        )
-rule conpair_compare_all:
-    input:
-        expand(
-            "results/conpair/{t}__{n}/concordance.tsv",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        ),
-        expand(
-            "results/conpair/{t}__{n}/summary.txt",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        )
+# Aggregate targets: request the per-pair Conpair outputs for every tumor/normal pair.
+rule conpair_mpileup_all:
+    input:
+        [f"results/conpair/{t}__{n}/{kind}.mpileup"
+         for t, n in tn_pairs() for kind in ("tumor", "normal")]
+
+rule conpair_parse_all:
+    input:
+        [f"results/conpair/{t}__{n}/{kind}.parsed"
+         for t, n in tn_pairs() for kind in ("tumor", "normal")]
+
+rule conpair_compare_all:
+    input:
+        [f"results/conpair/{t}__{n}/{res}"
+         for t, n in tn_pairs() for res in ("concordance.tsv", "summary.txt")]
 
 rule peddy_relatedness:
     input:
diff --git a/workflow/scripts/relatedness_report.py b/workflow/scripts/relatedness_report.py
new file mode 100644
index 0000000..c87786a
--- /dev/null
+++ b/workflow/scripts/relatedness_report.py
@@ -0,0 +1,203 @@
+import sys, os, yaml, pandas as pd, numpy as np
+from jinja2 import Template
+
+som_pairs = snakemake.input.som_pairs
+som_groups = snakemake.input.som_groups
+picard_metrics = snakemake.input.picard_metrics
+picard_matrix = snakemake.input.picard_matrix
+conpair_files = snakemake.input.get("conpair", [])
+out_tsv = snakemake.output.tsv
+out_html = snakemake.output.html
+cfg_path = snakemake.params.cfg
+
+with open(cfg_path) as fh:
+    CFG = yaml.safe_load(fh)
+
+# ---------------- Somalier ----------------
+sp = pd.read_csv(som_pairs, sep="\t")
+# Ensure consistent column access; somalier writes the first header column as "#sample_a",
+# so strip any leading "#" and lowercase when looking up columns.
+# Expected columns include: sample_a, sample_b, relatedness, ibs0, ibs2, hom_concordance, het_concordance, etc.
+sp_cols = {c.lower().lstrip("#"): c for c in sp.columns}
+def col(name): return sp_cols.get(name, name)
+
+# Fast lookup for any pair (unordered)
+def key(a, b): return tuple(sorted([a, b]))
+sp["pair_key"] = [key(a, b) for a, b in zip(sp[col("sample_a")], sp[col("sample_b")])]
+sp_pairs = {k: row for k, row in sp.set_index("pair_key").iterrows()}
+
+# ---------------- Picard Crosscheck ----------------
+# metrics.txt has columns like: LEFT_FILE, RIGHT_FILE, RESULT, LOD_SCORE, etc.
+pm = pd.read_csv(picard_metrics, sep="\t", comment="#")
+# Map file path -> sample name from the config
+path2sample = {}
+for s, ent in CFG["samples"].items():
+    p = ent.get("bam") or ent.get("vcf")
+    if p:
+        path2sample[os.path.abspath(p)] = s
+
+def file2sample(p):
+    a = os.path.abspath(p)
+    # Picard sometimes normalizes paths; fall back to matching on the basename
+    if a in path2sample:
+        return path2sample[a]
+    base = os.path.basename(a)
+    for k, v in path2sample.items():
+        if os.path.basename(k) == base:
+            return v
+    return base
+
+pm["left_samp"] = pm["LEFT_FILE"].map(file2sample)
+pm["right_samp"] = pm["RIGHT_FILE"].map(file2sample)
+pm["pair_key"] = [key(a, b) for a, b in zip(pm["left_samp"], pm["right_samp"])]
+
+# Collapse to the best entry (max absolute LOD) per pair
+pm_best = pm.loc[pm.groupby("pair_key")["LOD_SCORE"].apply(lambda s: s.abs().idxmax())]
+
+picard_pairs = {k: row for k, row in pm_best.set_index("pair_key").iterrows()}
+
+# ---------------- Conpair (tumor/normal) ----------------
+con_df = []
+for f in conpair_files:
+    if not os.path.exists(f):
+        continue
+    try:
+        df = pd.read_csv(f, sep="\t")
+    except Exception:
+        # Some versions write comma-separated output; be lenient
+        df = pd.read_csv(f)
+    df.columns = [c.lower() for c in df.columns]
+    # Derive the pair names from the path (results/conpair/{tumor}__{normal}/...)
+    pair = os.path.basename(os.path.dirname(f)).split("__")
+    df["sample_a"] = pair[0]
+    df["sample_b"] = pair[1]
+    con_df.append(df)
+con_df = pd.concat(con_df, ignore_index=True) if con_df else pd.DataFrame()
+
+# Heuristic thresholds
+THRESH = dict(
+    identical_relatedness=0.95,     # Somalier ~1.0 for same-individual/duplicate
+    tn_relatedness=0.70,            # tumor-normal often 0.8–1.0; allow LOH/purity wiggle
+    first_degree_relatedness=0.45,  # parent-child or full sibs ~0.45–0.55 in Somalier
+    ibs0_parent_child=0,            # IBS0 ≈ 0 for parent-offspring in the ideal case; allow <= 2 for noise
+    picard_mismatch_lod=-5.0,       # strongly negative LOD indicates a mismatch
+    picard_match_lod=5.0            # strongly positive LOD indicates a match
+)
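+
+# Worked example of these heuristics (illustrative numbers only): a tumor/normal
+# pair with Somalier relatedness 0.84 and Picard LOD +31 passes (0.84 >= 0.70 and
+# the LOD is positive); a sample swap with relatedness 0.02 and LOD -44 fails both
+# the relatedness floor and the strong-mismatch LOD check (-44 <= -5).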
+
+def get_somalier(a, b, field):
+    r = sp_pairs.get(key(a, b))
+    return None if r is None else r.get(field, r.get(field.upper()))
+
+def get_picard(a, b):
+    r = picard_pairs.get(key(a, b))
+    if r is None:
+        return None
+    return dict(result=r.get("RESULT", ""), lod=float(r.get("LOD_SCORE", np.nan)))
+
+# Evaluate expectations
+rows = []
+for exp in CFG.get("expected", []):
+    rel = exp["relationship"]
+    a, b = exp["samples"]
+    note = exp.get("note", "")
+
+    som_rel = get_somalier(a, b, "relatedness")
+    ibs0 = get_somalier(a, b, "ibs0")
+    pic = get_picard(a, b)
+
+    status = "PASS"
+    fail_reasons = []
+
+    if rel in ("identical", "duplicate"):
+        if som_rel is not None and som_rel < THRESH["identical_relatedness"]:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['identical_relatedness']}")
+        if pic is not None and pic["lod"] < THRESH["picard_match_lod"]:
+            status = "FAIL"
+            fail_reasons.append(f"Picard.LOD={pic['lod']:.1f} < {THRESH['picard_match_lod']}")
+    elif rel == "tumor_normal":
+        if som_rel is not None and som_rel < THRESH["tn_relatedness"]:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['tn_relatedness']}")
+        if pic is not None and pic["lod"] < 0 and pic["lod"] <= THRESH["picard_mismatch_lod"]:
+            status = "FAIL"
+            fail_reasons.append(f"Picard.LOD strongly negative ({pic['lod']:.1f})")
+    elif rel in ("parent_child", "parent-offspring"):
+        # Expect first-degree relatedness and IBS0 ≈ 0
+        if som_rel is not None and som_rel < THRESH["first_degree_relatedness"]:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['first_degree_relatedness']}")
+        if ibs0 is not None and ibs0 > 2:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.IBS0={ibs0} > 2 (expected ~0)")
+    elif rel == "siblings":
+        # Expect first-degree relatedness, but IBS0 is typically > 0
+        if som_rel is not None and som_rel < THRESH["first_degree_relatedness"]:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['first_degree_relatedness']}")
+        if ibs0 is not None and ibs0 <= 0:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.IBS0={ibs0} <= 0 (sibs typically > 0)")
+    elif rel == "unrelated":
+        if som_rel is not None and som_rel >= 0.2:
+            status = "FAIL"
+            fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} unexpectedly high for unrelated")
+        if pic is not None and pic["lod"] >= THRESH["picard_match_lod"]:
+            status = "FAIL"
+            fail_reasons.append(f"Picard.LOD={pic['lod']:.1f} unexpectedly high for unrelated")
+    else:
+        status = "FAIL"
+        fail_reasons.append(f"Unknown relationship: {rel}")
+
+    rows.append(dict(
+        sample_a=a, sample_b=b, relationship=rel, note=note,
+        somalier_relatedness=som_rel,
+        somalier_ibs0=ibs0,
+        picard_result=None if pic is None else pic["result"],
+        picard_lod=None if pic is None else pic["lod"],
+        status=status,
+        reason="; ".join(fail_reasons) if fail_reasons else ""
+    ))
+
+summary = pd.DataFrame(rows).sort_values(["status", "relationship", "sample_a", "sample_b"])
+os.makedirs(os.path.dirname(out_tsv) or ".", exist_ok=True)
+summary.to_csv(out_tsv, sep="\t", index=False)
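+
+# The TSV above has one row per expectation, with columns:
+#   sample_a, sample_b, relationship, note, somalier_relatedness, somalier_ibs0,
+#   picard_result, picard_lod, status, reason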
+
+# Minimal HTML (table + quick hints)
+tpl = Template("""
+<!doctype html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <title>Relatedness QC</title>
+  <style>
+    body { font-family: sans-serif; margin: 2em; }
+    table { border-collapse: collapse; }
+    th, td { border: 1px solid #ccc; padding: 4px 8px; }
+    .FAIL { background: #fdd; }
+    .PASS { background: #dfd; }
+  </style>
+</head>
+<body>
+  <h1>Relatedness QC</h1>
+  <p>Samples: {{ ns }} | Somalier pairs: {{ n_pairs }} | Picard pairs: {{ n_picard }}</p>
+
+  <h2>Expectations</h2>
+  <table>
+    <thead>
+      <tr>
+        <th>Status</th><th>Relationship</th><th>Sample A</th><th>Sample B</th>
+        <th>Somalier.relatedness</th><th>Somalier.IBS0</th><th>Picard.LOD</th>
+        <th>Notes / Reasons</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for r in rows %}
+      <tr class="{{ r.status }}">
+        <td>{{ r.status }}</td>
+        <td>{{ r.relationship }}</td>
+        <td>{{ r.sample_a }}</td>
+        <td>{{ r.sample_b }}</td>
+        <td>{{ "%.3f"|format(r.somalier_relatedness) if r.somalier_relatedness is not none else "" }}</td>
+        <td>{{ r.somalier_ibs0 if r.somalier_ibs0 is not none else "" }}</td>
+        <td>{{ "%.1f"|format(r.picard_lod) if r.picard_lod is not none else "" }}</td>
+        <td>{{ r.reason or r.note }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+
+  <h2>Files</h2>
+  <ul>
+    <li>Somalier pairwise relatedness ({{ n_pairs }} pairs)</li>
+    <li>Picard CrosscheckFingerprints ({{ n_picard }} comparisons)</li>
+    {% if has_conpair %}<li>Conpair tumor/normal concordance included</li>{% endif %}
+    {% if peddy_enabled %}<li>Peddy pedigree checks enabled</li>{% endif %}
+  </ul>
+
+  <p>Thresholds (heuristic defaults):
+  identical ≥ {{ th.identical_relatedness }}, tumor-normal ≥ {{ th.tn_relatedness }},
+  first-degree ≥ {{ th.first_degree_relatedness }},
+  Picard match LOD ≥ {{ th.picard_match_lod }}, strong mismatch LOD ≤ {{ th.picard_mismatch_lod }}.</p>
+</body>
+</html>
+""")
+html = tpl.render(
+    rows=rows,
+    ns=len(CFG["samples"]),
+    n_pairs=len(sp),
+    n_picard=len(pm),
+    has_conpair=len(conpair_files) > 0,
+    peddy_enabled=CFG.get("peddy", {}).get("enabled", False),
+    th=THRESH
+)
+with open(out_html, "w") as f:
+    f.write(html)