diff --git a/app/api/metrics.py b/app/api/metrics.py
index 4241169..fcfac4b 100644
--- a/app/api/metrics.py
+++ b/app/api/metrics.py
@@ -4,7 +4,7 @@
 from datetime import UTC, datetime, timedelta
 
 from fastapi import APIRouter, Depends, Query
-from sqlalchemy import select
+from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.auth.middleware import get_api_key
@@ -12,6 +12,7 @@
 from app.models.api_key import APIKey
 from app.models.job import ExtractionJob
 from app.models.llm_trace import LLMTrace
+from app.models.record import ExtractedRecord
 from app.schemas.responses import (
     BusinessMetricsResponse,
     LLMMetricsResponse,
@@ -117,6 +118,23 @@ async def get_llm_metrics(
     )
 
 
+async def _hitl_escalation_rate(db: AsyncSession, since: datetime) -> float:
+    """Fraction of records created since `since` that were flagged for human
+    review (ExtractedRecord.needs_review). Returns 0.0 when there are no
+    records in the window."""
+    total, escalated = (
+        await db.execute(
+            select(
+                func.count(ExtractedRecord.id),
+                func.count(ExtractedRecord.id).filter(
+                    ExtractedRecord.needs_review.is_(True)
+                ),
+            ).where(ExtractedRecord.created_at >= since)
+        )
+    ).one()
+    return round(escalated / total, 4) if total else 0.0
+
+
 @router.get("/business", response_model=BusinessMetricsResponse)
 async def get_business_metrics(
     db: AsyncSession = Depends(get_db),
@@ -177,7 +195,7 @@ async def get_business_metrics(
         p50_ms=round(p50_ms, 1),
         p95_ms=round(p95_ms, 1),
         docs_30d=docs_30d,
-        hitl_escalation_rate=0.12,
+        hitl_escalation_rate=await _hitl_escalation_rate(db, since),
     )
 
 
@@ -204,7 +222,7 @@ async def get_quality_trend(
             ewma_composite=[],
             per_dimension={d: [] for d in ("completeness", "field_accuracy",
                                             "hallucination_absence", "format_compliance")},
-            escalation_rate=0.0,
+            escalation_rate=await _hitl_escalation_rate(db, since),
             sample_count=0,
         )
 
@@ -244,6 +262,6 @@ async def get_quality_trend(
         days=days,
         ewma_composite=ewma_composite,
         per_dimension=per_dimension,
-        escalation_rate=0.12,  # placeholder until HITL escalation table exists
+        escalation_rate=await _hitl_escalation_rate(db, since),
         sample_count=len(logs),
     )
diff --git a/app/services/agentic_rag.py b/app/services/agentic_rag.py
index e0cbadb..58d8f50 100644
--- a/app/services/agentic_rag.py
+++ b/app/services/agentic_rag.py
@@ -421,15 +421,21 @@ def _parse_json(text: str) -> dict:
     return {}
 
 
+def _dedup_key(r: SearchResult) -> str:
+    # Fall back to doc_id only when chunk_id is genuinely absent (None);
+    # a falsy-but-valid chunk_id ("") must not collapse distinct chunks.
+    return r.chunk_id if r.chunk_id is not None else r.doc_id
+
+
 def _merge_results(
     existing: list[SearchResult],
     new: list[SearchResult],
 ) -> list[SearchResult]:
     """Merge new results into existing, deduplicating by chunk_id/doc_id."""
-    seen = {(r.chunk_id or r.doc_id) for r in existing}
+    seen = {_dedup_key(r) for r in existing}
     merged = list(existing)
     for r in new:
-        key = r.chunk_id or r.doc_id
+        key = _dedup_key(r)
         if key not in seen:
             merged.append(r)
             seen.add(key)
@@ -440,7 +446,7 @@ def _deduplicate(results: list[SearchResult]) -> list[SearchResult]:
     seen: set[str] = set()
     out: list[SearchResult] = []
     for r in results:
-        key = r.chunk_id or r.doc_id
+        key = _dedup_key(r)
         if key not in seen:
             out.append(r)
             seen.add(key)
diff --git a/app/services/model_ab_test.py b/app/services/model_ab_test.py
index 85fa5ab..a457718 100644
--- a/app/services/model_ab_test.py
+++ b/app/services/model_ab_test.py
@@ -184,14 +184,15 @@ def compute_significance(
         var1 = sum((x - mean1) ** 2 for x in control_scores) / n1
         var2 = sum((x - mean2) ** 2 for x in treatment_scores) / n2
 
-        pooled_se = math.sqrt(var1 / n1 + var2 / n2)
+        # Welch SE of the mean difference (per-arm variance, not pooled).
+        se_diff = math.sqrt(var1 / n1 + var2 / n2)
 
-        if pooled_se == 0.0:
+        if se_diff == 0.0:
             # Identical distributions — no effect
             z_score = 0.0
             p_value = 1.0
         else:
-            z_score = (mean1 - mean2) / pooled_se
+            z_score = (mean1 - mean2) / se_diff
             p_value = _p_value_from_z(z_score)
 
         significant = p_value < 0.05
diff --git a/app/services/reranker.py b/app/services/reranker.py
index 77b7597..7452d22 100644
--- a/app/services/reranker.py
+++ b/app/services/reranker.py
@@ -83,11 +83,12 @@ def rerank(
         combined = self.alpha * tfidf_scores + (1.0 - self.alpha) * norm_scores
         ranked_indices = combined.argsort()[::-1]
 
-        reranked = []
-        for idx in ranked_indices:
-            result = results[idx]
-            result.score = float(combined[idx])
-            reranked.append(result)
+        # Return copies: rerank must not mutate the caller's SearchResult
+        # objects — their original retrieval scores are still needed upstream.
+        reranked = [
+            results[idx].model_copy(update={"score": float(combined[idx])})
+            for idx in ranked_indices
+        ]
 
         logger.debug(
             "Reranked %d results — top score %.3f (tfidf=%.3f retrieval=%.3f)",
diff --git a/requirements-msg.txt b/requirements-msg.txt
new file mode 100644
index 0000000..febbade
--- /dev/null
+++ b/requirements-msg.txt
@@ -0,0 +1,13 @@
+# Optional: Outlook .msg email extraction.
+#
+# Not part of the default install because extract-msg pulls
+# red-black-tree-mod, which ships no prebuilt wheel and needs a C build
+# toolchain (fails `pip install` on minimal/macOS/Windows environments).
+#
+# .eml extraction works without this (stdlib email + eml-parser). Only
+# proprietary Outlook .msg files require it. Without it, submitting a .msg
+# raises a clear RuntimeError rather than failing at import
+# (see app/services/email_extractor.py).
+#
+#   pip install -r requirements-msg.txt
+extract-msg==0.52.0
diff --git a/requirements_ci.txt b/requirements_ci.txt
index 017ff07..7702757 100644
--- a/requirements_ci.txt
+++ b/requirements_ci.txt
@@ -24,7 +24,10 @@ python-multipart>=0.0.22
 sse-starlette==2.1.3
 httpx==0.27.2
 eml-parser==1.17.5
-extract-msg==0.52.0
+# extract-msg (Outlook .msg support) is optional: see requirements-msg.txt.
+# It pulls red-black-tree-mod, which has no prebuilt wheel and needs a C
+# toolchain, so it is kept out of the default install. The code degrades
+# gracefully without it (app/services/email_extractor.py:HAS_EXTRACT_MSG).
 pgvector==0.3.5
 cryptography>=46.0.6
 streamlit>=1.54.0
diff --git a/requirements_full.txt b/requirements_full.txt
index d6f7f45..7712dc8 100644
--- a/requirements_full.txt
+++ b/requirements_full.txt
@@ -24,7 +24,7 @@ python-multipart>=0.0.22
 sse-starlette==2.1.3
 httpx==0.27.2
 eml-parser==1.17.5
-extract-msg==0.52.0
+# extract-msg (Outlook .msg support) is optional: pip install -r requirements-msg.txt
 pgvector==0.3.5
 cryptography>=46.0.6
 streamlit>=1.54.0
diff --git a/tests/integration/test_business_metrics.py b/tests/integration/test_business_metrics.py
index 5b37323..738bff0 100644
--- a/tests/integration/test_business_metrics.py
+++ b/tests/integration/test_business_metrics.py
@@ -113,8 +113,9 @@ async def test_business_metrics_with_jobs(client: AsyncClient, db_session: Async
     # straight_through_rate: at least 2 completed out of 3+
     assert 0.0 <= data["straight_through_rate"] <= 1.0
 
-    # hitl_escalation_rate is a fixed placeholder
-    assert data["hitl_escalation_rate"] == 0.12
+    # hitl_escalation_rate is computed from ExtractedRecord.needs_review;
+    # no records were created here, so it is 0.0.
+    assert data["hitl_escalation_rate"] == 0.0
 
     # avg_cost_usd defaults to 0.03 (no cost_usd on LLMTrace model yet)
     assert data["avg_cost_usd"] == pytest.approx(0.03, rel=1e-3)
diff --git a/tests/unit/test_reranker.py b/tests/unit/test_reranker.py
index 61ebdfe..5189ca2 100644
--- a/tests/unit/test_reranker.py
+++ b/tests/unit/test_reranker.py
@@ -1,16 +1,12 @@
 """Unit tests for TFIDFReranker."""
 from __future__ import annotations
 
-from unittest.mock import MagicMock
-
+from app.services.rag_tools import SearchResult
 from app.services.reranker import TFIDFReranker
 
 
-def _make_result(content: str, score: float = 0.5) -> MagicMock:
-    r = MagicMock()
-    r.content = content
-    r.score = score
-    return r
+def _make_result(content: str, score: float = 0.5) -> SearchResult:
+    return SearchResult(doc_id=f"doc-{content[:24]}", content=content, score=score)
 
 
 class TestTFIDFReranker: