diff --git a/app/api/metrics.py b/app/api/metrics.py index 4241169..fcfac4b 100644 --- a/app/api/metrics.py +++ b/app/api/metrics.py @@ -4,7 +4,7 @@ from datetime import UTC, datetime, timedelta from fastapi import APIRouter, Depends, Query -from sqlalchemy import select +from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession from app.auth.middleware import get_api_key @@ -12,6 +12,7 @@ from app.models.api_key import APIKey from app.models.job import ExtractionJob from app.models.llm_trace import LLMTrace +from app.models.record import ExtractedRecord from app.schemas.responses import ( BusinessMetricsResponse, LLMMetricsResponse, @@ -117,6 +118,23 @@ async def get_llm_metrics( ) +async def _hitl_escalation_rate(db: AsyncSession, since: datetime) -> float: + """Fraction of records created since `since` that were flagged for human + review (ExtractedRecord.needs_review). Returns 0.0 when there are no + records in the window.""" + total, escalated = ( + await db.execute( + select( + func.count(ExtractedRecord.id), + func.count(ExtractedRecord.id).filter( + ExtractedRecord.needs_review.is_(True) + ), + ).where(ExtractedRecord.created_at >= since) + ) + ).one() + return round(escalated / total, 4) if total else 0.0 + + @router.get("/business", response_model=BusinessMetricsResponse) async def get_business_metrics( db: AsyncSession = Depends(get_db), @@ -177,7 +195,7 @@ async def get_business_metrics( p50_ms=round(p50_ms, 1), p95_ms=round(p95_ms, 1), docs_30d=docs_30d, - hitl_escalation_rate=0.12, + hitl_escalation_rate=await _hitl_escalation_rate(db, since), ) @@ -204,7 +222,7 @@ async def get_quality_trend( ewma_composite=[], per_dimension={d: [] for d in ("completeness", "field_accuracy", "hallucination_absence", "format_compliance")}, - escalation_rate=0.0, + escalation_rate=await _hitl_escalation_rate(db, since), sample_count=0, ) @@ -244,6 +262,6 @@ async def get_quality_trend( days=days, ewma_composite=ewma_composite, per_dimension=per_dimension, - escalation_rate=0.12, # placeholder until HITL escalation table exists + escalation_rate=await _hitl_escalation_rate(db, since), sample_count=len(logs), ) diff --git a/app/services/agentic_rag.py b/app/services/agentic_rag.py index e0cbadb..58d8f50 100644 --- a/app/services/agentic_rag.py +++ b/app/services/agentic_rag.py @@ -421,15 +421,21 @@ def _parse_json(text: str) -> dict: return {} +def _dedup_key(r: SearchResult) -> str: + # Fall back to doc_id only when chunk_id is genuinely absent (None); + # a falsy-but-valid chunk_id ("") must not collapse distinct chunks. + return r.chunk_id if r.chunk_id is not None else r.doc_id + + def _merge_results( existing: list[SearchResult], new: list[SearchResult], ) -> list[SearchResult]: """Merge new results into existing, deduplicating by chunk_id/doc_id.""" - seen = {(r.chunk_id or r.doc_id) for r in existing} + seen = {_dedup_key(r) for r in existing} merged = list(existing) for r in new: - key = r.chunk_id or r.doc_id + key = _dedup_key(r) if key not in seen: merged.append(r) seen.add(key) @@ -440,7 +446,7 @@ def _deduplicate(results: list[SearchResult]) -> list[SearchResult]: seen: set[str] = set() out: list[SearchResult] = [] for r in results: - key = r.chunk_id or r.doc_id + key = _dedup_key(r) if key not in seen: out.append(r) seen.add(key) diff --git a/app/services/model_ab_test.py b/app/services/model_ab_test.py index 85fa5ab..a457718 100644 --- a/app/services/model_ab_test.py +++ b/app/services/model_ab_test.py @@ -184,14 +184,15 @@ def compute_significance( var1 = sum((x - mean1) ** 2 for x in control_scores) / n1 var2 = sum((x - mean2) ** 2 for x in treatment_scores) / n2 - pooled_se = math.sqrt(var1 / n1 + var2 / n2) + # Welch SE of the mean difference (per-arm variance, not pooled). + se_diff = math.sqrt(var1 / n1 + var2 / n2) - if pooled_se == 0.0: + if se_diff == 0.0: # Identical distributions — no effect z_score = 0.0 p_value = 1.0 else: - z_score = (mean1 - mean2) / pooled_se + z_score = (mean1 - mean2) / se_diff p_value = _p_value_from_z(z_score) significant = p_value < 0.05 diff --git a/app/services/reranker.py b/app/services/reranker.py index 77b7597..7452d22 100644 --- a/app/services/reranker.py +++ b/app/services/reranker.py @@ -83,11 +83,12 @@ def rerank( combined = self.alpha * tfidf_scores + (1.0 - self.alpha) * norm_scores ranked_indices = combined.argsort()[::-1] - reranked = [] - for idx in ranked_indices: - result = results[idx] - result.score = float(combined[idx]) - reranked.append(result) + # Return copies: rerank must not mutate the caller's SearchResult + # objects — their original retrieval scores are still needed upstream. + reranked = [ + results[idx].model_copy(update={"score": float(combined[idx])}) + for idx in ranked_indices + ] logger.debug( "Reranked %d results — top score %.3f (tfidf=%.3f retrieval=%.3f)", diff --git a/requirements-msg.txt b/requirements-msg.txt new file mode 100644 index 0000000..febbade --- /dev/null +++ b/requirements-msg.txt @@ -0,0 +1,13 @@ +# Optional: Outlook .msg email extraction. +# +# Not part of the default install because extract-msg pulls +# red-black-tree-mod, which ships no prebuilt wheel and needs a C build +# toolchain (fails `pip install` on minimal/macOS/Windows environments). +# +# .eml extraction works without this (stdlib email + eml-parser). Only +# proprietary Outlook .msg files require it. Without it, submitting a .msg +# raises a clear RuntimeError rather than failing at import +# (see app/services/email_extractor.py). +# +# pip install -r requirements-msg.txt +extract-msg==0.52.0 diff --git a/requirements_ci.txt b/requirements_ci.txt index 017ff07..7702757 100644 --- a/requirements_ci.txt +++ b/requirements_ci.txt @@ -24,7 +24,10 @@ python-multipart>=0.0.22 sse-starlette==2.1.3 httpx==0.27.2 eml-parser==1.17.5 -extract-msg==0.52.0 +# extract-msg (Outlook .msg support) is optional: see requirements-msg.txt. +# It pulls red-black-tree-mod, which has no prebuilt wheel and needs a C +# toolchain, so it is kept out of the default install. The code degrades +# gracefully without it (app/services/email_extractor.py:HAS_EXTRACT_MSG). pgvector==0.3.5 cryptography>=46.0.6 streamlit>=1.54.0 diff --git a/requirements_full.txt b/requirements_full.txt index d6f7f45..7712dc8 100644 --- a/requirements_full.txt +++ b/requirements_full.txt @@ -24,7 +24,7 @@ python-multipart>=0.0.22 sse-starlette==2.1.3 httpx==0.27.2 eml-parser==1.17.5 -extract-msg==0.52.0 +# extract-msg (Outlook .msg support) is optional: pip install -r requirements-msg.txt pgvector==0.3.5 cryptography>=46.0.6 streamlit>=1.54.0 diff --git a/tests/integration/test_business_metrics.py b/tests/integration/test_business_metrics.py index 5b37323..738bff0 100644 --- a/tests/integration/test_business_metrics.py +++ b/tests/integration/test_business_metrics.py @@ -113,8 +113,9 @@ async def test_business_metrics_with_jobs(client: AsyncClient, db_session: Async # straight_through_rate: at least 2 completed out of 3+ assert 0.0 <= data["straight_through_rate"] <= 1.0 - # hitl_escalation_rate is a fixed placeholder - assert data["hitl_escalation_rate"] == 0.12 + # hitl_escalation_rate is computed from ExtractedRecord.needs_review; + # no records were created here, so it is 0.0. + assert data["hitl_escalation_rate"] == 0.0 # avg_cost_usd defaults to 0.03 (no cost_usd on LLMTrace model yet) assert data["avg_cost_usd"] == pytest.approx(0.03, rel=1e-3) diff --git a/tests/unit/test_reranker.py b/tests/unit/test_reranker.py index 61ebdfe..5189ca2 100644 --- a/tests/unit/test_reranker.py +++ b/tests/unit/test_reranker.py @@ -1,16 +1,12 @@ """Unit tests for TFIDFReranker.""" from __future__ import annotations -from unittest.mock import MagicMock - +from app.services.rag_tools import SearchResult from app.services.reranker import TFIDFReranker -def _make_result(content: str, score: float = 0.5) -> MagicMock: - r = MagicMock() - r.content = content - r.score = score - return r +def _make_result(content: str, score: float = 0.5) -> SearchResult: + return SearchResult(doc_id=f"doc-{content[:24]}", content=content, score=score) class TestTFIDFReranker: