Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions app/api/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from datetime import UTC, datetime, timedelta

from fastapi import APIRouter, Depends, Query
from sqlalchemy import select
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.auth.middleware import get_api_key
from app.dependencies import get_db
from app.models.api_key import APIKey
from app.models.job import ExtractionJob
from app.models.llm_trace import LLMTrace
from app.models.record import ExtractedRecord
from app.schemas.responses import (
BusinessMetricsResponse,
LLMMetricsResponse,
Expand Down Expand Up @@ -117,6 +118,23 @@ async def get_llm_metrics(
)


async def _hitl_escalation_rate(db: AsyncSession, since: datetime) -> float:
    """Return the share of recent records flagged for human review.

    Counts ExtractedRecord rows with ``created_at >= since`` and, in the same
    query, how many of those have ``needs_review`` set to true. The ratio is
    rounded to 4 decimal places. An empty window yields 0.0 instead of
    dividing by zero.
    """
    # Filtered aggregate: only rows flagged for review contribute to this count.
    flagged_count = func.count(ExtractedRecord.id).filter(
        ExtractedRecord.needs_review.is_(True)
    )
    stmt = select(func.count(ExtractedRecord.id), flagged_count).where(
        ExtractedRecord.created_at >= since
    )
    result = await db.execute(stmt)
    total, escalated = result.one()
    if not total:
        return 0.0
    return round(escalated / total, 4)


@router.get("/business", response_model=BusinessMetricsResponse)
async def get_business_metrics(
db: AsyncSession = Depends(get_db),
Expand Down Expand Up @@ -177,7 +195,7 @@ async def get_business_metrics(
p50_ms=round(p50_ms, 1),
p95_ms=round(p95_ms, 1),
docs_30d=docs_30d,
hitl_escalation_rate=0.12,
hitl_escalation_rate=await _hitl_escalation_rate(db, since),
)


Expand All @@ -204,7 +222,7 @@ async def get_quality_trend(
ewma_composite=[],
per_dimension={d: [] for d in ("completeness", "field_accuracy",
"hallucination_absence", "format_compliance")},
escalation_rate=0.0,
escalation_rate=await _hitl_escalation_rate(db, since),
sample_count=0,
)

Expand Down Expand Up @@ -244,6 +262,6 @@ async def get_quality_trend(
days=days,
ewma_composite=ewma_composite,
per_dimension=per_dimension,
escalation_rate=0.12, # placeholder until HITL escalation table exists
escalation_rate=await _hitl_escalation_rate(db, since),
sample_count=len(logs),
)
12 changes: 9 additions & 3 deletions app/services/agentic_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,15 +421,21 @@ def _parse_json(text: str) -> dict:
return {}


def _dedup_key(r: SearchResult) -> str:
    """Return the key used to deduplicate search result ``r``.

    * ``chunk_id is None``: the result carries no chunk identity, so the
      key is ``doc_id``.
    * ``chunk_id == ""``: present but empty. Keying on the bare empty string
      would give every such result the same key regardless of document, so
      the key is scoped by ``doc_id`` instead.
    * otherwise: ``chunk_id`` itself.
    """
    chunk_id = r.chunk_id
    if chunk_id is None:
        return r.doc_id
    if chunk_id == "":
        # The NUL suffix keeps this key distinct from the plain doc_id key
        # used for results that have no chunk_id at all.
        return f"{r.doc_id}\x00"
    return chunk_id

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Preserve doc identity for empty chunk IDs

When a tool returns chunk_id="" (which the new comment explicitly treats as a valid falsy chunk id), this key becomes the same empty string for every result, so _merge_results/_deduplicate will drop all but the first such result even when they come from different doc_ids. Use a composite key that still includes doc_id when chunk_id is present but empty, or normalize empty strings to absent if they are not actually valid.

Useful? React with 👍 / 👎.



def _merge_results(
existing: list[SearchResult],
new: list[SearchResult],
) -> list[SearchResult]:
"""Merge new results into existing, deduplicating by chunk_id/doc_id."""
seen = {(r.chunk_id or r.doc_id) for r in existing}
seen = {_dedup_key(r) for r in existing}
merged = list(existing)
for r in new:
key = r.chunk_id or r.doc_id
key = _dedup_key(r)
if key not in seen:
merged.append(r)
seen.add(key)
Expand All @@ -440,7 +446,7 @@ def _deduplicate(results: list[SearchResult]) -> list[SearchResult]:
seen: set[str] = set()
out: list[SearchResult] = []
for r in results:
key = r.chunk_id or r.doc_id
key = _dedup_key(r)
if key not in seen:
out.append(r)
seen.add(key)
Expand Down
7 changes: 4 additions & 3 deletions app/services/model_ab_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,15 @@ def compute_significance(
var1 = sum((x - mean1) ** 2 for x in control_scores) / n1
var2 = sum((x - mean2) ** 2 for x in treatment_scores) / n2

pooled_se = math.sqrt(var1 / n1 + var2 / n2)
# Welch SE of the mean difference (per-arm variance, not pooled).
se_diff = math.sqrt(var1 / n1 + var2 / n2)

if pooled_se == 0.0:
if se_diff == 0.0:
# Identical distributions — no effect
z_score = 0.0
p_value = 1.0
else:
z_score = (mean1 - mean2) / pooled_se
z_score = (mean1 - mean2) / se_diff
p_value = _p_value_from_z(z_score)

significant = p_value < 0.05
Expand Down
11 changes: 6 additions & 5 deletions app/services/reranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,12 @@ def rerank(
combined = self.alpha * tfidf_scores + (1.0 - self.alpha) * norm_scores
ranked_indices = combined.argsort()[::-1]

reranked = []
for idx in ranked_indices:
result = results[idx]
result.score = float(combined[idx])
reranked.append(result)
# Return copies: rerank must not mutate the caller's SearchResult
# objects — their original retrieval scores are still needed upstream.
reranked = [
results[idx].model_copy(update={"score": float(combined[idx])})
for idx in ranked_indices
]

logger.debug(
"Reranked %d results — top score %.3f (tfidf=%.3f retrieval=%.3f)",
Expand Down
13 changes: 13 additions & 0 deletions requirements-msg.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Optional: Outlook .msg email extraction.
#
# Not part of the default install because extract-msg pulls
# red-black-tree-mod, which ships no prebuilt wheel and needs a C build
# toolchain (fails `pip install` on minimal/macOS/Windows environments).
#
# .eml extraction works without this (stdlib email + eml-parser). Only
# proprietary Outlook .msg files require it. Without it, submitting a .msg
# raises a clear RuntimeError rather than failing at import
# (see app/services/email_extractor.py).
#
# pip install -r requirements-msg.txt
extract-msg==0.52.0
5 changes: 4 additions & 1 deletion requirements_ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ python-multipart>=0.0.22
sse-starlette==2.1.3
httpx==0.27.2
eml-parser==1.17.5
extract-msg==0.52.0
# extract-msg (Outlook .msg support) is optional: see requirements-msg.txt.
# It pulls red-black-tree-mod, which has no prebuilt wheel and needs a C
# toolchain, so it is kept out of the default install. The code degrades
# gracefully without it (app/services/email_extractor.py:HAS_EXTRACT_MSG).
pgvector==0.3.5
cryptography>=46.0.6
streamlit>=1.54.0
Expand Down
2 changes: 1 addition & 1 deletion requirements_full.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ python-multipart>=0.0.22
sse-starlette==2.1.3
httpx==0.27.2
eml-parser==1.17.5
extract-msg==0.52.0
# extract-msg (Outlook .msg support) is optional: pip install -r requirements-msg.txt
pgvector==0.3.5
cryptography>=46.0.6
streamlit>=1.54.0
Expand Down
5 changes: 3 additions & 2 deletions tests/integration/test_business_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,9 @@ async def test_business_metrics_with_jobs(client: AsyncClient, db_session: Async
# straight_through_rate: at least 2 completed out of 3+
assert 0.0 <= data["straight_through_rate"] <= 1.0

# hitl_escalation_rate is a fixed placeholder
assert data["hitl_escalation_rate"] == 0.12
# hitl_escalation_rate is computed from ExtractedRecord.needs_review;
# no records were created here, so it is 0.0.
assert data["hitl_escalation_rate"] == 0.0

# avg_cost_usd defaults to 0.03 (no cost_usd on LLMTrace model yet)
assert data["avg_cost_usd"] == pytest.approx(0.03, rel=1e-3)
Expand Down
10 changes: 3 additions & 7 deletions tests/unit/test_reranker.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
"""Unit tests for TFIDFReranker."""
from __future__ import annotations

from unittest.mock import MagicMock

from app.services.rag_tools import SearchResult
from app.services.reranker import TFIDFReranker


def _make_result(content: str, score: float = 0.5) -> MagicMock:
r = MagicMock()
r.content = content
r.score = score
return r
def _make_result(content: str, score: float = 0.5) -> SearchResult:
    """Build a SearchResult for tests.

    The doc_id is derived from the first 24 characters of ``content``,
    prefixed with ``doc-``.
    """
    doc_id = f"doc-{content[:24]}"
    return SearchResult(doc_id=doc_id, content=content, score=score)


class TestTFIDFReranker:
Expand Down
Loading