diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 6cc9e5e..972e411 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -39,6 +39,7 @@ review_cycle_days: 21 ### kb-server - Exposes health/readiness and notes/publish APIs. +- Exposes retrieval/context APIs for server-side note discovery and bundling. - Enforces path + extension safety for note files. - Routes writes by source: - `source=api`: queued to PR branch workflow. @@ -76,6 +77,13 @@ review_cycle_days: 21 3. API returns composed content + source branches. 4. Writes to `view=current` are rejected. +### Retrieval Flow + +1. Client requests search or a context bundle. +2. `kb-server` builds a deterministic note graph over the visible view. +3. Results return ranked note candidates, excerpts, and provenance. +4. Content remains read-only until a later explicit write operation. + ## Invariants - `main` remains approved truth. @@ -91,4 +99,3 @@ review_cycle_days: 21 - `docs/product-specs/vault-sync.md` - `docs/SECURITY.md` - `docs/RELIABILITY.md` - diff --git a/docs/RELIABILITY.md b/docs/RELIABILITY.md index 74295f2..4913fc6 100644 --- a/docs/RELIABILITY.md +++ b/docs/RELIABILITY.md @@ -19,6 +19,7 @@ review_cycle_days: 21 ## Service Expectations - `kb-server` readiness requires database and Git-backed vault access. +- `kb-server` retrieval endpoints should rebuild or refresh in-process graph state when visible note state changes. - Autosave worker should tolerate transient Git/network failures. - `vault-sync` should converge after temporary API outages. @@ -31,6 +32,7 @@ review_cycle_days: 21 ## Reliability Signals - API health and readiness checks. +- Retrieval endpoint latency/error rate and cache rebuild logs. - Job/event tables for write and publish operations. - Sync logs for pull/push loop success and retries. diff --git a/docs/exec-plans/active/README.md b/docs/exec-plans/active/README.md new file mode 100644 index 0000000..11d6727 --- /dev/null +++ b/docs/exec-plans/active/README.md @@ -0,0 +1,17 @@ +--- +owner: platform +status: draft +last_verified: 2026-03-12 +source_of_truth: + - ../completed/README.md +related_code: + - ../../../scripts/docs_lint.py +related_tests: + - ../../../kb-server/tests + - ../../../mcp-server/tests +review_cycle_days: 30 +--- + +# Active Plans + +Place in-progress execution plans here while work is underway. diff --git a/docs/generated/api-surface.md b/docs/generated/api-surface.md index 0e7664f..7d2af03 100644 --- a/docs/generated/api-surface.md +++ b/docs/generated/api-surface.md @@ -1,9 +1,10 @@ --- owner: platform status: generated -last_verified: 2026-03-07 +last_verified: 2026-03-12 source_of_truth: - ../../kb-server/app/api/routes/health.py + - ../../kb-server/app/api/routes/context.py - ../../kb-server/app/api/routes/notes.py - ../../kb-server/app/api/routes/publish.py related_code: @@ -15,16 +16,18 @@ review_cycle_days: 7 # API Surface (Generated) -Generated on `2026-03-07` from route handlers. +Generated on `2026-03-12` from route handlers. | Method | Path | | --- | --- | | `GET` | `/health` | | `GET` | `/ready` | +| `POST` | `/search` | +| `POST` | `/bundle` | | `GET` | `/` | | `GET` | `/{path:path}` | | `PUT` | `/{path:path}` | | `DELETE` | `/{path:path}` | | `POST` | `/publish` | -Do not edit manually. Regenerate with `python3 scripts/generate_context_artifacts.py`. \ No newline at end of file +Do not edit manually. Regenerate with `python3 scripts/generate_context_artifacts.py`. diff --git a/docs/generated/env-catalog.md b/docs/generated/env-catalog.md index e09f48d..4fccbae 100644 --- a/docs/generated/env-catalog.md +++ b/docs/generated/env-catalog.md @@ -1,7 +1,7 @@ --- owner: platform status: generated -last_verified: 2026-03-07 +last_verified: 2026-03-06 source_of_truth: - ../../kb-server/.env.example - ../../kb-server/app/core/config.py @@ -16,7 +16,7 @@ review_cycle_days: 7 # Environment Catalog (Generated) -Generated on `2026-03-07` from settings and env sources. +Generated on `2026-03-06` from settings and env sources. ## kb-server `.env.example` @@ -70,4 +70,4 @@ Generated on `2026-03-07` from settings and env sources. | `sync_debounce_seconds` | `2.0` | | `sync_pull_interval_seconds` | `30.0` | -Do not edit manually. Regenerate with `python3 scripts/generate_context_artifacts.py`. \ No newline at end of file +Do not edit manually. Regenerate with `python3 scripts/generate_context_artifacts.py`. diff --git a/docs/product-specs/kb-server.md b/docs/product-specs/kb-server.md index ce86721..bfc40e9 100644 --- a/docs/product-specs/kb-server.md +++ b/docs/product-specs/kb-server.md @@ -26,6 +26,8 @@ Provide a file-first API over a Git-backed vault with explicit approval boundari - `GET /notes` and `GET /notes/{path}` support: - `view=main` (default approved state) - `view=current` (approved + pending composed state) +- `POST /context/search` returns ranked note candidates for a query. +- `POST /context/bundle` returns a token-bounded context bundle with excerpts, optional full content, and provenance. - `PUT /notes/{path}` and `DELETE /notes/{path}` support: - `source=api` for PR-based pending writes - `source=human` for direct approved writes @@ -42,6 +44,7 @@ Provide a file-first API over a Git-backed vault with explicit approval boundari - Allowed file extensions: `.md`, `.markdown`, `.txt`. - No absolute paths and no traversal outside vault root. - API key auth enforced when configured. +- Retrieval is read-only and respects the same view/provenance semantics as note reads. ## Related Operational Docs @@ -49,4 +52,3 @@ Provide a file-first API over a Git-backed vault with explicit approval boundari - `../../kb-server/BRANCHING_AND_CURRENT_VIEW.md` - `../SECURITY.md` - `../RELIABILITY.md` - diff --git a/kb-server/app/api/routes/context.py b/kb-server/app/api/routes/context.py new file mode 100644 index 0000000..adf428b --- /dev/null +++ b/kb-server/app/api/routes/context.py @@ -0,0 +1,103 @@ +from enum import Enum + +from fastapi import APIRouter, Depends, HTTPException + +from app.api.deps import require_api_key +from app.schemas.context import ( + ContextBundleItem, + ContextBundleRequest, + ContextBundleResponse, + ContextSearchRequest, + ContextSearchResponse, + ContextSearchResult, +) +from app.services import retrieval_service + +router = APIRouter( + prefix="/context", + tags=["context"], + dependencies=[Depends(require_api_key)], +) + + +class ViewType(str, Enum): + main = "main" + current = "current" + + +@router.post("/search", response_model=ContextSearchResponse) +def search_context(body: ContextSearchRequest): + if body.view not in {ViewType.main.value, ViewType.current.value}: + raise HTTPException(status_code=400, detail="Unsupported view") + + query = body.query.strip() + if not query: + raise HTTPException(status_code=400, detail="Query must not be blank") + + try: + results = retrieval_service.search_notes( + query=query, + view=body.view, + limit=body.limit, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + + return ContextSearchResponse( + query=query, + view=body.view, + results=[ + ContextSearchResult( + path=result.path, + title=result.title, + score=result.score, + reasons=result.reasons, + excerpt=result.excerpt, + view=result.view, + sources=result.sources, + ) + for result in results + ], + ) + + +@router.post("/bundle", response_model=ContextBundleResponse) +def build_context_bundle(body: ContextBundleRequest): + if body.view not in {ViewType.main.value, ViewType.current.value}: + raise HTTPException(status_code=400, detail="Unsupported view") + + query = body.query.strip() + if not query: + raise HTTPException(status_code=400, detail="Query must not be blank") + + try: + items, used_tokens = retrieval_service.build_context_bundle( + query=query, + view=body.view, + limit=body.limit, + token_budget=body.token_budget, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + + return ContextBundleResponse( + query=query, + view=body.view, + token_budget=body.token_budget, + used_tokens=used_tokens, + items=[ + ContextBundleItem( + path=item.path, + title=item.title, + score=item.score, + reasons=item.reasons, + excerpt=item.excerpt, + view=item.view, + sources=item.sources, + content=item.content, + content_tokens=item.content_tokens, + truncated=item.truncated, + ) + for item in items + ], + ) diff --git a/kb-server/app/main.py b/kb-server/app/main.py index 30f9a56..4623c64 100644 --- a/kb-server/app/main.py +++ b/kb-server/app/main.py @@ -41,10 +41,12 @@ def create_app() -> FastAPI: app.add_middleware(APIKeyMiddleware) from app.api.routes.health import router as health_router + from app.api.routes.context import router as context_router from app.api.routes.notes import router as notes_router from app.api.routes.publish import router as publish_router app.include_router(health_router) + app.include_router(context_router) app.include_router(notes_router) app.include_router(publish_router) diff --git a/kb-server/app/schemas/context.py b/kb-server/app/schemas/context.py new file mode 100644 index 0000000..09c8d35 --- /dev/null +++ b/kb-server/app/schemas/context.py @@ -0,0 +1,44 @@ +from pydantic import BaseModel, Field + + +class ContextSearchRequest(BaseModel): + query: str = Field(min_length=1, max_length=500) + view: str = "current" + limit: int = Field(default=10, ge=1, le=50) + + +class ContextSearchResult(BaseModel): + path: str + title: str + score: float + reasons: list[str] + excerpt: str + view: str + sources: list[str] | None = None + + +class ContextSearchResponse(BaseModel): + query: str + view: str + results: list[ContextSearchResult] + + +class ContextBundleRequest(BaseModel): + query: str = Field(min_length=1, max_length=500) + view: str = "current" + limit: int = Field(default=10, ge=1, le=50) + token_budget: int = Field(default=4000, ge=1, le=50000) + + +class ContextBundleItem(ContextSearchResult): + content: str | None = None + content_tokens: int = 0 + truncated: bool = False + + +class ContextBundleResponse(BaseModel): + query: str + view: str + token_budget: int + used_tokens: int + items: list[ContextBundleItem] diff --git a/kb-server/app/services/current_view_service.py b/kb-server/app/services/current_view_service.py index f0f4a00..fad7282 100644 --- a/kb-server/app/services/current_view_service.py +++ b/kb-server/app/services/current_view_service.py @@ -41,7 +41,10 @@ def _pending_branches() -> list[str]: return git_service.list_branches(pattern=f"{prefix}/*") -def read_note_current(relative_path: str) -> tuple[str, datetime, list[str]]: +def read_note_current( + relative_path: str, + pending_branches: list[str] | None = None, +) -> tuple[str, datetime, list[str]]: """Read a note from the *current* view. Returns ``(content, modified_at, sources)`` where *sources* is the @@ -56,7 +59,7 @@ def read_note_current(relative_path: str) -> tuple[str, datetime, list[str]]: main_branch = settings.git_branch main_content = git_service.show_file(main_branch, relative_path) - pending = _pending_branches() + pending = pending_branches if pending_branches is not None else _pending_branches() winning_content: str | None = main_content sources: list[str] = [] if main_content is not None: @@ -75,14 +78,17 @@ def read_note_current(relative_path: str) -> tuple[str, datetime, list[str]]: return winning_content, now, sources -def list_notes_current(prefix: str = "") -> list[tuple[str, datetime, list[str]]]: +def list_notes_current( + prefix: str = "", + pending_branches: list[str] | None = None, +) -> list[tuple[str, datetime, list[str]]]: """List notes visible in the *current* view. Returns ``[(relative_path, modified_at, sources), ...]`` sorted by path. Each entry includes which branches provide that file. """ main_branch = settings.git_branch - pending = _pending_branches() + pending = pending_branches if pending_branches is not None else _pending_branches() path_sources: dict[str, list[str]] = {} diff --git a/kb-server/app/services/git_service.py b/kb-server/app/services/git_service.py index 64ea25a..2ce3389 100644 --- a/kb-server/app/services/git_service.py +++ b/kb-server/app/services/git_service.py @@ -202,6 +202,11 @@ def current_sha() -> str: return _run("rev-parse", "HEAD").stdout.strip() +def resolve_ref(ref: str) -> str: + """Return the SHA for *ref*.""" + return _run("rev-parse", ref).stdout.strip() + + def show_file(branch: str, path: str) -> str | None: """Read file content from a branch without checking it out. diff --git a/kb-server/app/services/retrieval_service.py b/kb-server/app/services/retrieval_service.py new file mode 100644 index 0000000..1afea17 --- /dev/null +++ b/kb-server/app/services/retrieval_service.py @@ -0,0 +1,439 @@ +"""Read-only retrieval and context bundling over notes.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +import re +from pathlib import PurePosixPath + +from app.core.config import settings +from app.services import current_view_service, git_service, vault_service + +_WORD_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_-]*") +_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") +_MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)]+)\)") +_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s+(.+?)\s*$", re.MULTILINE) +_FRONTMATTER_BOUNDARY_RE = re.compile(r"^---\s*$", re.MULTILINE) +_SNIPPET_MAX_CHARS = 240 + + +@dataclass(slots=True) +class NoteDocument: + path: str + content: str + title: str + headings: list[str] + tags: list[str] + links: set[str] + sources: list[str] + view: str + search_blob: str + tokens: set[str] + backlinks: set[str] = field(default_factory=set) + + +@dataclass(slots=True) +class RetrievalMatch: + path: str + title: str + score: float + reasons: list[str] + excerpt: str + view: str + sources: list[str] + + +@dataclass(slots=True) +class BundleMatch(RetrievalMatch): + content: str | None + content_tokens: int + truncated: bool + + +@dataclass(slots=True) +class RetrievalIndex: + docs: dict[str, NoteDocument] + + +_INDEX_CACHE: dict[tuple[object, ...], RetrievalIndex] = {} + + +def search_notes(query: str, view: str = "current", limit: int = 10) -> list[RetrievalMatch]: + normalized_query = _normalize_query(query) + if not normalized_query: + return [] + + index = _get_index(view) + query_tokens = _tokenize(normalized_query) + scored = _score_documents(index, normalized_query, query_tokens) + ordered = sorted(scored.values(), key=lambda item: (-item.score, item.path)) + + return ordered[:limit] + + +def build_context_bundle( + query: str, + view: str = "current", + limit: int = 10, + token_budget: int = 4000, +) -> tuple[list[BundleMatch], int]: + matches = search_notes(query=query, view=view, limit=limit) + index = _get_index(view) + + used_tokens = 0 + bundle: list[BundleMatch] = [] + for match in matches: + doc = index.docs[match.path] + content_tokens = _estimate_tokens(doc.content) + if used_tokens + content_tokens <= token_budget: + content = doc.content + used_tokens += content_tokens + truncated = False + included_tokens = content_tokens + else: + content = None + truncated = True + included_tokens = 0 + + bundle.append( + BundleMatch( + path=match.path, + title=match.title, + score=match.score, + reasons=match.reasons, + excerpt=match.excerpt, + view=match.view, + sources=match.sources, + content=content, + content_tokens=included_tokens, + truncated=truncated, + ) + ) + + return bundle, used_tokens + + +def _get_index(view: str) -> RetrievalIndex: + cache_key, docs = _load_documents(view) + cached = _INDEX_CACHE.get(cache_key) + if cached is not None: + return cached + + index = RetrievalIndex(docs=docs) + _INDEX_CACHE.clear() + _INDEX_CACHE[cache_key] = index + return index + + +def _load_documents(view: str) -> tuple[tuple[object, ...], dict[str, NoteDocument]]: + if view == "main": + items = vault_service.list_notes() + main_signature = tuple( + (path, int(modified_at.timestamp() * 1_000_000)) + for path, modified_at in items + ) + raw_docs = [] + for path, _ in items: + content, _ = vault_service.read_note(path) + raw_docs.append((path, content, [settings.git_branch])) + cache_key = ("main", main_signature) + elif view == "current": + pending = current_view_service._pending_branches() + ref_signature = tuple( + (branch, git_service.resolve_ref(branch)) + for branch in [settings.git_branch, *pending] + ) + items = current_view_service.list_notes_current(pending_branches=pending) + raw_docs = [] + for path, _, sources in items: + content, _, read_sources = current_view_service.read_note_current( + path, + pending_branches=pending, + ) + raw_docs.append((path, content, read_sources or sources)) + cache_key = ("current", ref_signature) + else: + raise ValueError(f"Unsupported view '{view}'") + + docs = { + doc.path: doc + for doc in ( + _build_document(path=path, content=content, sources=sources, view=view) + for path, content, sources in raw_docs + ) + } + + for doc in docs.values(): + doc.links.intersection_update(docs.keys()) + for target in doc.links: + docs[target].backlinks.add(doc.path) + + return cache_key, docs + + +def _build_document(path: str, content: str, sources: list[str], view: str) -> NoteDocument: + frontmatter, body = _split_frontmatter(content) + title = _extract_title(path, frontmatter, body) + headings = _extract_headings(body) + tags = _extract_tags(frontmatter) + links = _extract_links(path, body) + search_parts = [path, title, *headings, *tags, body] + search_blob = "\n".join(part for part in search_parts if part).lower() + tokens = set(_tokenize(search_blob)) + + return NoteDocument( + path=path, + content=content, + title=title, + headings=headings, + tags=tags, + links=links, + sources=sources, + view=view, + search_blob=search_blob, + tokens=tokens, + ) + + +def _score_documents( + index: RetrievalIndex, + normalized_query: str, + query_tokens: list[str], +) -> dict[str, RetrievalMatch]: + interim: dict[str, tuple[float, list[str]]] = {} + + for doc in index.docs.values(): + score = 0.0 + reasons: list[str] = [] + title_lower = doc.title.lower() + path_lower = doc.path.lower() + + if normalized_query in title_lower: + score += 8.0 + reasons.append("title matches query") + if normalized_query in path_lower: + score += 6.0 + reasons.append("path matches query") + if normalized_query in doc.search_blob: + score += 4.0 + reasons.append("content matches query") + + for token in query_tokens: + if token in path_lower: + score += 2.5 + _append_reason(reasons, f"path contains '{token}'") + if token in title_lower: + score += 3.5 + _append_reason(reasons, f"title contains '{token}'") + if any(token in heading.lower() for heading in doc.headings): + score += 2.0 + _append_reason(reasons, f"heading contains '{token}'") + if any(token == tag.lower() for tag in doc.tags): + score += 2.5 + _append_reason(reasons, f"tag matches '{token}'") + elif token in doc.tokens: + score += 1.0 + _append_reason(reasons, f"body contains '{token}'") + + if score > 0: + interim[doc.path] = (score, reasons[:4]) + + seeds = sorted(interim.items(), key=lambda item: (-item[1][0], item[0]))[:5] + for seed_path, (seed_score, _) in seeds: + if seed_score < 3: + continue + seed_doc = index.docs[seed_path] + for neighbor_path in seed_doc.links: + if neighbor_path == seed_path: + continue + score, reasons = interim.get(neighbor_path, (0.0, [])) + score += 1.0 + _append_reason(reasons, f"linked from {seed_path}") + interim[neighbor_path] = (score, reasons[:4]) + for backlink_path in seed_doc.backlinks: + if backlink_path == seed_path: + continue + score, reasons = interim.get(backlink_path, (0.0, [])) + score += 0.75 + _append_reason(reasons, f"links to {seed_path}") + interim[backlink_path] = (score, reasons[:4]) + + return { + path: RetrievalMatch( + path=path, + title=index.docs[path].title, + score=round(score, 2), + reasons=reasons[:4], + excerpt=_build_excerpt(index.docs[path], normalized_query, query_tokens), + view=index.docs[path].view, + sources=index.docs[path].sources, + ) + for path, (score, reasons) in interim.items() + if score > 0 + } + + +def _split_frontmatter(content: str) -> tuple[str, str]: + if not content.startswith("---\n"): + return "", content + + matches = list(_FRONTMATTER_BOUNDARY_RE.finditer(content)) + if len(matches) < 2 or matches[0].start() != 0: + return "", content + + frontmatter = content[matches[0].end():matches[1].start()].strip() + body = content[matches[1].end():].lstrip("\n") + return frontmatter, body + + +def _extract_title(path: str, frontmatter: str, body: str) -> str: + fields = _parse_frontmatter(frontmatter) + if title := fields.get("title"): + return title[0] + + headings = _extract_headings(body) + if headings: + return headings[0] + + stem = PurePosixPath(path).stem.replace("-", " ").replace("_", " ") + return stem.strip().title() or path + + +def _extract_headings(body: str) -> list[str]: + return [match.strip() for match in _HEADING_RE.findall(body)] + + +def _extract_tags(frontmatter: str) -> list[str]: + fields = _parse_frontmatter(frontmatter) + return fields.get("tags", []) + + +def _parse_frontmatter(frontmatter: str) -> dict[str, list[str]]: + parsed: dict[str, list[str]] = {} + current_key: str | None = None + + for raw_line in frontmatter.splitlines(): + line = raw_line.rstrip() + if not line.strip(): + continue + if line.startswith((" ", "\t")) and current_key and line.strip().startswith("-"): + value = line.strip()[1:].strip().strip("'\"") + if value: + parsed.setdefault(current_key, []).append(value) + continue + current_key = None + if ":" not in line: + continue + key, raw_value = line.split(":", 1) + key = key.strip().lower() + value = raw_value.strip() + if not value: + current_key = key + parsed.setdefault(key, []) + continue + if key == "tags": + parsed[key] = _split_tag_values(value) + else: + parsed[key] = [value.strip().strip("'\"")] + + return parsed + + +def _split_tag_values(value: str) -> list[str]: + cleaned = value.strip() + if cleaned.startswith("[") and cleaned.endswith("]"): + cleaned = cleaned[1:-1] + return [ + token.strip().strip("'\"") + for token in cleaned.split(",") + if token.strip().strip("'\"") + ] + + +def _extract_links(path: str, body: str) -> set[str]: + links: set[str] = set() + for raw in _WIKILINK_RE.findall(body): + target = raw.split("|", 1)[0].strip() + normalized = _normalize_link_target(path, target) + if normalized: + links.add(normalized) + + for raw in _MARKDOWN_LINK_RE.findall(body): + normalized = _normalize_link_target(path, raw) + if normalized: + links.add(normalized) + return links + + +def _normalize_link_target(source_path: str, raw_target: str) -> str | None: + target = raw_target.strip().strip("<>").split("#", 1)[0].split("?", 1)[0] + if not target or "://" in target or target.startswith(("mailto:", "#")): + return None + + if target.startswith("/"): + candidate = PurePosixPath(target.lstrip("/")) + else: + candidate = PurePosixPath(source_path).parent / target + + try: + normalized = candidate.as_posix() + except Exception: + return None + + if normalized.startswith("../"): + return None + + path_obj = PurePosixPath(normalized) + if not path_obj.suffix: + normalized = f"{normalized}.md" + if PurePosixPath(normalized).suffix.lower() not in vault_service.ALLOWED_EXTENSIONS: + return None + return normalized + + +def _build_excerpt(doc: NoteDocument, normalized_query: str, query_tokens: list[str]) -> str: + _, body = _split_frontmatter(doc.content) + lines = [line.strip() for line in body.splitlines() if line.strip()] + for line in lines: + lowered = line.lower() + if normalized_query in lowered or any(token in lowered for token in query_tokens): + return _trim_snippet(_strip_markdown(line)) + if doc.headings: + return _trim_snippet(_strip_markdown(doc.headings[0])) + if lines: + return _trim_snippet(_strip_markdown(lines[0])) + return doc.title + + +def _strip_markdown(text: str) -> str: + text = re.sub(r"^#{1,6}\s*", "", text) + text = re.sub(r"\[\[([^\]|]+)\|?([^\]]*)\]\]", lambda m: m.group(2) or m.group(1), text) + text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) + return text.strip() + + +def _trim_snippet(text: str) -> str: + if len(text) <= _SNIPPET_MAX_CHARS: + return text + return text[: _SNIPPET_MAX_CHARS - 1].rstrip() + "…" + + +def _estimate_tokens(text: str) -> int: + return max(1, len(text.split())) + + +def _normalize_query(query: str) -> str: + return " ".join(query.strip().lower().split()) + + +def _tokenize(text: str) -> list[str]: + return [ + token + for token in (match.group(0).lower() for match in _WORD_RE.finditer(text)) + if len(token) > 1 + ] + + +def _append_reason(reasons: list[str], reason: str) -> None: + if reason not in reasons: + reasons.append(reason) diff --git a/kb-server/kb_server.egg-info/SOURCES.txt b/kb-server/kb_server.egg-info/SOURCES.txt index 6d27c3a..d55049a 100644 --- a/kb-server/kb_server.egg-info/SOURCES.txt +++ b/kb-server/kb_server.egg-info/SOURCES.txt @@ -3,7 +3,9 @@ pyproject.toml app/__init__.py app/main.py app/api/__init__.py +app/api/deps.py app/api/routes/__init__.py +app/api/routes/context.py app/api/routes/health.py app/api/routes/notes.py app/api/routes/publish.py @@ -14,11 +16,15 @@ app/core/logging.py app/models/__init__.py app/models/db.py app/schemas/__init__.py +app/schemas/context.py app/schemas/notes.py app/services/__init__.py +app/services/current_view_service.py app/services/git_batcher.py app/services/git_service.py +app/services/github_service.py app/services/publish_service.py +app/services/retrieval_service.py app/services/vault_service.py app/workers/__init__.py app/workers/autosave.py @@ -28,7 +34,10 @@ kb_server.egg-info/dependency_links.txt kb_server.egg-info/requires.txt kb_server.egg-info/top_level.txt tests/test_autosave.py +tests/test_context_api.py +tests/test_current_view.py tests/test_git_batcher.py tests/test_git_service.py tests/test_notes_api.py +tests/test_source_and_delete.py tests/test_vault_service.py \ No newline at end of file diff --git a/kb-server/tests/test_context_api.py b/kb-server/tests/test_context_api.py new file mode 100644 index 0000000..6a6cb11 --- /dev/null +++ b/kb-server/tests/test_context_api.py @@ -0,0 +1,247 @@ +"""Tests for retrieval service and /context API routes.""" + +from __future__ import annotations + +from contextlib import ExitStack +from pathlib import Path +import subprocess +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from sqlalchemy.pool import StaticPool + +from app.models.db import Base, get_session +from app.services import retrieval_service + +TEST_API_KEY = "test-secret-key-for-context" + + +def _git(vault: Path, *args: str) -> str: + result = subprocess.run( + ["git", *args], + cwd=vault, + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + + +def _commit_file(vault: Path, rel_path: str, content: str, msg: str) -> str: + full = vault / rel_path + full.parent.mkdir(parents=True, exist_ok=True) + full.write_text(content, encoding="utf-8") + _git(vault, "add", "--all") + _git(vault, "commit", "-m", msg) + return _git(vault, "rev-parse", "HEAD") + + +def _create_branch(vault: Path, branch: str) -> None: + _git(vault, "checkout", "-b", branch) + + +def _checkout(vault: Path, branch: str) -> None: + _git(vault, "checkout", branch) + + +@pytest.fixture(autouse=True) +def _patch_vault_settings(tmp_vault: Path): + with patch("app.services.git_service.settings") as gs, \ + patch("app.services.vault_service.settings") as vs, \ + patch("app.services.current_view_service.settings") as cvs, \ + patch("app.services.retrieval_service.settings") as rs: + for patched in (gs, vs, cvs, rs): + patched.vault_path = tmp_vault + patched.git_remote = "origin" + patched.git_branch = "main" + patched.git_batch_branch_prefix = "kb-api" + yield + + +@pytest.fixture() +def vault_with_retrieval_graph(tmp_vault: Path) -> Path: + _commit_file( + tmp_vault, + "notes/mcp-overview.md", + """--- +title: MCP Overview +tags: [mcp, agent] +--- +# MCP Overview + +Flight Deck exposes notes to agent clients. + +See [[retrieval-layer]] for context bundling. +""", + "add mcp overview", + ) + _commit_file( + tmp_vault, + "notes/retrieval-layer.md", + """--- +tags: + - retrieval + - context +--- +# Retrieval Layer + +This note describes graph traversal and context bundles. +""", + "add retrieval layer", + ) + + _create_branch(tmp_vault, "kb-api/2026-03-12") + _commit_file( + tmp_vault, + "notes/mcp-adapter.md", + """# MCP Adapter + +The adapter uses the retrieval layer to return context bundles. +""", + "add pending adapter", + ) + _checkout(tmp_vault, "main") + return tmp_vault + + +def _make_client(tmp_vault: Path) -> tuple[TestClient, ExitStack]: + stack = ExitStack() + engine = create_engine( + "sqlite://", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + SessionLocal = sessionmaker(bind=engine) + Base.metadata.create_all(bind=engine) + + def _override_session(): + session = SessionLocal() + try: + yield session + finally: + session.close() + + auth_settings = stack.enter_context(patch("app.api.deps.settings")) + middleware_settings = stack.enter_context(patch("app.core.auth.settings")) + auth_settings.kb_api_key = TEST_API_KEY + middleware_settings.kb_api_key = TEST_API_KEY + + from app.main import create_app + + app = create_app() + app.dependency_overrides[get_session] = _override_session + client = TestClient(app) + return client, stack + + +@pytest.fixture() +def client(tmp_vault: Path): + client, stack = _make_client(tmp_vault) + try: + yield client + finally: + stack.close() + + +class TestRetrievalService: + def test_graph_expansion_returns_related_note(self, vault_with_retrieval_graph: Path): + results = retrieval_service.search_notes("mcp", view="main", limit=10) + paths = [result.path for result in results] + assert "notes/mcp-overview.md" in paths + assert "notes/retrieval-layer.md" in paths + + retrieval_result = next( + result for result in results if result.path == "notes/retrieval-layer.md" + ) + assert any("linked from" in reason for reason in retrieval_result.reasons) + + def test_current_view_includes_pending_branch_content(self, vault_with_retrieval_graph: Path): + with patch( + "app.services.current_view_service.github_service.list_open_kb_api_prs", + side_effect=Exception("offline"), + ): + results = retrieval_service.search_notes("adapter", view="current", limit=10) + + adapter_result = next( + result for result in results if result.path == "notes/mcp-adapter.md" + ) + assert "kb-api/2026-03-12" in adapter_result.sources + + def test_bundle_respects_token_budget(self, tmp_vault: Path): + _commit_file( + tmp_vault, + "notes/short.md", + "# Retrieval Short\n\nretrieval token\n", + "add short", + ) + _commit_file( + tmp_vault, + "notes/long.md", + "# Retrieval Long\n\n" + "retrieval " * 300, + "add long", + ) + + items, used_tokens = retrieval_service.build_context_bundle( + "retrieval", + view="main", + limit=10, + token_budget=20, + ) + + assert used_tokens <= 20 + assert any(item.content is not None for item in items) + assert any(item.truncated for item in items) + + +class TestContextAPI: + headers = {"X-API-Key": TEST_API_KEY} + + def test_search_requires_api_key(self, client: TestClient): + response = client.post("/context/search", json={"query": "mcp"}) + assert response.status_code == 401 + + def test_search_returns_current_view_results(self, client: TestClient, vault_with_retrieval_graph: Path): + with patch( + "app.services.current_view_service.github_service.list_open_kb_api_prs", + side_effect=Exception("offline"), + ): + response = client.post( + "/context/search", + json={"query": "adapter", "view": "current"}, + headers=self.headers, + ) + + assert response.status_code == 200 + data = response.json() + assert data["view"] == "current" + assert data["results"][0]["path"] == "notes/mcp-adapter.md" + assert "kb-api/2026-03-12" in data["results"][0]["sources"] + + def test_bundle_returns_excerpt_and_budgeted_content(self, client: TestClient, tmp_vault: Path): + _commit_file( + tmp_vault, + "notes/context.md", + "# Context Bundle\n\ncontext retrieval bundle\n", + "add context", + ) + _commit_file( + tmp_vault, + "notes/context-deep.md", + "# Context Deep Dive\n\n" + "context " * 250, + "add context deep", + ) + + response = client.post( + "/context/bundle", + json={"query": "context", "view": "main", "token_budget": 20}, + headers=self.headers, + ) + + assert response.status_code == 200 + data = response.json() + assert data["used_tokens"] <= 20 + assert any(item["content"] is not None for item in data["items"]) + assert any(item["truncated"] for item in data["items"]) diff --git a/scripts/generate_context_artifacts.py b/scripts/generate_context_artifacts.py index 6d02d12..2003ad8 100755 --- a/scripts/generate_context_artifacts.py +++ b/scripts/generate_context_artifacts.py @@ -5,6 +5,7 @@ import datetime as dt import re +import subprocess from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent @@ -15,6 +16,24 @@ def _read(path: Path) -> str: return path.read_text(encoding="utf-8") +def _git_last_change_date(paths: list[Path]) -> str: + dates: list[str] = [] + for path in paths: + proc = subprocess.run( + ["git", "log", "-1", "--format=%cs", "--", str(path)], + cwd=REPO_ROOT, + text=True, + capture_output=True, + check=False, + ) + value = proc.stdout.strip() + if value: + dates.append(value) + if dates: + return max(dates) + return "1970-01-01" + + def _parse_env_example(path: Path) -> list[tuple[str, str]]: items: list[tuple[str, str]] = [] for line in _read(path).splitlines(): @@ -54,11 +73,16 @@ def _parse_routes(path: Path) -> list[tuple[str, str]]: def _write_api_surface() -> None: - health_routes = _parse_routes(REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "health.py") - notes_routes = _parse_routes(REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "notes.py") - publish_routes = _parse_routes(REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "publish.py") - all_routes = health_routes + notes_routes + publish_routes - date = dt.date.today().isoformat() + health_path = REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "health.py" + context_path = REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "context.py" + notes_path = REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "notes.py" + publish_path = REPO_ROOT / "kb-server" / "app" / "api" / "routes" / "publish.py" + health_routes = _parse_routes(health_path) + context_routes = _parse_routes(context_path) + notes_routes = _parse_routes(notes_path) + publish_routes = _parse_routes(publish_path) + all_routes = health_routes + context_routes + notes_routes + publish_routes + date = _git_last_change_date([health_path, context_path, notes_path, publish_path]) content = [ "---", @@ -67,6 +91,7 @@ def _write_api_surface() -> None: f"last_verified: {date}", "source_of_truth:", " - ../../kb-server/app/api/routes/health.py", + " - ../../kb-server/app/api/routes/context.py", " - ../../kb-server/app/api/routes/notes.py", " - ../../kb-server/app/api/routes/publish.py", "related_code:", @@ -87,14 +112,17 @@ def _write_api_surface() -> None: content.append(f"| `{method}` | `{path}` |") content.append("") content.append("Do not edit manually. Regenerate with `python3 scripts/generate_context_artifacts.py`.") - (DOCS_GENERATED / "api-surface.md").write_text("\n".join(content), encoding="utf-8") + (DOCS_GENERATED / "api-surface.md").write_text("\n".join(content) + "\n", encoding="utf-8") def _write_env_catalog() -> None: - date = dt.date.today().isoformat() - kb_env = _parse_env_example(REPO_ROOT / "kb-server" / ".env.example") - kb_defaults = _parse_settings_defaults(REPO_ROOT / "kb-server" / "app" / "core" / "config.py") - vs_defaults = _parse_settings_defaults(REPO_ROOT / "vault-sync" / "vault_sync" / "config.py") + kb_env_path = REPO_ROOT / "kb-server" / ".env.example" + kb_config_path = REPO_ROOT / "kb-server" / "app" / "core" / "config.py" + vs_config_path = REPO_ROOT / "vault-sync" / "vault_sync" / "config.py" + date = _git_last_change_date([kb_env_path, kb_config_path, vs_config_path]) + kb_env = _parse_env_example(kb_env_path) + kb_defaults = _parse_settings_defaults(kb_config_path) + vs_defaults = _parse_settings_defaults(vs_config_path) content = [ "---", @@ -152,7 +180,7 @@ def _write_env_catalog() -> None: content.append("") content.append("Do not edit manually. Regenerate with `python3 scripts/generate_context_artifacts.py`.") - (DOCS_GENERATED / "env-catalog.md").write_text("\n".join(content), encoding="utf-8") + (DOCS_GENERATED / "env-catalog.md").write_text("\n".join(content) + "\n", encoding="utf-8") def main() -> int: @@ -165,4 +193,3 @@ def main() -> int: if __name__ == "__main__": raise SystemExit(main()) -