diff --git a/.env.example b/.env.example index 42ce2db..8959888 100644 --- a/.env.example +++ b/.env.example @@ -9,11 +9,19 @@ CLERK_SECRET_KEY=sk_test_... # Generate at https://openrouter.ai/settings/keys OPENROUTER_API_KEY=sk-or-... +# TinyFish — required by populate agent web search/fetch. +# Generate at https://agent.tinyfish.ai/api-keys +TINYFISH_API_KEY= + # Generate once after the first `make dev` with: # docker compose exec convex ./generate_admin_key.sh # Used by the backend container to call internal Convex functions. CONVEX_SELF_HOSTED_ADMIN_KEY= +# Durable store for self-healing populate recipe manifests. +# Docker dev overrides this to /app/.bigset/populate-recipes on a named volume. +POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes + # PostHog (optional — leave blank to disable analytics entirely in local dev). # Get from https://us.posthog.com/project/settings/general. NEXT_PUBLIC_POSTHOG_KEY= diff --git a/.gitignore b/.gitignore index 7632c39..91c25ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .DS_Store node_modules/ +backend/node_modules .env .env.local Project_BigSet_brief.md @@ -14,6 +15,7 @@ Project_BigSet_brief.md *.log npm-debug.log* yarn-debug.log* +/benchmark-results/ # Local-only files *.bak @@ -21,9 +23,10 @@ tmp/ temp/ .mastra +.bigset/ # Local tarballs *.tgz # Internal docs -BigSet Technical Specs & Goals.md \ No newline at end of file +BigSet Technical Specs & Goals.md diff --git a/CLAUDE.md b/CLAUDE.md index 4df3522..813fbf7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,7 +29,7 @@ Backend is Fastify + Mastra. Fastify serves the HTTP API (Clerk JWT auth on prot The schema inference pipeline: frontend calls `POST /infer-schema` → Fastify verifies the Clerk JWT → calls `inferSchema()` in `backend/src/pipeline/schema-inference.ts` → Claude Sonnet 4.6 via OpenRouter → returns a Zod-validated `DatasetSchema` → frontend maps it to editable columns in the wizard. 
-The populate pipeline: frontend calls `POST /populate` with `{ datasetId, datasetName, description, columns }` → Fastify verifies the Clerk JWT → triggers `populateWorkflow` which: (1) clears existing rows, (2) builds a prompt from the schema, (3) runs the populate agent (Claude Sonnet 4.6) which searches the web via TinyFish APIs, then inserts rows into Convex one by one. Rows appear in realtime on the frontend via Convex reactive queries. +The populate pipeline: frontend calls `POST /populate` with `{ datasetId, datasetName, description, columns }` → Fastify verifies the Clerk JWT → runs the self-healing populate service. The service builds or reuses a recipe, runs the Mastra populate runtime against TinyFish search/fetch, validates source-backed rows, repairs bad recipes, promotes the passing recipe, then atomically replaces the dataset rows in Convex. Rows appear in realtime on the frontend via Convex reactive queries. Convex functions use `ctx.auth.getUserIdentity()` to get the authenticated user. The `ownerId` field on datasets stores `identity.subject` (Clerk user ID). Do not pass `ownerId` from the client. @@ -49,4 +49,10 @@ Convex is self-hosted — it does NOT hot-reload when you edit files in `fronten In CI/prod, run `npx convex deploy` with `CONVEX_SELF_HOSTED_URL` and `CONVEX_SELF_HOSTED_ADMIN_KEY` set as env vars. +## Self-Healing Verification + +Run `make verify-self-healing` before handing the stack to another agent. It runs backend tests, backend build, adapter syntax checks, and a no-key benchmark smoke that should block cleanly without spending API credits. + +Use `bash scripts/verify-self-healing-stack.sh --real-benchmark` for the 2-prompt real Mastra benchmark, and `bash scripts/verify-self-healing-stack.sh --convex-push --dataset-id <dataset-id>` for a live app dataset dry-run. Export the required env vars before live modes; the verifier does not parse secret files itself. Add `--commit` only when you intentionally want to replace rows. 
+ This is an open-source (AGPL) project. Do not commit secrets, API keys, or internal docs. diff --git a/backend/.env.example b/backend/.env.example index 5f6f461..a56d9df 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,6 +1,7 @@ CLIENT_ORIGIN=http://localhost:3500 CONVEX_URL=http://localhost:3210 PORT=3501 +POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes # Required once the backend starts writing rows via internal Convex mutations. # Generate with: docker compose exec convex ./generate_admin_key.sh diff --git a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts new file mode 100644 index 0000000..b8316d7 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts @@ -0,0 +1,114 @@ +import type { FetchedPage } from "../models/schemas.js"; +import type { WorkflowMemory } from "../memory/types.js"; +import { domainMemoryBoost } from "../memory/workflow-memory.js"; +import { getDomain, normalizeUrl } from "../utils/url.js"; + +const SKIP_HOST = + /(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i; +const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i; +const POSITIVE_PATH = + /\/(?:blog|news|docs|documentation|pricing|billing|investor|investors|earnings|financial|reports|press|release|releases|mcp|model-context-protocol|agents|company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i; +const NEGATIVE_PATH = + /\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i; + +export interface LinkFollowOptions { + pages: FetchedPage[]; + excludeUrls: Set; + focusFields?: string[]; + maxTotal: number; + maxPerSource: number; + memory?: 
WorkflowMemory; +} + +function pathTokensFromFields(fields?: string[]): string[] { + if (!fields?.length) return []; + return fields + .flatMap((field) => + field + .split(/[_\s-]+/) + .map((part) => part.toLowerCase()) + .filter((part) => part.length > 3), + ) + .slice(0, 12); +} + +function scoreLink( + link: string, + sourceDomain: string, + focusTokens: string[], + memory?: WorkflowMemory, +): number { + let score = 0; + + try { + const parsed = new URL(link); + const host = parsed.hostname.toLowerCase(); + const path = `${parsed.pathname}${parsed.search}`.toLowerCase(); + + if (SKIP_HOST.test(host) || SKIP_EXT.test(path)) return -1000; + if (NEGATIVE_PATH.test(path)) score -= 2; + if (POSITIVE_PATH.test(path)) score += 4; + + const linkDomain = getDomain(link); + if (linkDomain === sourceDomain) score += 3; + else if (linkDomain.endsWith(`.${sourceDomain}`) || sourceDomain.endsWith(`.${linkDomain}`)) { + score += 2; + } + + for (const token of focusTokens) { + if (path.includes(token)) score += 2; + } + + if (memory) score += domainMemoryBoost(memory, linkDomain); + + if (path.length > 120) score -= 1; + if (parsed.hash.length > 1) score -= 1; + } catch { + return -1000; + } + + return score; +} + +/** Pick outbound links from high-value pages using URL heuristics only. */ +export function selectOutboundLinksToFollow( + options: LinkFollowOptions, +): string[] { + const focusTokens = pathTokensFromFields(options.focusFields); + const selected: string[] = []; + const selectedSet = new Set(); + + const pagesWithLinks = options.pages + .filter((page) => !page.error && page.outbound_links && page.outbound_links.length > 0) + .sort((a, b) => (b.outbound_links?.length ?? 0) - (a.outbound_links?.length ?? 0)); + + for (const page of pagesWithLinks) { + const sourceUrl = normalizeUrl(page.final_url || page.url); + const sourceDomain = getDomain(sourceUrl); + let perSource = 0; + + const ranked = [...(page.outbound_links ?? 
[])] + .map((link) => ({ + link, + score: scoreLink(link, sourceDomain, focusTokens, options.memory), + })) + .filter((item) => item.score > 0) + .sort((a, b) => b.score - a.score); + + for (const { link } of ranked) { + if (selected.length >= options.maxTotal) return selected; + if (perSource >= options.maxPerSource) break; + + const normalized = normalizeUrl(link); + if (options.excludeUrls.has(normalized)) continue; + if (selectedSet.has(normalized)) continue; + if (normalized === sourceUrl) continue; + + selectedSet.add(normalized); + selected.push(link); + perSource += 1; + } + } + + return selected; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts new file mode 100644 index 0000000..e84ad75 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts @@ -0,0 +1,64 @@ +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { agentGoalSchema, type AgentGoal } from "../models/schemas.js"; +import type { DatasetSpec, SourceTriageResult } from "../models/schemas.js"; + +const AGENT_GOAL_SYSTEM = `You are the Navigation Task Agent for a web data collection pipeline. + +Write a Tinyfish Agent goal: a clear natural-language instruction for browser automation on the given URL. + +The agent must navigate the site and return structured JSON with extracted data matching the dataset schema. + +Rules: +- Be specific about what to click, search, filter, or paginate. +- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ] } +- Include column names from the schema in the goal. +- For forms: describe fields to fill and how to submit. +- For detail follow-up: explain how to open each item and which fields to collect. +- Limit scope (e.g. first 25 rows) to keep runs reliable. 
+- Do not invent data; extract only what is visible on the site. +- When workflow_memory is provided, reuse goal patterns from agent_goal_stats_top (high avg_completeness/confidence); avoid domains in domain_stats_weak unless diagnosis says otherwise. +- If latest_diagnosis.prefer_tinyfish_agent or agent_strategy_notes exist, follow them. +- Return ONLY JSON with fields: goal, rationale`; + +export async function generateAgentGoal(options: { + userPrompt: string; + spec: DatasetSpec; + triage: SourceTriageResult; + focusFields?: string[]; + memory?: WorkflowMemory; +}): Promise { + const columnList = options.spec.columns + .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`) + .join(", "); + + return completeJson({ + label: `agent_goal:${options.triage.final_url}`, + schema: agentGoalSchema, + messages: [ + { role: "system", content: AGENT_GOAL_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: options.userPrompt, + triage_status: options.triage.status, + triage_reasoning: options.triage.reasoning, + suggested_action: options.triage.suggested_action, + page_url: options.triage.final_url, + page_title: options.triage.title, + row_grain: options.spec.row_grain, + columns: columnList, + focus_fields: options.focusFields ?? [], + extraction_hints: options.spec.extraction_hints, + workflow_memory: options.memory + ? 
memoryContextForAgents(options.memory) + : undefined, + output_shape: { goal: "string", rationale: "string" }, + }), + }, + ], + }); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts new file mode 100644 index 0000000..288f540 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts @@ -0,0 +1,94 @@ +import type { ColumnDef, DatasetSpec } from "../models/schemas.js"; +import { normalizeSpecColumnOrder } from "./dataset-spec.js"; + +/** Benchmark harness fields from prompts.json (via env in adapters). */ +export interface BenchmarkSpecContext { + promptId?: string; + promptQuality?: string; + persona?: string; + expectedStress?: string; + requiredColumns: string[]; +} + +export function hasBenchmarkRequiredColumns( + context?: BenchmarkSpecContext, +): context is BenchmarkSpecContext & { requiredColumns: string[] } { + return Boolean(context?.requiredColumns?.length); +} + +/** Parse comma-separated column names (CLI flag or benchmark env). */ +export function parseRequiredColumns(value: string): string[] { + const columns = value + .split(",") + .map((name) => name.trim()) + .filter(Boolean); + if (columns.length === 0) { + throw new Error( + "Required columns must include at least one non-empty column name.", + ); + } + return columns; +} + +/** + * Ensures every benchmark-required column name exists on the spec as required. + * Types and descriptions come from the dataset-spec LLM when present; otherwise + * minimal placeholders (no per-column name heuristics). 
+ */ +export function mergeSpecWithBenchmarkRequiredColumns( + spec: DatasetSpec, + context: BenchmarkSpecContext, +): DatasetSpec { + const requiredColumns = context.requiredColumns; + const columnsByName = new Map(spec.columns.map((column) => [column.name, column])); + + const requiredColumnDefs: ColumnDef[] = requiredColumns.map((name) => { + const existing = columnsByName.get(name); + if (existing) { + return { ...existing, required: true }; + } + return { + name, + type: "string", + description: name, + required: true, + }; + }); + + const optionalExtras = spec.columns.filter( + (column) => !requiredColumns.includes(column.name), + ); + + const columns = [...requiredColumnDefs, ...optionalExtras]; + const columnNames = new Set(columns.map((column) => column.name)); + + const isEntityLikeColumn = (name: string): boolean => + /(entity|company|organization|business|restaurant|bakery|provider|product|name|title)/i.test( + name, + ); + + const dedupeKey = + requiredColumns.find( + (name) => columnNames.has(name) && isEntityLikeColumn(name), + ) ?? + spec.dedupe_keys.find((key) => columnNames.has(key)) ?? + requiredColumns.find((name) => columnNames.has(name)) ?? + spec.dedupe_keys[0]; + + const extractionHints = [ + spec.extraction_hints, + `Benchmark required columns (use as exact row keys): ${requiredColumns.join(", ")}.`, + context.expectedStress + ? `Benchmark stress note: ${context.expectedStress}` + : undefined, + ] + .filter(Boolean) + .join("\n"); + + return normalizeSpecColumnOrder({ + ...spec, + columns, + dedupe_keys: dedupeKey ? 
[dedupeKey] : spec.dedupe_keys, + extraction_hints: extractionHints, + }); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts new file mode 100644 index 0000000..be1f489 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts @@ -0,0 +1,194 @@ +import { completeJson } from "../integrations/openrouter.js"; +import type { WorkflowMemory } from "../memory/types.js"; +import { + datasetSpecSchema, + type ColumnDef, + type DatasetSpec, +} from "../models/schemas.js"; +import { + hasBenchmarkRequiredColumns, + mergeSpecWithBenchmarkRequiredColumns, + type BenchmarkSpecContext, +} from "./benchmark-spec.js"; +import { applyPromptSourcePolicyToSpec } from "./source-policy.js"; + +const DATASET_SPEC_SYSTEM = `You are the Dataset Spec Agent for a web data collection pipeline. + +Given a user's data gathering prompt, produce a JSON object that defines: +- what each CSV row represents (row_grain) +- column names, types, and which are required +- dedupe_keys: exactly ONE column name that identifies a unique row (the main entity field, e.g. entity_name or restaurant_name — used as primary key for merge/repair) +- search_queries: diverse web search strings to find sources (use site: operators when helpful) +- extraction_hints: guidance for downstream extraction + +Rules: +- columns[].name must be snake_case +- types must be one of: string, number, boolean, date +- Column order: list every required column first (see ordering below), then optional columns. Do not bury required fields after optional metadata. +- Required columns (required: true): + - The single dedupe_keys field must be required: true. + - Every column that the user_prompt explicitly or clearly implies they want per row (e.g. "who's hiring" → is_hiring; "still active" → is_active; "funding amount" → funding column) must be required: true. 
+ - Do NOT mark only the entity name/identifier as required while leaving core intent fields optional — that blocks the repair loop from filling sparse rows. + - Optional (required: false) only for nice-to-have extras the user did not ask for (e.g. logo_url when they only care about hiring status). +- Required column ordering within columns[]: + 1. the dedupe_keys field first + 2. other required intent fields (what the user asked to collect) + 3. optional fields last +- For type "number", embed the measurement unit in the column name using snake_case + (e.g. funding_amount_usd(millions), employee_count, market_cap_million_usd, growth_rate_percent). + Choose units that match the user's intent; describe the unit in columns[].description when helpful. + Do not use bare numeric names like "amount", "price", or "funding" without a unit, for example, if the + numeric value is in millions, use "funding_amount_million_usd" instead of "funding_amount_usd". +- search_queries should be specific, varied (5-8 queries), and likely to surface pages with list/table data +- Temporal relevance for search_queries: + - Use the provided current_date / current_year when a query needs a time anchor (e.g. "2026", "latest", "recent"). + - Do NOT default to past years (e.g. 2024) unless the user_prompt explicitly names that year or date range. + - If the user says "recent", "current", "latest", or implies up-to-date data, anchor queries to current_year. + - If the user gives no time constraint, prefer evergreen queries OR current_year only when recency clearly matters for the dataset. + - If the user specifies a year or date (e.g. "in 2024", "Q1 2023"), use exactly what they asked for. 
+- target_row_count should reflect the user's implied or stated goal +- Return ONLY JSON, no markdown`; + +function currentTimeContext(): { current_date: string; current_year: number } { + const now = new Date(); + return { + current_date: now.toISOString().slice(0, 10), + current_year: now.getFullYear(), + }; +} + +/** Ensure exactly one valid dedupe key exists on the spec. */ +export function normalizeDedupeKey(spec: DatasetSpec): DatasetSpec { + const columnNames = new Set(spec.columns.map((column) => column.name)); + let key = spec.dedupe_keys[0]; + + if (!key || !columnNames.has(key)) { + const firstRequired = spec.columns.find((column) => column.required); + key = firstRequired?.name ?? spec.columns[0]?.name ?? key; + } + + if (!key) { + return spec; + } + + return { ...spec, dedupe_keys: [key] }; +} + +/** Enforce required-first column order even if the model returns a different order. */ +export function normalizeSpecColumnOrder(spec: DatasetSpec): DatasetSpec { + const byName = new Map(spec.columns.map((col) => [col.name, col])); + const ordered: ColumnDef[] = []; + const used = new Set(); + + for (const key of spec.dedupe_keys.slice(0, 1)) { + const col = byName.get(key); + if (!col || used.has(key)) continue; + ordered.push({ ...col, required: true }); + used.add(key); + } + + for (const col of spec.columns) { + if (used.has(col.name) || !col.required) continue; + ordered.push(col); + used.add(col.name); + } + + for (const col of spec.columns) { + if (used.has(col.name)) continue; + ordered.push(col); + used.add(col.name); + } + + return { ...spec, columns: ordered }; +} + +export async function generateDatasetSpec( + prompt: string, + targetRows: number, + priorMemory?: WorkflowMemory | null, + benchmark?: BenchmarkSpecContext, +): Promise { + const { current_date, current_year } = currentTimeContext(); + + const spec = await completeJson({ + label: "dataset_spec", + schema: datasetSpecSchema, + messages: [ + { role: "system", content: 
DATASET_SPEC_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: prompt, + target_row_count: targetRows, + current_date, + current_year, + prior_workflow_memory: + priorMemory && priorMemory.prompt_fingerprint + ? { + query_stats_top: [...priorMemory.query_stats] + .filter((q) => q.record_count > 0) + .slice(-8), + domain_stats_top: [...priorMemory.domain_stats] + .filter((d) => d.record_count > 0) + .slice(-8), + domain_stats_weak: [...priorMemory.domain_stats] + .filter( + (d) => + d.fetch_failures > 0 || + (d.record_count > 0 && d.avg_completeness < 0.5), + ) + .slice(-6), + dedupe_keys: priorMemory.dedupe_keys, + strategy_notes: priorMemory.strategy_notes.slice(-5), + } + : undefined, + column_order_note: + "required columns first: dedupe_keys in order, then other required intent fields, then optional", + benchmark_context: hasBenchmarkRequiredColumns(benchmark) + ? { + prompt_id: benchmark.promptId, + prompt_quality: benchmark.promptQuality, + persona: benchmark.persona, + expected_stress: benchmark.expectedStress, + required_columns: benchmark.requiredColumns, + instruction: + "When required_columns is present, columns[].name MUST use those exact snake_case names as the core schema (all required: true). You may add optional extra columns only if they do not replace or rename required_columns. 
Align search_queries and extraction_hints to satisfy the user_prompt and expected_stress.", + } + : undefined, + output_shape: { + intent_summary: "string", + target_row_count: "number", + row_grain: "string", + columns: [ + { + name: "string (snake_case)", + type: "string | number | boolean | date", + description: "string", + required: + "boolean — true for dedupe_keys and every field the user_prompt asks to collect per row", + }, + ], + dedupe_keys: ["string — exactly one primary entity column name"], + search_queries: ["string"], + extraction_hints: "string", + }, + }), + }, + ], + }); + + let normalized = normalizeDedupeKey( + normalizeSpecColumnOrder({ + ...spec, + target_row_count: targetRows, + }), + ); + + normalized = applyPromptSourcePolicyToSpec(normalized, prompt); + + if (hasBenchmarkRequiredColumns(benchmark)) { + normalized = mergeSpecWithBenchmarkRequiredColumns(normalized, benchmark); + } + + return normalized; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts b/backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts new file mode 100644 index 0000000..eba28c1 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts @@ -0,0 +1,82 @@ +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; +import { + buildLlmExtractionResultSchema, + finalizeExtractedRecords, + type LlmExtractionRecord, +} from "./extract.js"; + +/** + * Parses one Tinyfish agent result JSON per call (see process-pages.ts agent branch). + * Not used for fetched-page markdown; that path uses extractFromPage. + */ + +const EXTRACT_AGENT_SYSTEM = `You are the Extraction Agent parsing output from a Tinyfish browser automation run. + +Convert the agent result JSON into dataset records matching the schema. 
+ +Rules: +- Only include facts present in the agent result. Do not invent values. +- row keys must match spec column names exactly. +- For number columns, numeric values only (unit is in the column name). +- evidence: field, quote, and url for fields you populated when you have a supporting quote (url = where that quote was found; use page_url when from this page). Not required for every column. +- Do not return source_urls. +- extraction_confidence (0–1) per record when possible. +- Provenance URL columns: set per row to the URL where that row's data came from (use page_url when appropriate). +- If the agent result has no usable rows, return an empty records array. +- Return ONLY JSON`; + +export async function extractFromAgentResult(options: { + spec: DatasetSpec; + pageUrl: string; + agentResult: Record | null; + focusFields?: string[]; + memory?: WorkflowMemory; +}): Promise { + if (!options.agentResult || Object.keys(options.agentResult).length === 0) { + return []; + } + + const result = await completeJson({ + label: `extract_agent:${options.pageUrl}`, + schema: buildLlmExtractionResultSchema(options.spec), + messages: [ + { role: "system", content: EXTRACT_AGENT_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + dataset_spec: { + intent_summary: options.spec.intent_summary, + row_grain: options.spec.row_grain, + columns: options.spec.columns, + }, + page_url: options.pageUrl, + agent_result: options.agentResult, + focus_fields: options.focusFields ?? [], + workflow_memory: options.memory + ? 
memoryContextForAgents(options.memory) + : undefined, + output_shape: { + records: [ + { + row: { column_name: "value or null" }, + evidence: [{ field: "column_name", url: "string", quote: "string" }], + extraction_confidence: "0-1 number", + }, + ], + }, + }), + }, + ], + }); + + return finalizeExtractedRecords( + result.records as LlmExtractionRecord[], + options.pageUrl, + options.spec, + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts new file mode 100644 index 0000000..b6d8a04 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts @@ -0,0 +1,312 @@ +import { z } from "zod"; +import { config } from "../config.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { + extractedRecordSchema, + fieldEvidenceSchema, + type ColumnDef, + type DatasetSpec, + type ExtractedRecord, + type FetchedPage, +} from "../models/schemas.js"; +import { + deriveRecordSourceUrls, + isHttpUrl, + isUrlLikeColumnName, +} from "../records/source-urls.js"; + +/** + * Extraction is always one source per LLM call in process-pages.ts: + * - extractFromPage: one fetched page's markdown per call (parallelized per page). + * - extractFromAgentResult: one Tinyfish agent JSON payload per call (separate module). + * + * LLM returns row + sparse evidence + extraction_confidence; code attaches evidence URLs + * and source_urls. Provenance URL columns come from the LLM row values per record. 
+ */ + +const llmFieldEvidenceSchema = fieldEvidenceSchema + .omit({ url: true }) + .extend({ url: z.string().optional() }); + +export type LlmExtractionRecord = { + row: Record; + evidence: z.infer[]; + extraction_confidence?: number; +}; + +function columnValueSchema( + column: ColumnDef, +): z.ZodType { + switch (column.type) { + case "number": + return z.union([z.number(), z.null()]); + case "boolean": + return z.union([z.boolean(), z.null()]); + default: + return z.union([z.string(), z.null()]); + } +} + +/** Explicit column keys so AI SDK structured output guides the model to populate row fields. */ +export function buildLlmExtractionResultSchema(spec: DatasetSpec) { + const rowShape: Record = {}; + for (const column of spec.columns) { + rowShape[column.name] = columnValueSchema(column); + } + + const llmExtractionRecordSchema = z.object({ + row: z.object(rowShape), + evidence: z.array(llmFieldEvidenceSchema), + extraction_confidence: z.number().min(0).max(1).optional(), + }); + + return z.object({ + records: z.array(llmExtractionRecordSchema), + notes: z.string().optional(), + }); +} + +const EXTRACTION_SYSTEM = `You are the Extraction Agent for a web data collection pipeline. + +Extract structured records from the provided page content according to the dataset specification. + +Rules: +- Only extract facts supported by the page text. Do not invent data. +- row keys must match spec column names exactly. +- For columns with type "number", store numeric values only (no unit text in the value; the unit is already in the column name). +- Use null for unknown values. +- Return multiple records if the page lists multiple entities matching row_grain. +- If the page has no relevant data, return an empty records array. +- evidence: include field, quote, and url for fields you populated when you have a supporting quote (url = where that quote was found; use the page URL when from this page). Not required for every column. +- Do not return source_urls on the record. 
+- extraction_confidence (0–1): how confident you are this row is accurate. +- Provenance URL columns (e.g. source_url, evidence_url, or columns described as where data was found): set each row's value to the URL where that row's facts came from — use the provided page URL when all fields for that row are from this page, or a more specific URL only if clearly stated on the page. +- Do not copy unrelated URLs into provenance columns (e.g. do not set source_url to the page URL when pricing_page_url already holds the pricing URL and source_url should cite where you read the plan). +- Return ONLY JSON`; + +function truncatePageText(text: string): string { + if (text.length <= config.maxPageChars) return text; + return `${text.slice(0, config.maxPageChars)}\n\n[truncated]`; +} + +function isEmpty(value: unknown): boolean { + return value === null || value === undefined || value === ""; +} + +function coerceEvidenceToColumnValue( + column: ColumnDef, + quote: string, +): string | number | boolean | null { + const trimmed = quote.trim(); + if (!trimmed) return null; + + switch (column.type) { + case "boolean": { + const lower = trimmed.toLowerCase(); + if ( + /\b(true|yes|active|hiring|looking for|open roles|open positions|join us|join our team|we(?:'re| are) hiring|see open roles)\b/.test( + lower, + ) + ) { + return true; + } + if ( + /\b(false|no|not hiring|no careers|does not contain|lack of|without)\b/.test( + lower, + ) + ) { + return false; + } + return null; + } + case "number": { + const parsed = Number(trimmed.replace(/,/g, "")); + return Number.isFinite(parsed) ? 
parsed : null; + } + default: + return trimmed; + } +} + +function hydrateRowFromEvidence( + row: Record, + evidence: Array<{ field: string; quote: string }>, + spec: DatasetSpec, +): void { + const columnByName = new Map(spec.columns.map((column) => [column.name, column])); + + for (const item of evidence) { + if (isEmpty(row[item.field])) { + const column = columnByName.get(item.field); + if (!column) continue; + const value = coerceEvidenceToColumnValue(column, item.quote); + if (value !== null) { + row[item.field] = value; + } + } + } +} + +/** Columns meant to hold a citation URL for where row data was found (not content URLs). */ +export function isProvenanceUrlColumn(column: ColumnDef): boolean { + const name = column.name.toLowerCase(); + if (name === "source_url" || name === "evidence_url") { + return true; + } + if (name.endsWith("_source_url")) { + return true; + } + const description = column.description.toLowerCase(); + return ( + name.includes("source") && + name.includes("url") && + (description.includes("evidence") || + description.includes("provenance") || + description.includes("where")) + ); +} + +function provenanceUrlColumns(spec: DatasetSpec): ColumnDef[] { + return spec.columns.filter(isProvenanceUrlColumn); +} + +function isUrlLikeColumn(column: ColumnDef): boolean { + return isUrlLikeColumnName(column.name); +} + +function addUrlCellEvidence( + row: Record, + evidence: ExtractedRecord["evidence"], + spec: DatasetSpec, +): void { + const fieldsWithEvidence = new Set(evidence.map((item) => item.field)); + for (const column of spec.columns) { + if (!isUrlLikeColumn(column) || fieldsWithEvidence.has(column.name)) { + continue; + } + const value = row[column.name]; + if (!isHttpUrl(value)) continue; + evidence.push({ + field: column.name, + url: value.trim(), + quote: value.trim(), + }); + fieldsWithEvidence.add(column.name); + } +} + +/** Attach evidence URLs and source_urls; keep LLM row and provenance values. 
*/ +export function finalizeExtractedRecord( + record: LlmExtractionRecord, + pageUrl: string, + spec: DatasetSpec, +): ExtractedRecord { + const row = { ...record.row }; + hydrateRowFromEvidence(row, record.evidence, spec); + + const evidence = record.evidence.map((item) => ({ + field: item.field, + quote: item.quote, + url: item.url?.trim() || pageUrl, + })); + + for (const column of provenanceUrlColumns(spec)) { + if (column.required && isEmpty(row[column.name])) { + row[column.name] = pageUrl; + } + } + addUrlCellEvidence(row, evidence, spec); + + const source_urls = deriveRecordSourceUrls({ + spec, + row, + evidence, + fallbackUrls: [pageUrl], + }); + + return extractedRecordSchema.parse({ + row, + evidence, + source_urls, + ...(record.extraction_confidence !== undefined + ? { extraction_confidence: record.extraction_confidence } + : {}), + }); +} + +export function finalizeExtractedRecords( + records: LlmExtractionRecord[], + pageUrl: string, + spec: DatasetSpec, +): ExtractedRecord[] { + return records.map((record) => finalizeExtractedRecord(record, pageUrl, spec)); +} + +export interface ExtractOptions { + focusFields?: string[]; +} + +export async function extractFromPage( + spec: DatasetSpec, + page: FetchedPage, + options: ExtractOptions & { memory?: WorkflowMemory } = {}, +): Promise { + if (page.error || !page.text.trim()) { + return []; + } + + const pageUrl = page.final_url || page.url; + const result = await completeJson({ + label: `extraction:${pageUrl}`, + schema: buildLlmExtractionResultSchema(spec), + messages: [ + { role: "system", content: EXTRACTION_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + dataset_spec: { + intent_summary: spec.intent_summary, + row_grain: spec.row_grain, + columns: spec.columns, + extraction_hints: spec.extraction_hints, + }, + page: { + url: pageUrl, + title: page.title, + text: truncatePageText(page.text), + }, + ...(options.focusFields?.length + ? 
{ + focus_fields: options.focusFields, + instruction: + "Prioritize extracting focus_fields. Use null only when the page truly lacks that information.", + } + : {}), + workflow_memory: options.memory + ? memoryContextForAgents(options.memory) + : undefined, + output_shape: { + records: [ + { + row: { column_name: "value or null" }, + evidence: [{ field: "column_name", url: "string", quote: "string" }], + extraction_confidence: "0-1 number", + }, + ], + notes: "optional string", + }, + }), + }, + ], + }); + + return finalizeExtractedRecords( + result.records as LlmExtractionRecord[], + pageUrl, + spec, + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts b/backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts new file mode 100644 index 0000000..be77e5e --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts @@ -0,0 +1,80 @@ +import type { CoverageReport } from "../coverage/analyze.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { repairDiagnosisSchema, type RepairDiagnosis } from "../memory/types.js"; +import type { DatasetSpec } from "../models/schemas.js"; +import type { SourcesReport } from "../models/quality.js"; + +const DIAGNOSIS_SYSTEM = `You are the Repair Diagnosis Agent for a web data collection pipeline. + +A repair loop just finished (or is about to start). Analyze workflow memory, coverage gaps, and source outcomes to explain what failed and how the next search/fetch/agent pass should change. + +Rules: +- Be specific and actionable — cite domains, query patterns, and triage/agent failures from memory when relevant. +- recommended_search_patterns: concrete query templates or angles (not duplicates of failed_queries). +- domains_to_prioritize: hosts that previously yielded records or match the missing fields. 
+- domains_to_avoid: hosts that failed fetch, blocked, or returned no usable rows. +- prefer_tinyfish_agent: true when static fetch/extract failed but navigation or forms are likely needed. +- extraction_notes: hints for extract agents (e.g. which columns are still null, evidence issues). +- Return ONLY JSON`; + +export async function generateRepairDiagnosis(options: { + userPrompt: string; + spec: DatasetSpec; + coverage: CoverageReport; + memory: WorkflowMemory; + sources?: SourcesReport; + repairLoop: number; + maxRepairLoops: number; +}): Promise { + const failedOutcomes = + options.sources?.failed.slice(0, 20).map((item) => ({ + url: item.url, + outcome: item.outcome, + error: item.error?.slice(0, 120), + })) ?? []; + + return completeJson({ + label: `repair_diagnosis:loop${options.repairLoop}`, + schema: repairDiagnosisSchema, + messages: [ + { role: "system", content: DIAGNOSIS_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: options.userPrompt, + repair_loop: options.repairLoop, + max_repair_loops: options.maxRepairLoops, + dataset_spec: { + intent_summary: options.spec.intent_summary, + row_grain: options.spec.row_grain, + columns: options.spec.columns, + dedupe_keys: options.spec.dedupe_keys, + }, + coverage: { + total_records: options.coverage.total_records, + complete_count: options.coverage.complete_count, + partial_count: options.coverage.partial_count, + required_columns: options.coverage.required_columns, + field_gaps: options.coverage.field_gaps, + }, + source_failures_sample: failedOutcomes, + workflow_memory: memoryContextForAgents(options.memory), + output_shape: { + summary: "string", + likely_causes: ["string"], + recommended_search_patterns: ["string"], + domains_to_prioritize: ["string"], + domains_to_avoid: ["string"], + prefer_tinyfish_agent: "boolean", + agent_strategy_notes: "optional string", + extraction_notes: "optional string", + }, + }), + }, + ], + }); +} diff --git 
a/backend/BigSet_Data_Collection_Agent/src/agents/repair-queries.ts b/backend/BigSet_Data_Collection_Agent/src/agents/repair-queries.ts new file mode 100644 index 0000000..441778b --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/repair-queries.ts @@ -0,0 +1,108 @@ +import { z } from "zod"; +import type { CoverageReport } from "../coverage/analyze.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import type { RepairDiagnosis } from "../memory/types.js"; +import type { DatasetSpec } from "../models/schemas.js"; + +const repairQueriesSchema = z.object({ + repair_queries: z.array(z.string()).min(1), + rationale: z.string(), +}); + +export type RepairQueriesResult = z.infer; + +function buildRepairQueriesSystem(maxQueries: number): string { + const minQueries = Math.min(2, maxQueries); + return `You are the Coverage & Query Planning Agent for a web data collection pipeline. + +After an initial extraction pass, some required fields are still missing. Generate targeted web search queries to find pages that can fill those gaps. + +Rules: +- Return between ${minQueries} and ${maxQueries} repair_queries (the user message includes max_queries — use as many distinct queries as needed, up to that limit). +- Prefer more queries when multiple fields or example rows need coverage (e.g. one query angle per missing field or per entity in example_rows). +- Each query should aim at a different source angle (company site, press release, database, registry, news). +- Include entity names or attributes from example_rows when available. +- Do NOT repeat or lightly rephrase queries already in prior_search_queries. +- Temporal rules (same as initial search): + - Use current_year / current_date when recency matters unless the user_prompt names a specific year. + - Do not default to outdated years. 
+- Prefer queries likely to return factual detail pages, not generic listicles. +- Use workflow_memory.query_stats_weak (low completeness/confidence) to avoid repeating bad queries; prefer angles similar to query_stats_top. +- Use workflow_memory.domain_stats_top / domain_stats_weak when choosing site: operators or domains to target. +- Follow recommended_search_patterns from latest_diagnosis when present. +- Return ONLY JSON`; +} + +function currentTimeContext(): { current_date: string; current_year: number } { + const now = new Date(); + return { + current_date: now.toISOString().slice(0, 10), + current_year: now.getFullYear(), + }; +} + +export async function generateRepairQueries(options: { + userPrompt: string; + spec: DatasetSpec; + coverage: CoverageReport; + priorSearchQueries: string[]; + maxQueries: number; + memory?: WorkflowMemory; + diagnosis?: RepairDiagnosis; + repairLoop?: number; +}): Promise { + const { current_date, current_year } = currentTimeContext(); + + const result = await completeJson({ + label: "repair_queries", + schema: repairQueriesSchema, + messages: [ + { + role: "system", + content: buildRepairQueriesSystem(options.maxQueries), + }, + { + role: "user", + content: JSON.stringify({ + user_prompt: options.userPrompt, + current_date, + current_year, + max_queries: options.maxQueries, + instruction: `Generate up to ${options.maxQueries} distinct repair_queries. 
Use as many as needed to cover missing fields and example rows; do not stop at 5 unless you have fewer useful angles.`, + dataset_spec: { + intent_summary: options.spec.intent_summary, + row_grain: options.spec.row_grain, + columns: options.spec.columns, + dedupe_keys: options.spec.dedupe_keys, + }, + coverage: { + total_records: options.coverage.total_records, + complete_count: options.coverage.complete_count, + partial_count: options.coverage.partial_count, + partial_record_ids: options.coverage.partial_record_ids, + field_gaps: options.coverage.field_gaps, + }, + prior_search_queries: options.priorSearchQueries, + repair_loop: options.repairLoop ?? options.memory?.repair_loop_count ?? 0, + repair_diagnosis: options.diagnosis, + workflow_memory: options.memory + ? memoryContextForAgents(options.memory) + : undefined, + output_shape: { + repair_queries: ["string"], + rationale: "string", + }, + }), + }, + ], + }); + + return { + ...result, + repair_queries: result.repair_queries.slice(0, options.maxQueries), + }; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts new file mode 100644 index 0000000..1ea3b54 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts @@ -0,0 +1,420 @@ +import type { + DatasetSpec, + ExtractedRecord, + SourceCandidate, + SourceTriageResult, +} from "../models/schemas.js"; +import { scoreDocsUrlForOfficialSource } from "../records/source-urls.js"; +import { getDomain } from "../utils/url.js"; + +export interface PromptSourceEntity { + name: string; + primaryToken: string; + domainTokens: string[]; +} + +export interface PromptSourcePolicy { + requiresOfficialSource: boolean; + entities: PromptSourceEntity[]; + searchPhrases: string[]; + hint?: string; +} + +const ENTITY_STOPWORDS = new Set([ + "a", + "an", + "and", + "company", + "companies", + "corp", + "corporation", + "for", + "from", + "inc", + "llc", + "ltd", 
+ "of", + "official", + "page", + "pages", + "the", +]); + +const ENTITY_LIST_INTRODUCER = /\b(?:for|from)\s+([^?.;:]+)/gi; +const ENTITY_LIST_CUTOFF = + /\b(?:collect|find|include|give|make|show|table|with|need|return|list|shown)\b/i; +const GENERIC_HOSTED_DOMAIN = + /(?:^|\.)((github|gitlab)\.(io|com)|gitbook\.io|readthedocs\.io|notion\.site|medium\.com|substack\.com)$/i; + +function taskTextFromPrompt(prompt: string): string { + const taskLine = prompt.match(/^Task:\s*(.+)$/im)?.[1]; + return taskLine?.trim() || prompt; +} + +function uniqueStrings(values: string[]): string[] { + return [...new Set(values.map((value) => value.trim()).filter(Boolean))]; +} + +function tokenize(value: string): string[] { + return value + .toLowerCase() + .replace(/[^a-z0-9]+/g, " ") + .split(/\s+/) + .filter((token) => token.length >= 2 && !ENTITY_STOPWORDS.has(token)); +} + +function looksLikeEntityName(value: string): boolean { + const trimmed = value.trim(); + if (!trimmed || trimmed.length > 60) return false; + if (/^(?:and|or|the|official|latest|recent|current)$/i.test(trimmed)) { + return false; + } + return /[A-Z]/.test(trimmed[0] ?? "") || /[a-z][A-Z]/.test(trimmed); +} + +function splitEntityList(value: string): string[] { + const beforeVerb = value.split(ENTITY_LIST_CUTOFF)[0] ?? value; + const nestedFrom = beforeVerb.match(/\bfrom\s+(.+)$/i)?.[1]; + const entitySegment = nestedFrom ?? beforeVerb; + return entitySegment + .replace(/\s+and\s+/gi, ",") + .split(",") + .map((part) => part.trim().replace(/^and\s+/i, "").replace(/[.?!]$/g, "")) + .filter(looksLikeEntityName); +} + +function extractExplicitEntities(prompt: string): PromptSourceEntity[] { + const names: string[] = []; + for (const match of prompt.matchAll(ENTITY_LIST_INTRODUCER)) { + names.push(...splitEntityList(match[1] ?? "")); + } + + return uniqueStrings(names).map((name) => { + const domainTokens = tokenize(name); + return { + name, + primaryToken: domainTokens.at(-1) ?? 
name.toLowerCase(), + domainTokens, + }; + }); +} + +function searchPhrasesForPrompt(prompt: string): string[] { + const lower = prompt.toLowerCase(); + const phrases: string[] = []; + + if (lower.includes("pricing")) { + phrases.push("official pricing page", "billing pricing"); + } + if (lower.includes("investor relations") || lower.includes("earnings release")) { + phrases.push("reports quarterly results", "investor relations earnings release"); + } + if (lower.includes("mcp")) { + phrases.push("MCP connector docs", "model context protocol docs"); + } else if (lower.includes("docs") || lower.includes("documentation")) { + phrases.push("official docs"); + } + if (lower.includes("blog post") || lower.includes("blog posts")) { + phrases.push("official blog latest post"); + } + if (lower.includes("official website") || lower.includes("official websites")) { + phrases.push("official website"); + } + if (lower.includes("official") && phrases.length === 0) { + phrases.push("official source"); + } + + return uniqueStrings(phrases); +} + +function wantsDocsSource(policy: PromptSourcePolicy): boolean { + return policy.searchPhrases.some((phrase) => + /\b(?:docs|documentation|mcp|model context protocol)\b/i.test(phrase), + ); +} + +function isWeakDocsSurface(url: string): boolean { + return /\b(?:blog|news|course|academy|directory|skilljar)\b/i.test(url); +} + +function preferredDocsHost(entity: PromptSourceEntity): string { + const primary = entity.primaryToken.toLowerCase(); + if (primary === "openai") return "developers.openai.com"; + if (primary === "cloudflare") return "developers.cloudflare.com"; + if (primary === "anthropic") return "platform.claude.com"; + return `docs.${primary}.com`; +} + +function officialDomainAliasesForEntity(entity: PromptSourceEntity): string[] { + const primary = entity.primaryToken.toLowerCase(); + if (primary === "anthropic") { + return ["docs.anthropic.com", "platform.claude.com"]; + } + return []; +} + +export function 
derivePromptSourcePolicy(prompt: string): PromptSourcePolicy { + const taskText = taskTextFromPrompt(prompt); + const entities = extractExplicitEntities(taskText); + const searchPhrases = searchPhrasesForPrompt(taskText); + const lower = taskText.toLowerCase(); + const asksForCanonicalSource = + searchPhrases.length > 0 || + lower.includes("source url") || + lower.includes("source page"); + const requiresOfficialSource = + entities.length > 0 && + asksForCanonicalSource && + (lower.includes("official") || + lower.includes("pricing") || + lower.includes("investor relations") || + lower.includes("earnings release") || + lower.includes("docs") || + lower.includes("documentation") || + lower.includes("blog post")); + + const hint = requiresOfficialSource + ? [ + "Prompt source policy: user requested canonical/official sources for named entities.", + `Named entities: ${entities.map((entity) => entity.name).join(", ")}.`, + "Use official entity-owned domains for source_url, evidence, pricing/docs/blog/IR URLs, and required facts.", + "Use third-party pages only for discovery; do not use them as evidence when an official entity-owned page is available.", + ].join("\n") + : undefined; + + return { requiresOfficialSource, entities, searchPhrases, hint }; +} + +export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] { + if (!policy.requiresOfficialSource || policy.entities.length === 0) { + return []; + } + + const phrases = policy.searchPhrases.length + ? policy.searchPhrases + : ["official source"]; + const primaryPhrase = phrases[0] ?? "official source"; + const siteQualifiedDocsQueries = wantsDocsSource(policy) + ? 
policy.entities.map( + (entity) => + `${entity.name} ${primaryPhrase} site:${preferredDocsHost(entity)}`, + ) + : []; + + return uniqueStrings( + [ + ...siteQualifiedDocsQueries, + ...policy.entities.flatMap((entity) => + phrases.map((phrase) => `${entity.name} ${phrase}`), + ), + ], + ); +} + +export function applyPromptSourcePolicyToSpec( + spec: DatasetSpec, + prompt: string, +): DatasetSpec { + const policy = derivePromptSourcePolicy(prompt); + if (!policy.requiresOfficialSource) { + return spec; + } + + return { + ...spec, + search_queries: uniqueStrings([ + ...promptSourceSearchQueries(policy), + ...spec.search_queries, + ]), + extraction_hints: [spec.extraction_hints, policy.hint] + .filter(Boolean) + .join("\n"), + }; +} + +export function urlMatchesPromptSourcePolicy( + url: string, + policy: PromptSourcePolicy, +): boolean { + if (!policy.requiresOfficialSource) return true; + const domain = getDomain(url).toLowerCase(); + if (GENERIC_HOSTED_DOMAIN.test(domain)) { + return false; + } + return policy.entities.some( + (entity) => urlMatchesEntitySourcePolicy(url, entity, policy), + ); +} + +function urlMatchesEntitySourcePolicy( + url: string, + entity: PromptSourceEntity, + policy: PromptSourcePolicy, +): boolean { + const domain = getDomain(url).toLowerCase(); + if (GENERIC_HOSTED_DOMAIN.test(domain)) { + return false; + } + const entityOwnedDomain = + domain.includes(entity.primaryToken) || + officialDomainAliasesForEntity(entity).some((alias) => + domain.endsWith(alias), + ); + if (!entityOwnedDomain) { + return false; + } + if (wantsDocsSource(policy) && isWeakDocsSurface(url)) { + return false; + } + return true; +} + +export function sourceCandidatePolicyBoost( + candidate: SourceCandidate, + policy: PromptSourcePolicy, +): number { + if (!policy.requiresOfficialSource) return 0; + + const searchableText = [ + candidate.url, + candidate.title, + candidate.snippet, + candidate.site_name, + ] + .join(" ") + .toLowerCase(); + const matchedEntity = 
policy.entities.some((entity) => + entity.domainTokens.some((token) => searchableText.includes(token)), + ); + const matchedDomain = urlMatchesPromptSourcePolicy(candidate.url, policy); + const officialLanguage = + /\b(official|pricing|docs|documentation|investor relations|earnings|blog)\b/.test( + searchableText, + ); + const docsSurface = + wantsDocsSource(policy) && + /(?:^|\/\/)(?:docs|developers)\.|\/(?:docs|documentation|guides|api\/docs|agents)(?:\/|$)/.test( + searchableText, + ); + const weakDocsSurface = + wantsDocsSource(policy) && + /\b(?:blog|news|course|academy|directory|skilljar)\b/.test(searchableText); + + if (matchedDomain && matchedEntity && docsSurface) return 7; + if (matchedDomain && matchedEntity && officialLanguage) { + return weakDocsSurface ? 2 : 5; + } + if (matchedDomain && matchedEntity) return weakDocsSurface ? 1 : 4; + if (matchedDomain) return 3; + if (matchedEntity && officialLanguage) return 1; + return -2; +} + +export function applyPromptSourcePolicyToTriageResult( + result: SourceTriageResult, + policy: PromptSourcePolicy, +): SourceTriageResult { + if ( + !policy.requiresOfficialSource || + ![ + "extract_now", + "requires_navigation", + "requires_form_submission", + "requires_detail_page_followup", + ].includes(result.status) || + urlMatchesPromptSourcePolicy(result.final_url || result.url, policy) + ) { + return result; + } + + const domain = getDomain(result.final_url || result.url); + return { + ...result, + status: "low_value", + source_data_confidence: Math.min(result.source_data_confidence, 0.3), + expected_yield: "none", + reasoning: + `Prompt asks for official/canonical sources for named entities; ${domain} ` + + `does not match ${policy.entities.map((entity) => entity.name).join(", ")}. ` + + `Original triage: ${result.reasoning}`, + suggested_action: + result.suggested_action ?? 
+ "Search/fetch the named entity's official domain instead of extracting this third-party page.", + }; +} + +export function recordMatchesPromptSourcePolicy( + record: ExtractedRecord, + spec: DatasetSpec, + policy: PromptSourcePolicy, +): boolean { + if (!policy.requiresOfficialSource) { + return true; + } + + const entity = matchingPromptEntityForRecord(record, spec, policy); + if (!entity) { + return true; + } + + const urls = urlsForRecordSourcePolicy(record, spec); + if (urls.length === 0) { + return false; + } + + return urls.some((url) => urlMatchesEntitySourcePolicy(url, entity, policy)); +} + +function matchingPromptEntityForRecord( + record: ExtractedRecord, + spec: DatasetSpec, + policy: PromptSourcePolicy, +): PromptSourceEntity | null { + const primaryColumn = + spec.dedupe_keys[0] ?? + spec.columns.find((column) => + /(name|title|company|organization|entity)/i.test(column.name), + )?.name; + const primaryValue = String( + primaryColumn ? record.row[primaryColumn] ?? "" : "", + ).toLowerCase(); + const rowText = Object.values(record.row).join(" ").toLowerCase(); + + return ( + policy.entities.find((entity) => { + const name = entity.name.toLowerCase(); + return ( + primaryValue.includes(name) || + primaryValue.includes(entity.primaryToken) || + rowText.includes(name) + ); + }) ?? 
null + ); +} + +function urlsForRecordSourcePolicy( + record: ExtractedRecord, + spec: DatasetSpec, +): string[] { + const urls = new Set(); + for (const url of record.source_urls) { + if (isHttpUrl(url)) urls.add(url.trim()); + } + for (const column of spec.columns) { + if (!isUrlLikeColumnName(column.name)) continue; + const value = record.row[column.name]; + if (isHttpUrl(value)) urls.add(value.trim()); + } + return [...urls].sort((a, b) => { + return scoreDocsUrlForOfficialSource(b) - scoreDocsUrlForOfficialSource(a); + }); +} + +function isHttpUrl(value: unknown): value is string { + return typeof value === "string" && /^https?:\/\//i.test(value.trim()); +} + +function isUrlLikeColumnName(name: string): boolean { + const lower = name.toLowerCase(); + return lower === "url" || lower.endsWith("_url") || lower.includes("url"); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts new file mode 100644 index 0000000..68939e5 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts @@ -0,0 +1,108 @@ +import { config } from "../config.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { sourceStatusSchema } from "../models/source-status.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { + sourceTriageResultSchema, + type DatasetSpec, + type FetchedPage, + type SourceTriageResult, +} from "../models/schemas.js"; +import { + applyPromptSourcePolicyToTriageResult, + derivePromptSourcePolicy, +} from "./source-policy.js"; + +const TRIAGE_SYSTEM = `You are the Source Triage Agent for a web data collection pipeline. + +Classify each fetched web page to decide how the pipeline should process it. + +Status definitions: +- extract_now: Page already contains a usable list/table or enough inline data to extract rows directly. 
+- requires_navigation: Data exists but requires clicking through menus, pagination, tabs, or multi-step browsing. +- requires_form_submission: Data requires filling and submitting a search/filter form. +- requires_detail_page_followup: Page is an index; each item needs opening a detail page to get full fields. +- irrelevant: Page is unrelated to the dataset intent. +- duplicate: Page largely repeats data already covered (same listings, mirror content). +- blocked: Login wall, CAPTCHA, access denied, or bot block. +- low_value: Related but unlikely to yield useful rows (thin content, ads-only, generic homepage). + +Rules: +- Prefer extract_now when markdown already has list/table-style content matching row_grain. +- Use requires_* statuses when static fetch text is clearly incomplete for the schema. +- Mark duplicate only when the page would not yield any NEW rows beyond known_entities (if provided): same listings or mirror content with no additional primary keys visible. If the page may list entities not in known_entities, prefer extract_now or partial yield instead of duplicate. +- source_data_confidence: how confident you are that accurate, complete rows can be extracted (0–1). +- expected_yield: "complete" if full rows likely available inline; "partial" if only some fields; "none" if no useful rows. +- confidence: your confidence in this triage classification itself (routing), not data quality. +- When workflow_memory is provided: use domain_stats_top (high avg_completeness and avg_confidence) as strong extract_now signals; domain_stats_weak suggests blocked, low_value, or partial-only unless content clearly matches intent. 
+- Return ONLY JSON`; + +function truncate(text: string): string { + if (text.length <= config.maxPageChars) return text; + return `${text.slice(0, config.maxPageChars)}\n\n[truncated]`; +} + +export async function triagePage(options: { + userPrompt: string; + spec: DatasetSpec; + page: FetchedPage; + knownEntityKeys?: string[]; + memory?: WorkflowMemory; +}): Promise { + const pageUrl = options.page.final_url || options.page.url; + + const result = await completeJson({ + label: `triage:${pageUrl}`, + schema: sourceTriageResultSchema, + messages: [ + { role: "system", content: TRIAGE_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: options.userPrompt, + dataset_spec: { + intent_summary: options.spec.intent_summary, + row_grain: options.spec.row_grain, + columns: options.spec.columns, + extraction_hints: options.spec.extraction_hints, + }, + known_entities: options.knownEntityKeys ?? [], + workflow_memory: options.memory + ? memoryContextForAgents(options.memory) + : undefined, + page: { + url: pageUrl, + title: options.page.title, + text: truncate(options.page.text), + }, + output_shape: { + url: "string", + final_url: "string", + title: "string", + status: "extract_now | requires_navigation | ...", + confidence: "0-1 triage routing confidence", + source_data_confidence: "0-1 expected data accuracy if extracted", + expected_yield: "complete | partial | none", + reasoning: "string", + suggested_action: "optional string", + }, + }), + }, + ], + }); + + const normalizedResult = { + ...result, + url: options.page.url, + final_url: pageUrl, + title: options.page.title || result.title, + status: sourceStatusSchema.parse(result.status), + }; + return applyPromptSourcePolicyToTriageResult( + normalizedResult, + derivePromptSourcePolicy(options.userPrompt), + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/config.ts b/backend/BigSet_Data_Collection_Agent/src/config.ts new file mode 100644 index 0000000..875747c --- /dev/null +++ 
b/backend/BigSet_Data_Collection_Agent/src/config.ts @@ -0,0 +1,114 @@ +function readBool(name: string, fallback: boolean): boolean { + const raw = process.env[name]; + if (raw === undefined || raw === "") return fallback; + return ["1", "true", "yes", "on"].includes(raw.toLowerCase()); +} + +function readFloat(name: string, fallback: number): number { + const raw = process.env[name]; + if (!raw) return fallback; + const value = Number.parseFloat(raw); + if (Number.isNaN(value) || value < 0 || value > 1) { + throw new Error(`Invalid ${name}: expected number 0–1, got "${raw}"`); + } + return value; +} + +function readOptionalFloat(name: string): number | undefined { + const raw = process.env[name]; + if (raw === undefined || raw === "") return undefined; + const value = Number.parseFloat(raw); + if (Number.isNaN(value) || value < 0 || value > 2) { + throw new Error(`Invalid ${name}: expected number 0–2, got "${raw}"`); + } + return value; +} + +function readInt(name: string, fallback: number): number { + const raw = process.env[name]; + if (!raw) return fallback; + const value = Number.parseInt(raw, 10); + if (Number.isNaN(value) || value <= 0) { + throw new Error(`Invalid ${name}: expected positive integer, got "${raw}"`); + } + return value; +} + +export const config = { + tinyfishApiKey: process.env.TINYFISH_API_KEY ?? "", + openRouterApiKey: process.env.OPENROUTER_API_KEY ?? "", + openRouterModel: process.env.OPENROUTER_MODEL ?? "google/gemini-3.1-flash-lite", + openRouterSiteUrl: + process.env.OPENROUTER_SITE_URL ?? + "https://github.com/MMeteorL/BigSet_Data_Collection_Agent", + openRouterAppName: + process.env.OPENROUTER_APP_NAME ?? "BigSet Data Collection Agent", + /** Omit temperature by default — Gemini/reasoning models on OpenRouter reject it. Set OPENROUTER_TEMPERATURE to override. 
*/ + openRouterTemperature: readOptionalFloat("OPENROUTER_TEMPERATURE"), + maxSearchQueries: readInt("MAX_SEARCH_QUERIES", 6), + maxResultsPerQuery: readInt("MAX_RESULTS_PER_QUERY", 5), + maxUrlsToFetch: readInt("MAX_URLS_TO_FETCH", 20), + maxPageChars: readInt("MAX_PAGE_CHARS", 12000), + extractionConcurrency: readInt("EXTRACTION_CONCURRENCY", 5), + fetchBatchSize: readInt("FETCH_BATCH_SIZE", 10), + fetchConcurrency: readInt("FETCH_CONCURRENCY", 4), + searchConcurrency: readInt("SEARCH_CONCURRENCY", 4), + maxConcurrentPerDomain: readInt("MAX_CONCURRENT_PER_DOMAIN", 2), + maxRetries: readInt("MAX_RETRIES", 2), + retryBaseDelayMs: readInt("RETRY_BASE_DELAY_MS", 1000), + openRouterRpm: readInt("OPENROUTER_RPM", 60), + tinyfishSearchRpm: readInt("TINYFISH_SEARCH_RPM", 30), + tinyfishFetchRpm: readInt("TINYFISH_FETCH_RPM", 30), + tinyfishAgentRpm: readInt("TINYFISH_AGENT_RPM", 10), + enableRepairLoop: readBool("ENABLE_REPAIR_LOOP", true), + maxRepairLoops: readInt("MAX_REPAIR_LOOPS", 3), + enableWorkflowMemory: readBool("ENABLE_WORKFLOW_MEMORY", true), + maxRepairQueries: readInt("MAX_REPAIR_QUERIES", 4), + maxRepairResultsPerQuery: readInt("MAX_REPAIR_RESULTS_PER_QUERY", 5), + maxRepairUrlsToFetch: readInt("MAX_REPAIR_URLS_TO_FETCH", 10), + /** Top historical queries to re-run on the next Search API page during repair. */ + maxRepairSearchPaginationQueries: readInt( + "MAX_REPAIR_SEARCH_PAGINATION_QUERIES", + 2, + ), + /** Highest Search API page index (API allows 0–10). 
*/ + maxSearchPage: readInt("MAX_SEARCH_PAGE", 10), + enableRepairLinkFollow: readBool("ENABLE_REPAIR_LINK_FOLLOW", true), + maxRepairLinkUrls: readInt("MAX_REPAIR_LINK_URLS", 8), + maxLinksPerSourcePage: readInt("MAX_LINKS_PER_SOURCE_PAGE", 3), + enableTriage: readBool("ENABLE_TRIAGE", true), + enableTinyfishAgent: readBool("ENABLE_TINYFISH_AGENT", true), + maxAgentRunsPerPhase: readInt("MAX_AGENT_RUNS_PER_PHASE", 5), + agentConcurrency: readInt("AGENT_CONCURRENCY", 2), + /** Parallel `/run-async` queue submissions per agent phase. */ + agentQueueConcurrency: readInt("AGENT_QUEUE_CONCURRENCY", 10), + /** Parallel `runs.get` polls while agent jobs execute on Tinyfish. */ + agentPollConcurrency: readInt("AGENT_POLL_CONCURRENCY", 10), + agentPollIntervalMs: readInt("AGENT_POLL_INTERVAL_MS", 3000), + agentPollTimeoutMs: readInt("AGENT_POLL_TIMEOUT_MS", 1_200_000), + triageConcurrency: readInt("TRIAGE_CONCURRENCY", 5), + enableQualityScoring: readBool("ENABLE_QUALITY_SCORING", true), + /** results.csv only includes rows with all required fields, ranked by quality. */ + enableSelectiveResults: readBool("ENABLE_SELECTIVE_RESULTS", true), + qualityLowConfidenceThreshold: readFloat("QUALITY_LOW_CONFIDENCE_THRESHOLD", 0.55), + qualityReviewThreshold: readFloat("QUALITY_REVIEW_THRESHOLD", 0.75), + qualitySourceConfidenceThreshold: readFloat( + "QUALITY_SOURCE_CONFIDENCE_THRESHOLD", + 0.5, + ), + qualityExtractionConfidenceThreshold: readFloat( + "QUALITY_EXTRACTION_CONFIDENCE_THRESHOLD", + 0.6, + ), +} as const; + +export function assertConfig(): void { + const missing: string[] = []; + if (!config.tinyfishApiKey) missing.push("TINYFISH_API_KEY"); + if (!config.openRouterApiKey) missing.push("OPENROUTER_API_KEY"); + if (missing.length > 0) { + throw new Error( + `Missing required environment variables: ${missing.join(", ")}. 
Copy .env.example to .env and fill in values.`, + ); + } +} diff --git a/backend/BigSet_Data_Collection_Agent/src/coverage/analyze.ts b/backend/BigSet_Data_Collection_Agent/src/coverage/analyze.ts new file mode 100644 index 0000000..e1a364c --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/coverage/analyze.ts @@ -0,0 +1,116 @@ +import { canonicalRecordId } from "../merge/records.js"; +import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; + +export interface FieldGap { + column: string; + description: string; + missing_count: number; + missing_pct: number; + /** Partial rows missing this field (for repair query context). */ + example_rows: Record[]; +} + +export interface CoverageReport { + total_records: number; + required_columns: string[]; + field_gaps: FieldGap[]; + should_repair: boolean; + /** Rows with all required fields present. */ + complete_count: number; + /** Rows missing at least one required field. */ + partial_count: number; + /** Record ids (canonical) for partial rows — for repair planning. */ + partial_record_ids: string[]; +} + +function isEmpty(value: unknown): boolean { + return value === null || value === undefined || value === ""; +} + +export function analyzeCoverage( + spec: DatasetSpec, + records: ExtractedRecord[], +): CoverageReport { + const requiredColumns = spec.columns.filter((col) => col.required); + + const fieldGaps: FieldGap[] = requiredColumns + .map((col) => { + const missingRecords = records.filter((record) => + isEmpty(record.row[col.name]), + ); + + return { + column: col.name, + description: col.description, + missing_count: missingRecords.length, + missing_pct: + records.length > 0 ? 
missingRecords.length / records.length : 1, + example_rows: missingRecords.slice(0, 5).map((record) => record.row), + }; + }) + .filter((gap) => gap.missing_count > 0 || records.length === 0); + + const shouldRepair = + fieldGaps.length > 0 && + (records.length === 0 || fieldGaps.some((gap) => gap.missing_count > 0)); + + const partialRecordIds: string[] = []; + let completeCount = 0; + + for (const record of records) { + const missingRequired = requiredColumns.some((col) => + isEmpty(record.row[col.name]), + ); + if (missingRequired) { + const id = canonicalRecordId(record, spec); + if (id) partialRecordIds.push(id); + } else { + completeCount += 1; + } + } + + return { + total_records: records.length, + required_columns: requiredColumns.map((col) => col.name), + field_gaps: fieldGaps, + should_repair: shouldRepair, + complete_count: completeCount, + partial_count: partialRecordIds.length, + partial_record_ids: partialRecordIds, + }; +} + +export function countFilledGaps( + spec: DatasetSpec, + before: ExtractedRecord[], + after: ExtractedRecord[], + columns: string[], +): Record { + const filled = Object.fromEntries(columns.map((col) => [col, 0])) as Record< + string, + number + >; + + const afterByKey = new Map(); + for (const record of after) { + const key = canonicalRecordId(record, spec); + if (key && !afterByKey.has(key)) { + afterByKey.set(key, record); + } + } + + for (const prev of before) { + const key = canonicalRecordId(prev, spec); + if (!key) continue; + const next = afterByKey.get(key); + if (!next) continue; + + for (const column of columns) { + if (isEmpty(prev.row[column]) && !isEmpty(next.row[column])) { + filled[column] = (filled[column] ?? 
0) + 1; + } + } + } + + return filled; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/export/csv-compiler.ts b/backend/BigSet_Data_Collection_Agent/src/export/csv-compiler.ts new file mode 100644 index 0000000..0514376 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/export/csv-compiler.ts @@ -0,0 +1,199 @@ +import { writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import { canonicalRecordId } from "../merge/records.js"; +import type { RecordQuality } from "../models/quality.js"; +import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; + +function escapeCsv(value: string): string { + if (/[",\n\r]/.test(value)) { + return `"${value.replace(/"/g, '""')}"`; + } + return value; +} + +function cellValue(value: unknown): string { + if (value === null || value === undefined) return ""; + if (typeof value === "boolean") return value ? "true" : "false"; + return String(value); +} + +const QUALITY_COLUMNS = [ + "record_id", + "record_status", + "needs_review", + "completeness_pct", + "confidence_score", + "missing_required_fields", + "review_reasons", +] as const; + +function fieldConfidenceColumns(spec: DatasetSpec): string[] { + return spec.columns + .filter((col) => col.required) + .map((col) => `${col.name}_confidence`); +} + +function qualityCells( + quality: RecordQuality | undefined, + spec: DatasetSpec, +): string[] { + if (!quality) { + return [ + ...QUALITY_COLUMNS.map(() => ""), + ...fieldConfidenceColumns(spec).map(() => ""), + ]; + } + const requiredConfidenceCells = spec.columns + .filter((col) => col.required) + .map((col) => { + const value = quality.field_confidences[col.name]; + return escapeCsv(value !== undefined ? String(value) : ""); + }); + + return [ + escapeCsv(quality.record_id), + escapeCsv(quality.record_status), + escapeCsv(quality.needs_review ? 
"true" : "false"), + escapeCsv(String(quality.completeness_pct)), + escapeCsv(String(quality.confidence_score)), + escapeCsv(quality.missing_required_fields.join("; ")), + escapeCsv(quality.review_reasons.join("; ")), + ...requiredConfidenceCells, + ]; +} + +export async function writeResultsCsv( + path: string, + spec: DatasetSpec, + records: ExtractedRecord[], + qualityByRecordId?: Map, +): Promise { + const columnNames = spec.columns.map((c) => c.name); + const metaColumns = ["primary_source_url", "all_source_urls"]; + const includeQuality = qualityByRecordId !== undefined; + const header = [ + ...columnNames, + ...metaColumns, + ...(includeQuality + ? [...QUALITY_COLUMNS, ...fieldConfidenceColumns(spec)] + : []), + ]; + + const lines = [header.map(escapeCsv).join(",")]; + + for (const record of records) { + const cells = columnNames.map((name) => + escapeCsv(cellValue(record.row[name])), + ); + const primarySource = record.source_urls[0] ?? ""; + const allSources = record.source_urls.join(" | "); + cells.push(escapeCsv(primarySource), escapeCsv(allSources)); + + if (includeQuality) { + const recordId = canonicalRecordId(record, spec); + const quality = recordId ? 
qualityByRecordId.get(recordId) : undefined; + cells.push(...qualityCells(quality, spec)); + } + + lines.push(cells.join(",")); + } + + await writeFile(path, `${lines.join("\n")}\n`, "utf8"); +} + +export async function writeEvidenceJsonl( + path: string, + spec: DatasetSpec, + records: ExtractedRecord[], + qualityByRecordId?: Map, +): Promise { + const lines = records.map((record) => { + const recordId = canonicalRecordId(record, spec); + const payload: Record = { + row: record.row, + evidence: record.evidence, + source_urls: record.source_urls, + }; + if (record.extraction_confidence !== undefined) { + payload.extraction_confidence = record.extraction_confidence; + } + if (recordId && qualityByRecordId?.has(recordId)) { + const quality = qualityByRecordId.get(recordId)!; + payload.quality = quality; + if (Object.keys(quality.field_confidences).length > 0) { + payload.field_confidences = quality.field_confidences; + } + } + return JSON.stringify(payload); + }); + + const body = lines.length > 0 ? 
`${lines.join("\n")}\n` : ""; + await writeFile(path, body, "utf8"); +} + +export function qualityMapFromReport( + qualities: RecordQuality[], +): Map { + return new Map(qualities.map((quality) => [quality.record_id, quality])); +} + +export async function writeSegmentedRecordCsvs( + root: string, + spec: DatasetSpec, + records: ExtractedRecord[], + qualities: RecordQuality[], +): Promise { + const qualityById = qualityMapFromReport(qualities); + const recordIdFor = (record: ExtractedRecord) => canonicalRecordId(record, spec); + + const complete = records.filter((record) => { + const id = recordIdFor(record); + return id && qualityById.get(id)?.record_status === "complete"; + }); + const partial = records.filter((record) => { + const id = recordIdFor(record); + return id && qualityById.get(id)?.record_status === "partial"; + }); + const lowConfidence = records.filter((record) => { + const id = recordIdFor(record); + return id && qualityById.get(id)?.record_status === "low_confidence"; + }); + const needingReview = records.filter((record) => { + const id = recordIdFor(record); + return id && qualityById.get(id)?.needs_review === true; + }); + + await writeResultsCsv( + join(root, "records_complete.csv"), + spec, + complete, + qualityById, + ); + await writeResultsCsv( + join(root, "records_partial.csv"), + spec, + partial, + qualityById, + ); + await writeResultsCsv( + join(root, "records_low_confidence.csv"), + spec, + lowConfidence, + qualityById, + ); + await writeResultsCsv( + join(root, "records_needing_review.csv"), + spec, + needingReview, + qualityById, + ); +} + +export async function writeUnkeyedRecordsJsonl( + path: string, + records: ExtractedRecord[], +): Promise { + const lines = records.map((record) => JSON.stringify(record)); + const body = lines.length > 0 ? 
`${lines.join("\n")}\n` : ""; + await writeFile(path, body, "utf8"); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/export/select-results.ts b/backend/BigSet_Data_Collection_Agent/src/export/select-results.ts new file mode 100644 index 0000000..643bb9f --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/export/select-results.ts @@ -0,0 +1,47 @@ +import { canonicalRecordId } from "../merge/records.js"; +import type { RecordQuality } from "../models/quality.js"; +import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; + +function isEmpty(value: unknown): boolean { + return value === null || value === undefined || value === ""; +} + +/** Row has every required column populated. */ +export function hasAllRequiredFields( + spec: DatasetSpec, + record: ExtractedRecord, +): boolean { + return spec.columns + .filter((col) => col.required) + .every((col) => !isEmpty(record.row[col.name])); +} + +/** + * Records for the primary results view: all required fields present, + * ranked by completeness (desc) then confidence (desc). 
+ */ +export function selectVisualizationRecords( + spec: DatasetSpec, + records: ExtractedRecord[], + qualityById: Map, +): ExtractedRecord[] { + const eligible = records.filter((record) => { + if (!hasAllRequiredFields(spec, record)) return false; + const id = canonicalRecordId(record, spec); + if (!id) return false; + const quality = qualityById.get(id); + return quality !== undefined && quality.missing_required_fields.length === 0; + }); + + return eligible.sort((a, b) => { + const idA = canonicalRecordId(a, spec)!; + const idB = canonicalRecordId(b, spec)!; + const qA = qualityById.get(idA)!; + const qB = qualityById.get(idB)!; + + if (qB.completeness_pct !== qA.completeness_pct) { + return qB.completeness_pct - qA.completeness_pct; + } + return qB.confidence_score - qA.confidence_score; + }); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/integrations/openrouter.ts b/backend/BigSet_Data_Collection_Agent/src/integrations/openrouter.ts new file mode 100644 index 0000000..b8e6418 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/integrations/openrouter.ts @@ -0,0 +1,2 @@ +/** @deprecated Import from `../llm/complete-json.js` instead. 
*/ +export { completeJson, type LlmMessage } from "../llm/complete-json.js"; diff --git a/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts new file mode 100644 index 0000000..4e337f3 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts @@ -0,0 +1,240 @@ +import { RunStatus, TinyFish, type Run } from "@tiny-fish/sdk"; +import { config } from "../config.js"; +import { sleep, withRetry } from "../queue/retry.js"; +import { mapWithConcurrency } from "../utils/concurrency.js"; + +let client: TinyFish | null = null; + +const TINYFISH_API_BASE = "https://agent.tinyfish.ai"; + +function getClient(): TinyFish { + if (!client) { + client = new TinyFish({ apiKey: config.tinyfishApiKey }); + } + return client; +} + +const TERMINAL_STATUSES: ReadonlySet = new Set([ + RunStatus.COMPLETED, + RunStatus.FAILED, + RunStatus.CANCELLED, +]); + +export interface TinyfishAgentRunResult { + run_id: string | null; + status: string; + result: Record | null; + error: string | null; +} + +export interface QueueTinyfishAgentResult { + run_id: string | null; + error: string | null; +} + +export interface TinyfishAgentJob { + url: string; + goal: string; +} + +export interface TinyfishAgentRunOptions { + pollTimeoutMs?: number; +} + +function runToResult(run: Run): TinyfishAgentRunResult { + const errorMessage = + run.error?.message ?? + (run.status === RunStatus.FAILED ? "Agent run failed" : null); + + return { + run_id: run.run_id, + status: run.status, + result: (run.result as Record | null) ?? null, + error: errorMessage, + }; +} + +/** Best-effort cancel for async agent runs (POST /v1/runs/{id}/cancel). 
*/ +export async function cancelTinyfishAgentRun(runId: string): Promise { + if (!runId.trim()) return; + + try { + await withRetry( + async () => { + const response = await fetch( + `${TINYFISH_API_BASE}/v1/runs/${encodeURIComponent(runId)}/cancel`, + { + method: "POST", + headers: { + "X-API-Key": config.tinyfishApiKey, + "Content-Type": "application/json", + }, + }, + ); + + if (!response.ok) { + const body = await response.text(); + throw new Error( + `Cancel failed (${response.status})${body ? `: ${body.slice(0, 200)}` : ""}`, + ); + } + }, + { + maxRetries: 1, + baseDelayMs: config.retryBaseDelayMs, + label: `agent.cancel:${runId}`, + }, + ); + } catch { + // Cancel is best-effort — polling timeout still reports failure. + } +} + +/** Submit a run via `/run-async` (returns immediately with run_id). */ +export async function queueTinyfishAgent( + url: string, + goal: string, +): Promise { + const response = await withRetry( + () => getClient().agent.queue({ url, goal }), + { + maxRetries: config.maxRetries, + baseDelayMs: config.retryBaseDelayMs, + label: `agent.queue:${url}`, + }, + ); + + if (response.error) { + return { run_id: null, error: response.error.message }; + } + + if (!response.run_id) { + return { run_id: null, error: "Failed to queue agent run (no run_id)" }; + } + + return { run_id: response.run_id, error: null }; +} + +/** Poll `runs.get` until the run reaches a terminal status or times out. */ +export async function pollTinyfishAgentUntilDone( + runId: string, + options: TinyfishAgentRunOptions = {}, +): Promise { + const startedAt = Date.now(); + const pollTimeoutMs = options.pollTimeoutMs ?? 
config.agentPollTimeoutMs; + let lastStatus = RunStatus.PENDING; + + while (true) { + const run = await withRetry( + () => getClient().runs.get(runId), + { + maxRetries: config.maxRetries, + baseDelayMs: config.retryBaseDelayMs, + label: `agent.poll:${runId}`, + }, + ); + + lastStatus = run.status; + + if (TERMINAL_STATUSES.has(run.status)) { + return runToResult(run); + } + + if (Date.now() - startedAt >= pollTimeoutMs) { + await cancelTinyfishAgentRun(runId); + + try { + const finalRun = await getClient().runs.get(runId); + if (TERMINAL_STATUSES.has(finalRun.status)) { + const result = runToResult(finalRun); + if (finalRun.status === RunStatus.CANCELLED) { + return { + ...result, + error: + result.error ?? + `Agent run cancelled after ${pollTimeoutMs}ms (was ${lastStatus})`, + }; + } + return result; + } + } catch { + // Fall through to TIMEOUT result below. + } + + return { + run_id: runId, + status: "TIMEOUT", + result: null, + error: `Agent run timed out after ${pollTimeoutMs}ms (last status: ${lastStatus}); cancel requested`, + }; + } + + await sleep(config.agentPollIntervalMs); + } +} + +/** + * Queue then poll — drop-in replacement for the old synchronous `/run` helper. + */ +export async function runTinyfishAgent( + url: string, + goal: string, + options: TinyfishAgentRunOptions = {}, +): Promise { + const queued = await queueTinyfishAgent(url, goal); + if (queued.error || !queued.run_id) { + return { + run_id: null, + status: RunStatus.FAILED, + result: null, + error: queued.error ?? "Failed to queue agent run", + }; + } + return pollTinyfishAgentUntilDone(queued.run_id, options); +} + +/** + * Queue all jobs quickly, then poll in parallel — better overlap than sync `/run` waves. 
+ */ +export async function runTinyfishAgentsBatch( + jobs: TinyfishAgentJob[], + options: TinyfishAgentRunOptions = {}, +): Promise { + if (jobs.length === 0) return []; + + const queued = await mapWithConcurrency( + jobs, + config.agentQueueConcurrency, + async (job) => { + const queueResult = await queueTinyfishAgent(job.url, job.goal); + return { job, ...queueResult }; + }, + ); + + const results: TinyfishAgentRunResult[] = new Array(jobs.length); + + const pollTargets: { index: number; run_id: string }[] = []; + for (let index = 0; index < queued.length; index++) { + const item = queued[index]!; + if (item.error || !item.run_id) { + results[index] = { + run_id: null, + status: RunStatus.FAILED, + result: null, + error: item.error ?? "Failed to queue agent run", + }; + continue; + } + pollTargets.push({ index, run_id: item.run_id }); + } + + await mapWithConcurrency( + pollTargets, + config.agentPollConcurrency, + async ({ index, run_id }) => { + results[index] = await pollTinyfishAgentUntilDone(run_id, options); + }, + ); + + return results; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts new file mode 100644 index 0000000..c11948a --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts @@ -0,0 +1,70 @@ +import { TinyFish } from "@tiny-fish/sdk"; +import { config } from "../config.js"; +import type { FetchedPage, SourceCandidate } from "../models/schemas.js"; + +let client: TinyFish | null = null; + +function getClient(): TinyFish { + if (!client) { + client = new TinyFish({ apiKey: config.tinyfishApiKey }); + } + return client; +} + +export async function searchWeb( + query: string, + page = 0, +): Promise { + const response = await getClient().search.query({ query, page }); + return response.results.map((result) => ({ + url: result.url, + title: result.title, + snippet: result.snippet, + site_name: result.site_name, + query, + 
position: result.position, + search_page: page, + })); +} + +export async function fetchPages( + urls: string[], + options?: { includeLinks?: boolean }, +): Promise { + if (urls.length === 0) return []; + + const response = await getClient().fetch.getContents({ + urls, + format: "markdown", + links: options?.includeLinks ?? false, + }); + + const pages: FetchedPage[] = response.results.map((page) => ({ + url: page.url, + final_url: page.final_url ?? page.url, + title: page.title ?? "", + description: page.description ?? undefined, + text: typeof page.text === "string" ? page.text : JSON.stringify(page.text), + outbound_links: page.links, + })); + + for (const err of response.errors) { + pages.push({ + url: err.url, + final_url: err.url, + title: "", + text: "", + error: err.error, + }); + } + + return pages; +} + +export function chunkUrls(urls: string[], size: number): string[][] { + const chunks: string[][] = []; + for (let i = 0; i < urls.length; i += size) { + chunks.push(urls.slice(i, i + size)); + } + return chunks; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts b/backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts new file mode 100644 index 0000000..bed77f2 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts @@ -0,0 +1,93 @@ +import { generateText, Output } from "ai"; +import type { z } from "zod"; + +import { config } from "../config.js"; +import { getOpenRouterLimiter } from "../queue/pools.js"; +import { getOpenRouterChatModel } from "./provider.js"; +import { recordLanguageModelUsage } from "./usage.js"; + +export interface LlmMessage { + role: "system" | "user" | "assistant"; + content: string; +} + +type ConversationMessage = { + role: "user" | "assistant"; + content: string; +}; + +function splitPromptMessages(messages: LlmMessage[]): { + system?: string; + messages: ConversationMessage[]; +} { + const systemParts: string[] = []; + const conversation: ConversationMessage[] = []; + 
+ for (const message of messages) { + if (message.role === "system") { + systemParts.push(message.content); + continue; + } + conversation.push({ role: message.role, content: message.content }); + } + + return { + system: systemParts.length > 0 ? systemParts.join("\n\n") : undefined, + messages: conversation, + }; +} + +/** + * Structured JSON completion via Vercel AI SDK (`generateText` + `Output.object`). + * Token usage is recorded into the current `runWithLlmUsageScope` when active. + */ +export async function completeJson(options: { + messages: LlmMessage[]; + schema: z.ZodType; + label: string; + maxRetries?: number; +}): Promise { + const maxRetries = options.maxRetries ?? 2; + let messages = [...options.messages]; + let lastError: unknown; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + await getOpenRouterLimiter().acquire(); + + const { system, messages: conversation } = splitPromptMessages(messages); + + try { + const result = await generateText({ + model: getOpenRouterChatModel(), + ...(system ? { system } : {}), + messages: conversation, + output: Output.object({ schema: options.schema }), + ...(config.openRouterTemperature !== undefined + ? { temperature: config.openRouterTemperature } + : {}), + }); + + recordLanguageModelUsage(result.usage); + return result.output as T; + } catch (error) { + lastError = error; + if (attempt < maxRetries) { + messages = [ + ...messages, + { + role: "user", + content: `Your JSON was invalid for ${options.label}. Error: ${ + error instanceof Error ? error.message : String(error) + }. Return only valid JSON matching the requested schema.`, + }, + ]; + } + } + } + + throw new Error( + `${options.label} failed after ${maxRetries + 1} attempts: ${ + lastError instanceof Error ? 
lastError.message : String(lastError) + }`, + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/llm/provider.ts b/backend/BigSet_Data_Collection_Agent/src/llm/provider.ts new file mode 100644 index 0000000..078e514 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/llm/provider.ts @@ -0,0 +1,23 @@ +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; + +import { config } from "../config.js"; + +let openRouterProvider: ReturnType | null = null; + +function getOpenRouterProvider(): ReturnType { + if (!openRouterProvider) { + openRouterProvider = createOpenRouter({ + apiKey: config.openRouterApiKey, + headers: { + "HTTP-Referer": config.openRouterSiteUrl, + "X-Title": config.openRouterAppName, + }, + }); + } + return openRouterProvider; +} + +/** OpenRouter chat model via the official AI SDK provider (not OpenAI-compatible shim). */ +export function getOpenRouterChatModel() { + return getOpenRouterProvider().chat(config.openRouterModel); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/llm/usage.ts b/backend/BigSet_Data_Collection_Agent/src/llm/usage.ts new file mode 100644 index 0000000..5f27740 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/llm/usage.ts @@ -0,0 +1,57 @@ +import { AsyncLocalStorage } from "node:async_hooks"; +import type { LanguageModelUsage } from "ai"; + +export interface LlmUsageTotals { + promptTokens: number; + completionTokens: number; + totalTokens: number; + callCount: number; +} + +const storage = new AsyncLocalStorage(); + +export function emptyLlmUsage(): LlmUsageTotals { + return { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + callCount: 0, + }; +} + +/** Run pipeline (or other work) with a scoped LLM usage accumulator. 
*/ +export async function runWithLlmUsageScope( + fn: () => Promise, +): Promise<{ result: T; usage: LlmUsageTotals }> { + const usage = emptyLlmUsage(); + const result = await storage.run(usage, fn); + return { result, usage: { ...usage } }; +} + +export function getCurrentLlmUsage(): LlmUsageTotals { + return storage.getStore() ?? emptyLlmUsage(); +} + +export function recordLanguageModelUsage(usage: LanguageModelUsage | undefined): void { + const totals = storage.getStore(); + if (!totals || !usage) { + return; + } + + const promptTokens = usage.inputTokens ?? 0; + const completionTokens = usage.outputTokens ?? 0; + totals.promptTokens += promptTokens; + totals.completionTokens += completionTokens; + totals.totalTokens += usage.totalTokens ?? promptTokens + completionTokens; + totals.callCount += 1; +} + +export function toDatasetAgentUsage( + usage: LlmUsageTotals, +): { promptTokens: number; completionTokens: number; totalTokens: number } { + return { + promptTokens: usage.promptTokens, + completionTokens: usage.completionTokens, + totalTokens: usage.totalTokens, + }; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts b/backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts new file mode 100644 index 0000000..7d49854 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts @@ -0,0 +1,6 @@ +import { createHash } from "node:crypto"; + +export function promptFingerprint(prompt: string): string { + const normalized = prompt.trim().toLowerCase().replace(/\s+/g, " "); + return createHash("sha256").update(normalized).digest("hex").slice(0, 16); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/index.ts b/backend/BigSet_Data_Collection_Agent/src/memory/index.ts new file mode 100644 index 0000000..4dec404 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/index.ts @@ -0,0 +1,26 @@ +export { promptFingerprint } from "./fingerprint.js"; +export { + createWorkflowMemory, + 
domainMemoryBoost, + memoryContextForAgents, + mergePersistentMemory, + recordCoverageGaps, + recordDiagnosis, + recordPhaseInMemory, + snapshotExtractionSchema, +} from "./workflow-memory.js"; +export { loadPersistentMemory, savePersistentMemory, saveRunMemory } from "./store.js"; +export { + aggregateQueryStatsByText, + effectiveWeightedQuality, + planRepairSearches, + type SearchPlan, +} from "./search-pagination.js"; +export type { + AgentGoalMemoryEntry, + DomainMemoryEntry, + QueryMemoryEntry, + RepairDiagnosis, + WorkflowMemory, +} from "./types.js"; +export { repairDiagnosisSchema, workflowMemorySchema } from "./types.js"; diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts b/backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts new file mode 100644 index 0000000..5d873a7 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts @@ -0,0 +1,481 @@ +import type { + AgentRunRecord, + DatasetSpec, + ExtractedRecord, + SourceCandidate, + SourceTriageResult, +} from "../models/schemas.js"; +import { agentExtractedUrls, triageByUrl } from "../quality/index.js"; +import { scoreRecord, type ScoreRecordContext } from "../quality/score-record.js"; +import { getDomain, normalizeUrl } from "../utils/url.js"; +import { recomputeWeightedQuality } from "./search-pagination.js"; +import type { + AgentGoalMemoryEntry, + DomainMemoryEntry, + QueryMemoryEntry, + QueryPageBreakdown, + WorkflowMemory, +} from "./types.js"; + +export interface RecordMetrics { + completeness: number; + confidence: number; +} + +function rollingAvg(current: number, count: number, value: number): number { + if (count <= 0) return value; + return (current * count + value) / (count + 1); +} + +export function metricsForRecord( + spec: DatasetSpec, + record: ExtractedRecord, + context: ScoreRecordContext, +): RecordMetrics { + const quality = scoreRecord(spec, record, context, "memory"); + return { + completeness: 
quality.completeness_pct, + confidence: quality.confidence_score, + }; +} + +export function buildUrlToQueryMap( + candidates: SourceCandidate[], +): Map { + const map = new Map(); + for (const candidate of candidates) { + map.set(normalizeUrl(candidate.url), candidate.query); + } + return map; +} + +function getOrCreateQueryEntry( + memory: WorkflowMemory, + query: string, + phase: string, + repairLoop: number, +): QueryMemoryEntry { + let entry = memory.query_stats.find( + (item) => item.query === query && item.phase === phase, + ); + if (!entry) { + entry = { + query, + phase, + repair_loop: repairLoop, + urls_produced: 0, + urls_with_records: 0, + record_count: 0, + avg_completeness: 0, + avg_confidence: 0, + search_page: 0, + weighted_quality: 0, + page_breakdown: [], + }; + memory.query_stats.push(entry); + } + return entry; +} + +function getOrCreatePageSlice( + entry: QueryMemoryEntry, + page: number, +): QueryPageBreakdown { + let slice = entry.page_breakdown.find((item) => item.page === page); + if (!slice) { + slice = { + page, + urls_produced: 0, + urls_with_records: 0, + record_count: 0, + avg_completeness: 0, + avg_confidence: 0, + }; + entry.page_breakdown.push(slice); + } + return slice; +} + +function applyMetricsToPageSlice( + slice: QueryPageBreakdown, + metrics: RecordMetrics, +): void { + slice.avg_completeness = rollingAvg( + slice.avg_completeness, + slice.record_count, + metrics.completeness, + ); + slice.avg_confidence = rollingAvg( + slice.avg_confidence, + slice.record_count, + metrics.confidence, + ); + slice.record_count += 1; +} + +function getOrCreateDomainEntry( + memory: WorkflowMemory, + domain: string, + repairLoop: number, +): DomainMemoryEntry { + let entry = memory.domain_stats.find((item) => item.domain === domain); + if (!entry) { + entry = { + domain, + record_count: 0, + fetch_failures: 0, + avg_completeness: 0, + avg_confidence: 0, + last_repair_loop: repairLoop, + }; + memory.domain_stats.push(entry); + } + return entry; 
+} + +function applyMetricsToDomain( + entry: DomainMemoryEntry, + metrics: RecordMetrics, + repairLoop: number, +): void { + entry.avg_completeness = rollingAvg( + entry.avg_completeness, + entry.record_count, + metrics.completeness, + ); + entry.avg_confidence = rollingAvg( + entry.avg_confidence, + entry.record_count, + metrics.confidence, + ); + entry.record_count += 1; + entry.last_repair_loop = repairLoop; +} + +function applyMetricsToQuery( + entry: QueryMemoryEntry, + metrics: RecordMetrics, + searchPage = 0, +): void { + entry.avg_completeness = rollingAvg( + entry.avg_completeness, + entry.record_count, + metrics.completeness, + ); + entry.avg_confidence = rollingAvg( + entry.avg_confidence, + entry.record_count, + metrics.confidence, + ); + entry.record_count += 1; + entry.search_page = Math.max(entry.search_page ?? 0, searchPage); + + const slice = getOrCreatePageSlice(entry, searchPage); + applyMetricsToPageSlice(slice, metrics); + recomputeWeightedQuality(entry); +} + +export function attributeRecordsToMemory(options: { + memory: WorkflowMemory; + spec: DatasetSpec; + phase: string; + repairLoop: number; + queries: string[]; + candidates: SourceCandidate[]; + records: ExtractedRecord[]; + failedUrls: string[]; + agentRuns: AgentRunRecord[]; + triageResults: SourceTriageResult[]; +}): void { + const { + memory, + spec, + phase, + repairLoop, + queries, + candidates, + records, + failedUrls, + agentRuns, + triageResults, + } = options; + + const urlToQuery = buildUrlToQueryMap(candidates); + const context: ScoreRecordContext = { + triageByUrl: triageByUrl(triageResults), + agentExtractedUrls: agentExtractedUrls(agentRuns), + }; + + const candidateUrlsByQuery = new Map>(); + const candidateUrlsByQueryPage = new Map>>(); + const urlToSearchPage = new Map(); + + for (const candidate of candidates) { + const normalized = normalizeUrl(candidate.url); + const page = candidate.search_page ?? 
0; + urlToSearchPage.set(normalized, page); + + if (!candidateUrlsByQuery.has(candidate.query)) { + candidateUrlsByQuery.set(candidate.query, new Set()); + } + candidateUrlsByQuery.get(candidate.query)!.add(normalized); + + if (!candidateUrlsByQueryPage.has(candidate.query)) { + candidateUrlsByQueryPage.set(candidate.query, new Map()); + } + const byPage = candidateUrlsByQueryPage.get(candidate.query)!; + if (!byPage.has(page)) byPage.set(page, new Set()); + byPage.get(page)!.add(normalized); + } + + for (const query of queries) { + const entry = getOrCreateQueryEntry(memory, query, phase, repairLoop); + const urls = candidateUrlsByQuery.get(query); + if (urls) entry.urls_produced += urls.size; + + const byPage = candidateUrlsByQueryPage.get(query); + if (byPage) { + for (const [page, pageUrls] of byPage) { + const slice = getOrCreatePageSlice(entry, page); + slice.urls_produced += pageUrls.size; + entry.search_page = Math.max(entry.search_page ?? 0, page); + } + } + } + + const urlsWithRecordsByQuery = new Map>(); + const urlsWithRecordsByQueryPage = new Map>>(); + + for (const record of records) { + const metrics = metricsForRecord(spec, record, context); + const queriesHit = new Set(); + const domainsHit = new Set(); + + const attributeUrl = (rawUrl: string) => { + const normalized = normalizeUrl(rawUrl); + const domain = getDomain(rawUrl); + + if (!domainsHit.has(domain)) { + domainsHit.add(domain); + applyMetricsToDomain( + getOrCreateDomainEntry(memory, domain, repairLoop), + metrics, + repairLoop, + ); + } + + const query = urlToQuery.get(normalized); + if (query) { + if (!urlsWithRecordsByQuery.has(query)) { + urlsWithRecordsByQuery.set(query, new Set()); + } + urlsWithRecordsByQuery.get(query)!.add(normalized); + queriesHit.add(query); + + const page = urlToSearchPage.get(normalized) ?? 
0; + if (!urlsWithRecordsByQueryPage.has(query)) { + urlsWithRecordsByQueryPage.set(query, new Map()); + } + const byPage = urlsWithRecordsByQueryPage.get(query)!; + if (!byPage.has(page)) byPage.set(page, new Set()); + byPage.get(page)!.add(normalized); + } + }; + + for (const sourceUrl of record.source_urls) { + attributeUrl(sourceUrl); + } + for (const item of record.evidence) { + attributeUrl(item.url); + } + + for (const query of queriesHit) { + let searchPage = 0; + for (const sourceUrl of record.source_urls) { + const normalized = normalizeUrl(sourceUrl); + if (urlToQuery.get(normalized) === query) { + searchPage = urlToSearchPage.get(normalized) ?? 0; + break; + } + } + if (searchPage === 0) { + for (const item of record.evidence) { + const normalized = normalizeUrl(item.url); + if (urlToQuery.get(normalized) === query) { + searchPage = urlToSearchPage.get(normalized) ?? 0; + break; + } + } + } + applyMetricsToQuery( + getOrCreateQueryEntry(memory, query, phase, repairLoop), + metrics, + searchPage, + ); + } + } + + for (const [query, urls] of urlsWithRecordsByQuery) { + const entry = getOrCreateQueryEntry(memory, query, phase, repairLoop); + entry.urls_with_records = Math.max(entry.urls_with_records, urls.size); + + const byPage = urlsWithRecordsByQueryPage.get(query); + if (byPage) { + for (const [page, pageUrls] of byPage) { + const slice = getOrCreatePageSlice(entry, page); + slice.urls_with_records = Math.max(slice.urls_with_records, pageUrls.size); + } + } + recomputeWeightedQuality(entry); + } + + for (const url of failedUrls) { + const entry = getOrCreateDomainEntry(memory, getDomain(url), repairLoop); + entry.fetch_failures += 1; + entry.last_repair_loop = repairLoop; + } + + for (const run of agentRuns) { + const normalizedUrl = normalizeUrl(run.url); + const domain = getDomain(run.url); + + if (run.records_extracted > 0 && run.goal) { + const matching = records.filter((record) => + record.source_urls.some((u) => normalizeUrl(u) === 
/**
 * Bounds the three stat lists kept in workflow memory.
 *
 * - `query_stats`: trimmed in place to the 80 most recently appended entries.
 * - `domain_stats`: when above 50, sorted in place by `record_count`
 *   (highest first) and replaced by the first 50.
 * - `agent_goal_stats`: when above 40, entries without records are dropped
 *   and the last 40 of the remainder are kept.
 */
function capMemoryLists(memory: WorkflowMemory): void {
  const queryLimit = 80;
  const domainLimit = 50;
  const goalLimit = 40;

  // Oldest entries sit at the front, so overflow is removed from index 0.
  const queryOverflow = memory.query_stats.length - queryLimit;
  if (queryOverflow > 0) {
    memory.query_stats.splice(0, queryOverflow);
  }

  if (memory.domain_stats.length > domainLimit) {
    const ranked = memory.domain_stats.sort(
      (left, right) => right.record_count - left.record_count,
    );
    memory.domain_stats = ranked.slice(0, domainLimit);
  }

  if (memory.agent_goal_stats.length > goalLimit) {
    const productive = memory.agent_goal_stats.filter(
      (goal) => goal.record_count > 0,
    );
    memory.agent_goal_stats = productive.slice(-goalLimit);
  }
}
/**
 * Merges `source` into `target` for the same domain, mutating `target`.
 *
 * Averages are combined weighted by each side's `record_count`; when neither
 * side has records the target's averages are left as they are. Record counts
 * and fetch failures are summed, and `last_repair_loop` keeps the larger value.
 */
export function mergeDomainEntry(
  target: DomainMemoryEntry,
  source: DomainMemoryEntry,
): void {
  const combinedCount = target.record_count + source.record_count;

  // Record-count-weighted mean of one metric across both entries.
  const weightedMean = (mine: number, theirs: number): number =>
    (mine * target.record_count + theirs * source.record_count) / combinedCount;

  if (combinedCount > 0) {
    const completeness = weightedMean(
      target.avg_completeness,
      source.avg_completeness,
    );
    const confidence = weightedMean(
      target.avg_confidence,
      source.avg_confidence,
    );
    target.avg_completeness = completeness;
    target.avg_confidence = confidence;
  }

  target.record_count = combinedCount;
  target.fetch_failures = target.fetch_failures + source.fetch_failures;
  target.last_repair_loop = Math.max(
    target.last_repair_loop,
    source.last_repair_loop,
  );
}
/**
 * Quality score used to rank a query entry.
 *
 * Returns the stored page-weighted quality when it is positive. Otherwise it
 * falls back to the plain mean of average completeness and confidence, or 0
 * when the entry has no records.
 */
export function effectiveWeightedQuality(entry: QueryMemoryEntry): number {
  const stored = entry.weighted_quality;
  if (stored > 0) {
    return stored;
  }

  const unweighted = (entry.avg_completeness + entry.avg_confidence) / 2;
  return entry.record_count <= 0 ? 0 : unweighted;
}
/**
 * Roll up stats for the same query text across phases.
 *
 * Produces one aggregate per distinct `query` string in `memory.query_stats`.
 * Counts are summed, `search_page` and `repair_loop` keep their maximum,
 * averages are combined weighted by record count, and per-page slices are
 * merged page by page. `phases` lists every phase that contributed.
 *
 * The aggregates are detached from `memory`: every page slice is cloned before
 * it is merged into. Previously the first entry's slice objects were shared
 * with `memory.query_stats`, so merging a later phase wrote the combined
 * counts back into stored memory and each repeated call inflated them further.
 *
 * NOTE(review): the generic arguments of the return type were not visible in
 * the excerpt this was reconstructed from; confirm against the declared type.
 *
 * @param memory - Workflow memory whose `query_stats` are read (not modified).
 * @returns Map from query text to its cross-phase aggregate.
 */
export function aggregateQueryStatsByText(
  memory: WorkflowMemory,
): Map<string, QueryMemoryEntry & { phases: string[] }> {
  const map = new Map<string, QueryMemoryEntry & { phases: string[] }>();

  for (const item of memory.query_stats) {
    const existing = map.get(item.query);
    if (!existing) {
      map.set(item.query, {
        ...item,
        phases: [item.phase],
        search_page: item.search_page ?? 0,
        weighted_quality: item.weighted_quality ?? 0,
        // Clone each slice: the spread above is shallow, and the merge below
        // mutates slices in place.
        page_breakdown: (item.page_breakdown ?? []).map((slice) => ({
          ...slice,
        })),
      });
      continue;
    }

    existing.phases.push(item.phase);
    existing.record_count += item.record_count;
    existing.urls_produced += item.urls_produced;
    existing.urls_with_records += item.urls_with_records;
    existing.search_page = Math.max(
      existing.search_page ?? 0,
      item.search_page ?? 0,
    );
    existing.repair_loop = Math.max(existing.repair_loop, item.repair_loop);

    const totalRecords = existing.record_count;
    if (totalRecords > 0) {
      // record_count was already incremented, so recover the prior count to
      // weight the running average correctly.
      const prevCount = totalRecords - item.record_count;
      if (prevCount > 0) {
        existing.avg_completeness =
          (existing.avg_completeness * prevCount +
            item.avg_completeness * item.record_count) /
          totalRecords;
        existing.avg_confidence =
          (existing.avg_confidence * prevCount +
            item.avg_confidence * item.record_count) /
          totalRecords;
      } else {
        existing.avg_completeness = item.avg_completeness;
        existing.avg_confidence = item.avg_confidence;
      }
    }

    const breakdown = existing.page_breakdown!;
    for (const slice of item.page_breakdown ?? []) {
      const target = breakdown.find((p) => p.page === slice.page);
      if (!target) {
        breakdown.push({ ...slice });
        continue;
      }

      const combined = target.record_count + slice.record_count;
      if (combined > 0) {
        target.avg_completeness =
          (target.avg_completeness * target.record_count +
            slice.avg_completeness * slice.record_count) /
          combined;
        target.avg_confidence =
          (target.avg_confidence * target.record_count +
            slice.avg_confidence * slice.record_count) /
          combined;
      }
      target.record_count = combined;
      target.urls_produced += slice.urls_produced;
      target.urls_with_records += slice.urls_with_records;
    }
    recomputeWeightedQuality(existing);
  }

  return map;
}
/**
 * After a repair search pass, persist the highest page used per query.
 *
 * For every plan, the stats entry matching both the query text and `phase` is
 * looked up. If none exists, a zeroed entry is appended to
 * `memory.query_stats`. The entry's `search_page` is then raised to the plan's
 * page if that is higher.
 */
export function markSearchPagesUsed(
  memory: WorkflowMemory,
  plans: SearchPlan[],
  phase: string,
  repairLoop: number,
): void {
  for (const { query, page } of plans) {
    let tracked = memory.query_stats.find(
      (candidate) => candidate.query === query && candidate.phase === phase,
    );

    if (!tracked) {
      tracked = {
        query,
        phase,
        repair_loop: repairLoop,
        urls_produced: 0,
        urls_with_records: 0,
        record_count: 0,
        avg_completeness: 0,
        avg_confidence: 0,
        search_page: page,
        weighted_quality: 0,
        page_breakdown: [],
      };
      memory.query_stats.push(tracked);
    }

    const highestSoFar = tracked.search_page ?? 0;
    tracked.search_page = Math.max(highestSoFar, page);
  }
}
[], + last_missing_fields: raw.last_missing_fields, + }); + + const successfulDomains = raw.successful_domains as string[] | undefined; + const failedDomains = raw.failed_domains as string[] | undefined; + + for (const domain of successfulDomains ?? []) { + base.domain_stats.push({ + domain, + record_count: 1, + fetch_failures: 0, + avg_completeness: 0.7, + avg_confidence: 0.7, + last_repair_loop: 0, + }); + } + for (const domain of failedDomains ?? []) { + base.domain_stats.push({ + domain, + record_count: 0, + fetch_failures: 1, + avg_completeness: 0, + avg_confidence: 0, + last_repair_loop: 0, + }); + } + + const successfulQueries = raw.successful_queries as + | { query: string; phase: string; repair_loop: number }[] + | undefined; + for (const item of successfulQueries ?? []) { + base.query_stats.push({ + query: item.query, + phase: item.phase, + repair_loop: item.repair_loop, + urls_produced: 1, + urls_with_records: 1, + record_count: 1, + avg_completeness: 0.7, + avg_confidence: 0.7, + search_page: 0, + weighted_quality: 0.7, + page_breakdown: [], + }); + } + + for (const query of (raw.failed_queries as string[] | undefined) ?? 
[]) { + base.query_stats.push({ + query, + phase: "legacy", + repair_loop: 0, + urls_produced: 1, + urls_with_records: 0, + record_count: 0, + avg_completeness: 0, + avg_confidence: 0, + search_page: 0, + weighted_quality: 0, + page_breakdown: [], + }); + } + + return base; +} + +export async function loadPersistentMemory( + memoryDir: string, + fingerprint: string, +): Promise { + try { + const raw = JSON.parse( + await readFile(globalMemoryPath(memoryDir, fingerprint), "utf8"), + ) as Record; + + if (Array.isArray(raw.query_stats)) { + return workflowMemorySchema.parse(raw); + } + + return migrateLegacyMemory(raw); + } catch { + return null; + } +} + +export async function savePersistentMemory( + memoryDir: string, + memory: WorkflowMemory, +): Promise { + await mkdir(memoryDir, { recursive: true }); + await writeFile( + globalMemoryPath(memoryDir, memory.prompt_fingerprint), + `${JSON.stringify(memory, null, 2)}\n`, + "utf8", + ); +} + +export async function saveRunMemory( + runRoot: string, + memory: WorkflowMemory, +): Promise { + const path = join(runRoot, "workflow_memory.json"); + await writeFile(path, `${JSON.stringify(memory, null, 2)}\n`, "utf8"); + return path; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/types.ts b/backend/BigSet_Data_Collection_Agent/src/memory/types.ts new file mode 100644 index 0000000..893b658 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/types.ts @@ -0,0 +1,101 @@ +import { z } from "zod"; + +export const queryPageBreakdownSchema = z.object({ + page: z.number().int().min(0), + urls_produced: z.number().int().nonnegative(), + urls_with_records: z.number().int().nonnegative(), + record_count: z.number().int().nonnegative(), + avg_completeness: z.number().min(0).max(1), + avg_confidence: z.number().min(0).max(1), +}); + +export type QueryPageBreakdown = z.infer; + +/** Rolling aggregate for a search query based on records from URLs it surfaced. 
*/ +export const queryMemoryEntrySchema = z.object({ + query: z.string(), + phase: z.string(), + repair_loop: z.number(), + urls_produced: z.number().int().nonnegative(), + urls_with_records: z.number().int().nonnegative(), + record_count: z.number().int().nonnegative(), + avg_completeness: z.number().min(0).max(1), + avg_confidence: z.number().min(0).max(1), + /** Last Search API page index used for this query (0-based). */ + search_page: z.number().int().min(0).default(0), + /** Page-weighted quality for recurring search (earlier pages weigh more). */ + weighted_quality: z.number().min(0).max(1).default(0), + page_breakdown: z.array(queryPageBreakdownSchema).default([]), +}); + +export type QueryMemoryEntry = z.infer; + +/** Rolling aggregate for a hostname from records attributed to that domain. */ +export const domainMemoryEntrySchema = z.object({ + domain: z.string(), + record_count: z.number().int().nonnegative(), + fetch_failures: z.number().int().nonnegative(), + avg_completeness: z.number().min(0).max(1), + avg_confidence: z.number().min(0).max(1), + last_repair_loop: z.number().int().nonnegative(), +}); + +export type DomainMemoryEntry = z.infer; + +/** Rolling aggregate for a Tinyfish Agent goal from records on that URL. 
*/ +export const agentGoalMemoryEntrySchema = z.object({ + url: z.string(), + goal: z.string(), + repair_loop: z.number(), + record_count: z.number().int().nonnegative(), + avg_completeness: z.number().min(0).max(1), + avg_confidence: z.number().min(0).max(1), +}); + +export type AgentGoalMemoryEntry = z.infer; + +export const extractionSchemaSnapshotSchema = z.object({ + columns: z.array( + z.object({ + name: z.string(), + type: z.string(), + required: z.boolean(), + }), + ), + dedupe_keys: z.array(z.string()), + row_grain: z.string(), +}); + +export const repairDiagnosisSchema = z.object({ + summary: z.string(), + likely_causes: z.array(z.string()), + recommended_search_patterns: z.array(z.string()), + domains_to_prioritize: z.array(z.string()), + domains_to_avoid: z.array(z.string()), + prefer_tinyfish_agent: z.boolean(), + agent_strategy_notes: z.string().optional(), + extraction_notes: z.string().optional(), +}); + +export type RepairDiagnosis = z.infer; + +export const workflowMemorySchema = z.object({ + prompt_fingerprint: z.string(), + user_prompt: z.string(), + repair_loop_count: z.number(), + query_stats: z.array(queryMemoryEntrySchema), + domain_stats: z.array(domainMemoryEntrySchema), + agent_goal_stats: z.array(agentGoalMemoryEntrySchema), + extraction_schema: extractionSchemaSnapshotSchema.optional(), + dedupe_keys: z.array(z.string()), + diagnoses: z.array( + z.object({ + repair_loop: z.number(), + diagnosis: repairDiagnosisSchema, + }), + ), + strategy_notes: z.array(z.string()), + last_missing_fields: z.array(z.string()).optional(), +}); + +export type WorkflowMemory = z.infer; diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts b/backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts new file mode 100644 index 0000000..559d91f --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts @@ -0,0 +1,208 @@ +import type { CoverageReport } from "../coverage/analyze.js"; +import type { + 
/**
 * Stores a repair diagnosis in workflow memory.
 *
 * The diagnosis is always appended to `memory.diagnoses`. When it has a
 * non-empty summary, a loop-tagged note is appended to `memory.strategy_notes`,
 * which is then trimmed in place to its 30 most recent entries.
 */
export function recordDiagnosis(
  memory: WorkflowMemory,
  repairLoop: number,
  diagnosis: RepairDiagnosis,
): void {
  const noteLimit = 30;
  const { diagnoses, strategy_notes: notes } = memory;

  diagnoses.push({ repair_loop: repairLoop, diagnosis });

  if (diagnosis.summary) {
    notes.push(`[loop ${repairLoop}] ${diagnosis.summary}`);
  }

  // Oldest notes are at the front; drop whatever exceeds the limit.
  const overflow = notes.length - noteLimit;
  if (overflow > 0) {
    notes.splice(0, overflow);
  }
}
/**
 * Highest-quality domains that have produced at least one record.
 *
 * Domains are ranked by `avg_completeness + avg_confidence`, best first, and
 * the first `limit` are returned. (The previous `.slice(-limit)` returned the
 * tail of that ranking, i.e. the lowest-quality domains, whenever more than
 * `limit` domains qualified.) `memory.domain_stats` is not reordered.
 */
function topDomains(memory: WorkflowMemory, limit: number) {
  return [...memory.domain_stats]
    .filter((item) => item.record_count > 0)
    .sort(
      (a, b) =>
        b.avg_completeness + b.avg_confidence - (a.avg_completeness + a.avg_confidence),
    )
    .slice(0, limit);
}

/**
 * Domains worth avoiding: any with fetch failures, or with records whose
 * average completeness is below 0.5.
 *
 * Ranked by `fetch_failures`, most failures first; the first `limit` are
 * returned. (The previous `.slice(-limit)` kept the least-failing ones.)
 */
function weakDomains(memory: WorkflowMemory, limit: number) {
  return [...memory.domain_stats]
    .filter(
      (item) =>
        item.fetch_failures > 0 ||
        (item.record_count > 0 && item.avg_completeness < 0.5),
    )
    .sort((a, b) => b.fetch_failures - a.fetch_failures)
    .slice(0, limit);
}

/**
 * Highest-quality agent goals that have produced at least one record, ranked
 * by `avg_completeness + avg_confidence`, best first, capped at `limit`.
 * (The previous `.slice(-limit)` returned the lowest-ranked goals.)
 */
function topAgentGoals(memory: WorkflowMemory, limit: number) {
  return [...memory.agent_goal_stats]
    .filter((item) => item.record_count > 0)
    .sort(
      (a, b) =>
        b.avg_completeness + b.avg_confidence - (a.avg_completeness + a.avg_confidence),
    )
    .slice(0, limit);
}
/**
 * Ranking adjustment for a domain based on what memory knows about it.
 *
 * - Unknown domain: 0.
 * - Known domain with fetch failures and no records: -4.
 * - Otherwise: the mean of average completeness and confidence, re-centred on
 *   0.5 and scaled by 4 (so a mean of 1 gives +2 and a mean of 0 gives -2).
 *
 * Only the first stats entry matching `domain` is considered.
 */
export function domainMemoryBoost(
  memory: WorkflowMemory,
  domain: string,
): number {
  for (const stats of memory.domain_stats) {
    if (stats.domain !== domain) continue;

    const onlyEverFailed =
      stats.record_count === 0 && stats.fetch_failures > 0;
    if (onlyEverFailed) {
      return -4;
    }

    const meanQuality = (stats.avg_completeness + stats.avg_confidence) / 2;
    return (meanQuality - 0.5) * 4;
  }

  return 0;
}
/**
 * Tells whether a composite dedupe key carries no values.
 *
 * A composite key is `keyCount` normalized values joined with "||", so an
 * all-empty key is just the separators: "" for zero or one key, "||" for two,
 * "||||" for three, and so on. The empty string is always treated as empty.
 */
function isEmptyCompositeKey(key: string, keyCount: number): boolean {
  if (!key) {
    return true;
  }

  // keyCount empty parts joined by "||" leave (keyCount - 1) separators.
  const separatorsOnly = "||".repeat(Math.max(keyCount - 1, 0));
  return key === separatorsOnly;
}
+ */ +export function canonicalRecordId( + record: ExtractedRecord, + spec: DatasetSpec, +): string | null { + const primary = getPrimaryKeyValue(record, spec); + if (primary) { + return `pk:${primary}`; + } + + const composite = recordDedupeKey(record, spec.dedupe_keys); + if (!isEmptyCompositeKey(composite, spec.dedupe_keys.length)) { + return `dk:${composite}`; + } + + return null; +} + +export interface MergeResult { + records: ExtractedRecord[]; + unkeyed: ExtractedRecord[]; +} + +export function mergeRecords( + spec: DatasetSpec, + records: ExtractedRecord[], +): MergeResult { + const seen = new Map(); + const unkeyed: ExtractedRecord[] = []; + + for (const record of records) { + const id = canonicalRecordId(record, spec); + if (!id) { + unkeyed.push(record); + continue; + } + + const existing = seen.get(id); + if (!existing) { + seen.set(id, record); + continue; + } + + seen.set(id, mergePair(existing, record, spec)); + } + + return { records: [...seen.values()], unkeyed }; +} + +/** + * Merge repair-pass rows into an existing dataset. + * Rows with the same primary key (e.g. restaurant name) update in place; new keys add rows. + */ +export function mergeRepairIntoExisting( + spec: DatasetSpec, + existing: ExtractedRecord[], + repairRecords: ExtractedRecord[], +): MergeResult { + return mergeRecords(spec, [...existing, ...repairRecords]); +} + +export function mergePair( + a: ExtractedRecord, + b: ExtractedRecord, + spec: DatasetSpec, +): ExtractedRecord { + const row: Record = { ...a.row }; + const fieldsFilledFromIncoming = new Set(); + const shouldPreferIncomingCanonicalRecord = prefersIncomingCanonicalRecord( + a, + b, + spec, + ); + let replacedCanonicalUrlFromIncoming = false; + + for (const col of spec.columns) { + const current = row[col.name]; + const incoming = b.row[col.name]; + const currentEmpty = isEmpty(current); + const incomingFilled = !isEmpty(incoming); + + if (currentEmpty && incomingFilled) { + row[col.name] = incoming ?? 
null; + fieldsFilledFromIncoming.add(col.name); + } else if ( + incomingFilled && + shouldPreferIncomingCanonicalRecord && + !spec.dedupe_keys.includes(col.name) + ) { + row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + replacedCanonicalUrlFromIncoming ||= isCanonicalSourceUrlColumn(col.name); + } else if (incomingFilled && shouldReplaceCell(col.name, current, incoming)) { + row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + replacedCanonicalUrlFromIncoming ||= isCanonicalSourceUrlColumn(col.name); + } + } + + if (replacedCanonicalUrlFromIncoming) { + for (const col of spec.columns) { + const incoming = b.row[col.name]; + if ( + shouldReplaceCompanionColumn(col.name, spec) && + !isEmpty(incoming) && + !spec.dedupe_keys.includes(col.name) + ) { + row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + } + } + } + + const evidence = a.evidence.filter((item) => + valuesMatch(row[item.field], a.row[item.field]), + ); + const evidenceFields = new Set(evidence.map((e) => e.field)); + for (const item of b.evidence) { + if ( + !evidenceFields.has(item.field) && + shouldMergeIncomingEvidence({ + field: item.field, + mergedRow: row, + incomingRow: b.row, + fieldsFilledFromIncoming, + }) + ) { + evidence.push(item); + evidenceFields.add(item.field); + } + } + const coherentEvidence = filterEvidenceForRetainedCanonicalUrl(spec, row, evidence); + + const extractionConfidence = Math.max( + a.extraction_confidence ?? 0, + b.extraction_confidence ?? 0, + ); + + return { + row, + evidence: coherentEvidence, + source_urls: deriveRecordSourceUrls({ + spec, + row, + evidence: coherentEvidence, + fallbackUrls: coherentEvidence.length > 0 ? [] : a.source_urls, + }), + ...(extractionConfidence > 0 + ? 
{ extraction_confidence: extractionConfidence } + : {}), + }; +} + +function shouldMergeIncomingEvidence(input: { + field: string; + mergedRow: Record; + incomingRow: Record; + fieldsFilledFromIncoming: Set; +}): boolean { + if ( + isCanonicalSourceUrlColumn(input.field) && + !urlsReferenceSamePage( + input.incomingRow[input.field], + input.mergedRow[input.field], + ) + ) { + return false; + } + if (input.fieldsFilledFromIncoming.has(input.field)) { + return true; + } + return valuesMatch(input.mergedRow[input.field], input.incomingRow[input.field]); +} + +function shouldReplaceCell( + columnName: string, + current: string | number | boolean | null | undefined, + incoming: string | number | boolean | null | undefined, +): boolean { + if (!isCanonicalSourceUrlColumn(columnName)) { + return false; + } + return ( + scoreUrlForCanonicalSource(incoming) > scoreUrlForCanonicalSource(current) + ); +} + +function prefersIncomingCanonicalRecord( + current: ExtractedRecord, + incoming: ExtractedRecord, + spec: DatasetSpec, +): boolean { + const currentScore = bestCanonicalScore(current, spec); + const incomingScore = bestCanonicalScore(incoming, spec); + if (incomingScore > currentScore) { + return true; + } + if (incomingScore < currentScore) { + return false; + } + + const currentDate = bestRecordTimestamp(current, spec); + const incomingDate = bestRecordTimestamp(incoming, spec); + return incomingDate !== null && currentDate !== null && incomingDate > currentDate; +} + +function bestCanonicalScore(record: ExtractedRecord, spec: DatasetSpec): number { + let bestScore = 0; + for (const column of spec.columns) { + if (!isCanonicalSourceUrlColumn(column.name)) continue; + bestScore = Math.max( + bestScore, + scoreUrlForCanonicalSource(record.row[column.name]), + ); + } + return bestScore; +} + +function bestRecordTimestamp( + record: ExtractedRecord, + spec: DatasetSpec, +): number | null { + const timestamps = spec.columns + .filter((column) => 
/**
 * Reports whether a column name looks like a documentation URL column, i.e.
 * its lowercased name contains both "docs" and "url".
 *
 * The literal names `docs_url` and `*_docs_url` are covered by that rule, so
 * they need no separate check.
 */
function isDocsUrlColumn(columnName: string): boolean {
  const lower = columnName.toLowerCase();
  return lower.includes("docs") && lower.includes("url");
}

/**
 * Reports whether a column name is one of the fields treated as a companion
 * of a docs URL: exactly `summary` or `description` (case-insensitive), or any
 * name containing both "docs" and "title" (which includes `docs_title`).
 */
function isDocsCompanionColumn(columnName: string): boolean {
  const lower = columnName.toLowerCase();
  return (
    lower === "summary" ||
    lower === "description" ||
    (lower.includes("docs") && lower.includes("title"))
  );
}

/** Reports whether a column is a canonical source URL column (any URL-like column name). */
function isCanonicalSourceUrlColumn(columnName: string): boolean {
  return isUrlLikeColumnName(columnName);
}

/**
 * Reports whether a column may be overwritten as a companion of a replaced
 * canonical URL: it must be neither a dedupe key nor a canonical source URL
 * column itself.
 *
 * @param columnName - Name of the column under consideration.
 * @param spec - Dataset specification supplying `dedupe_keys`.
 */
function shouldReplaceCompanionColumn(
  columnName: string,
  spec: DatasetSpec,
): boolean {
  if (spec.dedupe_keys.includes(columnName)) {
    return false;
  }
  return !isCanonicalSourceUrlColumn(columnName);
}

/**
 * Drops evidence items that do not agree with the canonical URL the merged row
 * kept.
 *
 * When the row has no sufficiently strong canonical URL the evidence is
 * returned unchanged. Otherwise:
 * - evidence for a canonical URL column must reference the same page as the
 *   row's value for that column;
 * - evidence for companion columns and dedupe keys must come from a URL that
 *   supports the retained canonical URL;
 * - all other evidence is kept.
 *
 * @param spec - Dataset specification (columns and dedupe keys).
 * @param row - Merged row values.
 * @param evidence - Evidence items collected for the merged row.
 * @returns The evidence items consistent with the retained canonical URL.
 */
function filterEvidenceForRetainedCanonicalUrl(
  spec: DatasetSpec,
  row: ExtractedRecord["row"],
  evidence: ExtractedRecord["evidence"],
): ExtractedRecord["evidence"] {
  const retainedUrl = bestRetainedCanonicalUrl(spec, row);
  if (!retainedUrl) {
    return evidence;
  }

  return evidence.filter((item) => {
    if (isCanonicalSourceUrlColumn(item.field)) {
      return urlsReferenceSamePage(item.url, row[item.field]);
    }

    if (
      isDocsCompanionColumn(item.field) ||
      isLikelySourceCompanionColumn(item.field) ||
      spec.dedupe_keys.includes(item.field)
    ) {
      return sourceUrlSupportsRetainedCanonicalUrl(item.url, retainedUrl);
    }

    return true;
  });
}
/**
 * Reports whether a column name hints at data that is usually tied to the
 * page it was read from (dates, quarters, prices, plans, titles, summaries,
 * descriptions). Matching is a case-insensitive substring test.
 */
function isLikelySourceCompanionColumn(columnName: string): boolean {
  const needle = columnName.toLowerCase();
  const markers = [
    "date",
    "quarter",
    "price",
    "plan",
    "title",
    "summary",
    "description",
  ];
  return markers.some((marker) => needle.includes(marker));
}

/**
 * Reports whether evidence taken from `evidenceUrl` is acceptable support for
 * a row whose retained canonical URL is `retainedUrl`.
 *
 * The same page is always acceptable. A different page is acceptable only when
 * the retained URL scores at least 4 as an official docs source, both URLs
 * share a hostname, and the evidence URL itself scores at least 2 as a
 * canonical source.
 */
function sourceUrlSupportsRetainedCanonicalUrl(
  evidenceUrl: unknown,
  retainedUrl: string,
): boolean {
  if (urlsReferenceSamePage(evidenceUrl, retainedUrl)) {
    return true;
  }
  if (scoreDocsUrlForOfficialSource(retainedUrl) < 4) {
    return false;
  }
  if (!sameHostname(evidenceUrl, retainedUrl)) {
    return false;
  }
  return scoreUrlForCanonicalSource(evidenceUrl) >= 2;
}

/**
 * Reports whether two values reference the same page: both must be non-empty
 * and equal after comparable-value normalization.
 */
function urlsReferenceSamePage(a: unknown, b: unknown): boolean {
  return (
    !isEmpty(a) &&
    !isEmpty(b) &&
    normalizeComparableValue(a) === normalizeComparableValue(b)
  );
}

/**
 * Reports whether two URL-like values share a hostname, ignoring a leading
 * "www." on either side. Returns `false` when either value is not a parseable
 * absolute URL.
 */
function sameHostname(a: unknown, b: unknown): boolean {
  const bareHost = (value: unknown): string =>
    new URL(String(value)).hostname.replace(/^www\./, "");
  try {
    const left = bareHost(a);
    const right = bareHost(b);
    return left === right;
  } catch {
    return false;
  }
}
/** Quality assessment of a single record, inferred from `recordQualitySchema`. */
export type RecordQuality = z.infer<typeof recordQualitySchema>;

/** A group of records sharing one quality classification: how many, and which ids. */
export const qualityBucketSchema = z.object({
  count: z.number().int().nonnegative(),
  record_ids: z.array(z.string()),
});

export type QualityBucket = z.infer<typeof qualityBucketSchema>;

/**
 * Run-level quality summary: overall counts, one bucket per record status plus
 * a `needs_review` bucket, and the per-record quality entries.
 */
export const qualityReportSchema = z.object({
  total_records: z.number().int().nonnegative(),
  unkeyed_records: z.number().int().nonnegative(),
  complete: qualityBucketSchema,
  partial: qualityBucketSchema,
  low_confidence: qualityBucketSchema,
  needs_review: qualityBucketSchema,
  records: z.array(recordQualitySchema),
});

export type QualityReport = z.infer<typeof qualityReportSchema>;

/** Possible outcomes recorded for a source URL. */
export const sourceOutcomeTypeSchema = z.enum([
  "success",
  "fetch_failed",
  "skipped",
  "extract_failed",
  "agent_failed",
  "agent_deferred",
  "no_records",
]);

export type SourceOutcomeType = z.infer<typeof sourceOutcomeTypeSchema>;

/**
 * Outcome of handling one source URL in a given phase ("initial" or "repair"),
 * with optional triage details, error text, and extracted-record count.
 */
export const sourceOutcomeSchema = z.object({
  url: z.string(),
  phase: z.enum(["initial", "repair"]),
  outcome: sourceOutcomeTypeSchema,
  triage_status: z.string().optional(),
  triage_confidence: z.number().optional(),
  source_data_confidence: z.number().optional(),
  expected_yield: z.string().optional(),
  error: z.string().optional(),
  records_extracted: z.number().optional(),
});

export type SourceOutcome = z.infer<typeof sourceOutcomeSchema>;

/**
 * Run-level summary of source outcomes: total count, the failed subset, counts
 * keyed by outcome name, and the full outcome list.
 */
export const sourcesReportSchema = z.object({
  total: z.number().int().nonnegative(),
  failed: z.array(sourceOutcomeSchema),
  by_outcome: z.record(z.string(), z.number()),
  outcomes: z.array(sourceOutcomeSchema),
});

export type SourcesReport = z.infer<typeof sourcesReportSchema>;
import { z } from "zod";
import { repairDiagnosisSchema } from "../memory/types.js";
import { qualityReportSchema, sourcesReportSchema } from "./quality.js";
import { sourceStatusSchema } from "./source-status.js";

/** One dataset column: name, primitive type, description, and whether it is required. */
export const columnSchema = z.object({
  name: z.string().min(1),
  type: z.enum(["string", "number", "boolean", "date"]),
  description: z.string(),
  required: z.boolean(),
});

/**
 * Specification of the dataset to build: intent, target size, row grain,
 * columns, the single dedupe key, search queries, and extraction hints.
 */
export const datasetSpecSchema = z.object({
  intent_summary: z.string(),
  target_row_count: z.number().int().positive(),
  row_grain: z.string(),
  columns: z.array(columnSchema).min(1),
  // Exactly one dedupe key is accepted; when an array with more entries is
  // supplied, only its first element is kept before validation.
  dedupe_keys: z.preprocess(
    (value) => (Array.isArray(value) ? value.slice(0, 1) : value),
    z.array(z.string()).length(1),
  ),
  search_queries: z.array(z.string()).min(1),
  extraction_hints: z.string(),
});

export type ColumnDef = z.infer<typeof columnSchema>;
export type DatasetSpec = z.infer<typeof datasetSpecSchema>;

/** A quote from a URL supporting the value of one field. */
export const fieldEvidenceSchema = z.object({
  field: z.string(),
  url: z.string(),
  quote: z.string(),
});

/** One extracted row with its per-field evidence and source URLs. */
export const extractedRecordSchema = z.object({
  row: z.record(z.string(), z.union([z.string(), z.number(), z.boolean(), z.null()])),
  evidence: z.array(fieldEvidenceSchema),
  source_urls: z.array(z.string()),
  /** LLM-estimated confidence that row values are accurate (0–1). */
  extraction_confidence: z.number().min(0).max(1).optional(),
});

export type FieldEvidence = z.infer<typeof fieldEvidenceSchema>;
export type ExtractedRecord = z.infer<typeof extractedRecordSchema>;

/** Result of one extraction call: the records plus optional free-form notes. */
export const extractionResultSchema = z.object({
  records: z.array(extractedRecordSchema),
  notes: z.string().optional(),
});

export type ExtractionResult = z.infer<typeof extractionResultSchema>;

/** A search result considered as a source to fetch. */
export const sourceCandidateSchema = z.object({
  url: z.string().url(),
  title: z.string(),
  snippet: z.string(),
  site_name: z.string().optional(),
  query: z.string(),
  position: z.number().optional(),
  /** Search API page (0-based) that produced this candidate. */
  search_page: z.number().int().min(0).optional(),
});

export type SourceCandidate = z.infer<typeof sourceCandidateSchema>;

/** A fetched page: requested and final URL, title, text, and any fetch error. */
export const fetchedPageSchema = z.object({
  url: z.string(),
  final_url: z.string(),
  title: z.string(),
  description: z.string().optional(),
  text: z.string(),
  error: z.string().optional(),
  /** Outbound links when Fetch API was called with links: true. */
  outbound_links: z.array(z.string()).optional(),
});

export type FetchedPage = z.infer<typeof fetchedPageSchema>;

export const expectedYieldSchema = z.enum(["complete", "partial", "none"]);

/** Triage verdict for one fetched page. */
export const sourceTriageResultSchema = z.object({
  url: z.string(),
  final_url: z.string(),
  title: z.string(),
  status: sourceStatusSchema,
  /** Confidence in triage classification (routing). */
  confidence: z.number().min(0).max(1),
  /** Expected accuracy/completeness of data if extracted from this page. */
  source_data_confidence: z.number().min(0).max(1),
  /** Likely yield: full rows, partial rows, or none. */
  expected_yield: expectedYieldSchema,
  reasoning: z.string(),
  suggested_action: z.string().optional(),
});

export type SourceTriageResult = z.infer<typeof sourceTriageResultSchema>;

/** A goal handed to the agent together with the rationale for it. */
export const agentGoalSchema = z.object({
  goal: z.string(),
  rationale: z.string(),
});

export type AgentGoal = z.infer<typeof agentGoalSchema>;

/** Record of one agent run against a URL. */
export const agentRunRecordSchema = z.object({
  url: z.string(),
  status: sourceStatusSchema,
  run_id: z.string().nullable(),
  agent_status: z.string(),
  goal: z.string(),
  records_extracted: z.number(),
  error: z.string().optional(),
});

export type AgentRunRecord = z.infer<typeof agentRunRecordSchema>;

/** Counters summarising triage and agent activity for one acquisition phase. */
export const triageSummarySchema = z.object({
  pages_triaged: z.number(),
  by_status: z.record(z.string(), z.number()),
  extract_now: z.number(),
  agent_candidates: z.number(),
  agent_dispatched: z.number(),
  agent_deferred: z.number(),
  agent_succeeded: z.number(),
  agent_failed: z.number(),
  skipped: z.number(),
  records_from_extract: z.number(),
  records_from_agent: z.number(),
});

export type TriageSummary = z.infer<typeof triageSummarySchema>;

// Search/fetch/extract counters shared by the initial phase, each repair loop,
// and the run totals.
const phaseStatsSchema = z.object({
  search_queries_executed: z.number(),
  search_pages_paginated: z.number().optional(),
  search_results_collected: z.number(),
  unique_urls_selected: z.number(),
  pages_fetched: z.number(),
  pages_failed: z.number(),
  raw_records_extracted: z.number(),
  triage: triageSummarySchema.optional(),
});

/** Token and call counters for LLM usage. */
export const llmUsageReportSchema = z.object({
  prompt_tokens: z.number().int().nonnegative(),
  completion_tokens: z.number().int().nonnegative(),
  total_tokens: z.number().int().nonnegative(),
  call_count: z.number().int().nonnegative(),
});

/** Report for a single repair loop iteration (`loop_index` is positive). */
export const repairLoopReportSchema = z.object({
  loop_index: z.number().int().positive(),
  diagnosis_summary: z.string().optional(),
  repair_queries: z.array(z.string()),
  rationale: z.string().optional(),
  missing_fields: z.array(z.string()),
  records_before: z.number(),
  records_after: z.number(),
  fields_filled: z.record(z.string(), z.number()),
  partial_count_before: z.number().optional(),
  partial_count_after: z.number().optional(),
  stats: phaseStatsSchema,
});

export type RepairLoopReport = z.infer<typeof repairLoopReportSchema>;

/** Report for the whole repair stage: per-loop reports plus combined fields. */
export const repairReportSchema = z.object({
  attempted: z.boolean(),
  total_loops: z.number().int().nonnegative(),
  loops: z.array(repairLoopReportSchema),
  skipped_reason: z.string().optional(),
  missing_fields: z.array(z.string()),
  repair_queries: z.array(z.string()),
  rationale: z.string().optional(),
  records_before: z.number(),
  records_after: z.number(),
  fields_filled: z.record(z.string(), z.number()),
  stats: phaseStatsSchema,
  last_diagnosis: repairDiagnosisSchema.optional(),
});

/** Full report persisted for a pipeline run. */
export const runReportSchema = z.object({
  run_id: z.string(),
  /** Set when this run is a recurring refresh of a prior run. */
  refreshed_from_run_id: z.string().optional(),
  refresh_in_place: z.boolean().optional(),
  prompt: z.string(),
  target_rows: z.number(),
  started_at: z.string(),
  finished_at: z.string(),
  duration_ms: z.number(),
  dataset_spec: datasetSpecSchema,
  stats: phaseStatsSchema.extend({
    records_after_merge: z.number(),
    visualization_records: z.number().optional(),
  }),
  initial: phaseStatsSchema.extend({
    search_queries: z.array(z.string()),
    fetched_urls: z.array(z.string()),
    failed_urls: z.array(z.string()),
  }),
  repair: repairReportSchema,
  search_queries: z.array(z.string()),
  fetched_urls: z.array(z.string()),
  failed_urls: z.array(z.string()),
  errors: z.array(z.string()),
  quality: qualityReportSchema.optional(),
  sources: sourcesReportSchema.optional(),
  llm_usage: llmUsageReportSchema.optional(),
});

export type RunReport = z.infer<typeof runReportSchema>;

export type { QualityReport, RecordQuality, SourcesReport, SourceOutcome } from "./quality.js";
import { z } from "zod";

/** Triage classifications that can be assigned to a source page. */
export const sourceStatusSchema = z.enum([
  "extract_now",
  "requires_navigation",
  "requires_form_submission",
  "requires_detail_page_followup",
  "irrelevant",
  "duplicate",
  "blocked",
  "low_value",
]);

export type SourceStatus = z.infer<typeof sourceStatusSchema>;

/** Statuses for which `statusNeedsAgent` returns `true`. */
export const AGENT_STATUSES: SourceStatus[] = [
  "requires_navigation",
  "requires_form_submission",
  "requires_detail_page_followup",
];

/**
 * Reports whether a triage status is one of `AGENT_STATUSES`.
 *
 * @param status - Triage status of a source page.
 * @returns `true` for the navigation, form-submission, and detail-page
 *   follow-up statuses; `false` otherwise.
 */
export function statusNeedsAgent(status: SourceStatus): boolean {
  return AGENT_STATUSES.includes(status);
}
/**
 * Scores search candidates and returns the URLs to fetch, best first.
 *
 * Each occurrence of a URL adds to its score: 1 per occurrence, 0.5 for a
 * title longer than 10 characters, 0.5 for a snippet longer than 40
 * characters, plus the optional memory and source-policy boosts. After sorting
 * by score, only the highest-scoring URL of each domain is kept.
 *
 * @param candidates - Search results to rank.
 * @param excludeUrls - Normalized URLs that must not be returned.
 * @param limit - Maximum number of URLs to return; values below 0 are treated
 *   as 0.
 * @param memory - Optional workflow memory used for a per-domain boost.
 * @param sourcePolicy - Optional prompt-derived policy used for a per-candidate boost.
 * @returns Normalized URLs, at most one per domain.
 */
function rankCandidates(
  candidates: SourceCandidate[],
  excludeUrls: Set<string>,
  limit: number,
  memory?: WorkflowMemory,
  sourcePolicy?: PromptSourcePolicy,
): string[] {
  const byUrl = new Map<
    string,
    { url: string; score: number; domain: string }
  >();

  for (const candidate of candidates) {
    const url = normalizeUrl(candidate.url);
    if (excludeUrls.has(url)) continue;

    const domain = getDomain(url);
    // Start from the score accumulated by earlier occurrences of this URL.
    let score = byUrl.get(url)?.score ?? 0;
    score += 1;
    if (candidate.title.length > 10) score += 0.5;
    if (candidate.snippet.length > 40) score += 0.5;
    if (memory) score += domainMemoryBoost(memory, domain);
    if (sourcePolicy) score += sourceCandidatePolicyBoost(candidate, sourcePolicy);
    byUrl.set(url, { url, score, domain });
  }

  const domainsSeen = new Set<string>();
  return [...byUrl.values()]
    .sort((a, b) => b.score - a.score)
    .filter((item) => {
      if (domainsSeen.has(item.domain)) return false;
      domainsSeen.add(item.domain);
      return true;
    })
    .map((item) => item.url)
    // A negative limit would make slice() drop items from the end instead of
    // returning nothing.
    .slice(0, Math.max(0, limit));
}

/**
 * Runs one search → fetch → process pass and returns everything it collected.
 *
 * Steps, in order:
 * 1. Run every search plan (or page 0 of every query) through the search
 *    queue. A failed search is logged, appended to `options.errors`, and
 *    yields no candidates.
 * 2. Rank candidates and pick up to `maxUrlsToFetch` URLs.
 * 3. Fetch the URLs in batches. A failed batch is logged, appended to
 *    `options.errors`, and yields placeholder pages carrying the error.
 * 4. When link following is on (`enableLinkFollow`, defaulting to the source
 *    policy's `requiresOfficialSource`), fetch selected outbound links that
 *    were not already queued.
 * 5. Persist every fetched page, numbering from `pageIndexStart`.
 * 6. Process the pages into records and, when the policy requires official
 *    sources, drop records that do not match it.
 *
 * @param options - Phase configuration, shared `errors` sink, and logger.
 * @returns Candidates, fetched/failed URLs, pages, records, and triage/agent details.
 */
export async function runAcquisitionPhase(options: {
  label: string;
  userPrompt: string;
  spec: DatasetSpec;
  queries: string[];
  /** When set, runs Search with per-query page indices (repair pagination). */
  searches?: SearchPlan[];
  paths: RunPaths;
  errors: string[];
  excludeUrls: Set<string>;
  maxResultsPerQuery: number;
  maxUrlsToFetch: number;
  pageIndexStart: number;
  focusFields?: string[];
  knownEntityKeys?: string[];
  enableTriage?: boolean;
  enableTinyfishAgent?: boolean;
  agentPollTimeoutMs?: number;
  memory?: WorkflowMemory;
  forceAgent?: boolean;
  /** Fetch outbound links from high-value pages (repair). */
  enableLinkFollow?: boolean;
  log: (stage: string, message: string) => void;
}): Promise<AcquisitionResult> {
  const searchQueue = createSearchQueue();
  const fetchQueue = createFetchQueue();

  const searches: SearchPlan[] =
    options.searches ??
    options.queries.map((query) => ({ query, page: 0 }));

  options.log(
    options.label,
    `Running ${searches.length} searches (parallel, concurrency=${config.searchConcurrency})...`,
  );

  const searchBatches = await searchQueue.runAll(
    searches,
    async (plan) => {
      try {
        const results = await searchWeb(plan.query, plan.page);
        return results.slice(0, options.maxResultsPerQuery).map((result) => ({
          ...result,
          query: plan.query,
          search_page: plan.page,
        }));
      } catch (error) {
        // Best effort: one failed search must not abort the whole phase.
        const msg = `Search failed for "${plan.query}" (page ${plan.page}): ${
          error instanceof Error ? error.message : String(error)
        }`;
        options.errors.push(msg);
        options.log(options.label, `WARN ${msg}`);
        return [] as SourceCandidate[];
      }
    },
  );
  const candidates: SourceCandidate[] = searchBatches.flat();
  const sourcePolicy = derivePromptSourcePolicy(options.userPrompt);

  const urlsToFetch = rankCandidates(
    candidates,
    options.excludeUrls,
    options.maxUrlsToFetch,
    options.memory,
    sourcePolicy,
  );

  const fetchWithLinks =
    options.enableLinkFollow ?? sourcePolicy.requiresOfficialSource;
  const urlChunks = chunkUrls(urlsToFetch, config.fetchBatchSize);

  options.log(
    options.label,
    `Fetching ${urlsToFetch.length} URLs in ${urlChunks.length} parallel batches (concurrency=${config.fetchConcurrency})${fetchWithLinks ? " with outbound links" : ""}...`,
  );

  const fetchChunk = async (chunk: string[], includeLinks: boolean) => {
    try {
      return await fetchPages(chunk, { includeLinks });
    } catch (error) {
      // Best effort: report the batch as failed pages so callers still see
      // every requested URL.
      const msg = `Fetch batch failed: ${
        error instanceof Error ? error.message : String(error)
      }`;
      options.errors.push(msg);
      options.log(options.label, `WARN ${msg}`);
      return chunk.map((url) => ({
        url,
        final_url: url,
        title: "",
        text: "",
        error: msg,
      }));
    }
  };

  let fetchedPages: FetchedPage[] =
    urlChunks.length > 0
      ? (
          await fetchQueue.runAll(
            urlChunks,
            (chunk) => fetchChunk(chunk, fetchWithLinks),
            (chunk) => chunk.map((url) => getDomain(url)),
          )
        ).flat()
      : [];

  if (fetchWithLinks && fetchedPages.length > 0) {
    // urlsToFetch already holds normalized URLs, so a Set gives constant-time
    // membership checks while filtering.
    const alreadyQueued = new Set(urlsToFetch);
    const linkUrls = selectOutboundLinksToFollow({
      pages: fetchedPages,
      excludeUrls: options.excludeUrls,
      focusFields: options.focusFields,
      maxTotal: config.maxRepairLinkUrls,
      maxPerSource: config.maxLinksPerSourcePage,
      memory: options.memory,
    }).filter((url) => !alreadyQueued.has(normalizeUrl(url)));

    if (linkUrls.length > 0) {
      const linkChunks = chunkUrls(linkUrls, config.fetchBatchSize);
      options.log(
        options.label,
        `Following ${linkUrls.length} high-relevance outbound links...`,
      );
      const linkPages = (
        await fetchQueue.runAll(
          linkChunks,
          // Followed pages are fetched without their own outbound links.
          (chunk) => fetchChunk(chunk, false),
          (chunk) => chunk.map((url) => getDomain(url)),
        )
      ).flat();
      fetchedPages = [...fetchedPages, ...linkPages];
    }
  }

  let pageIndex = options.pageIndexStart;
  for (const page of fetchedPages) {
    await saveFetchedPage(options.paths, page, pageIndex);
    pageIndex += 1;
  }

  const failedUrls = fetchedPages
    .filter((page) => page.error)
    .map((page) => page.url);

  const processed = await processFetchedPages({
    label: options.label,
    userPrompt: options.userPrompt,
    spec: options.spec,
    pages: fetchedPages,
    paths: options.paths,
    errors: options.errors,
    focusFields: options.focusFields,
    knownEntityKeys: options.knownEntityKeys,
    enableTriage: options.enableTriage,
    // An explicit enableTinyfishAgent wins; otherwise forceAgent turns the
    // agent on, and the config value is the final fallback.
    enableTinyfishAgent:
      options.enableTinyfishAgent ??
      (options.forceAgent ? true : config.enableTinyfishAgent),
    agentPollTimeoutMs: options.agentPollTimeoutMs,
    memory: options.memory,
    log: options.log,
  });
  const records = sourcePolicy.requiresOfficialSource
    ? processed.records.filter((record) =>
        recordMatchesPromptSourcePolicy(record, options.spec, sourcePolicy),
      )
    : processed.records;
  const droppedRecords = processed.records.length - records.length;
  if (droppedRecords > 0) {
    options.log(
      options.label,
      `Dropped ${droppedRecords} record(s) that lacked entity-owned source URLs`,
    );
  }

  const allFetchedUrls = [
    ...new Set([
      ...urlsToFetch.map((url) => normalizeUrl(url)),
      ...fetchedPages.map((page) => normalizeUrl(page.url)),
    ]),
  ];

  return {
    candidates,
    fetchedUrls: allFetchedUrls,
    failedUrls,
    fetchedPages,
    records,
    pagesFetched: fetchedPages.length,
    triage: processed.summary,
    triageResults: processed.triageResults,
    agentRuns: processed.agentRuns,
    agentDeferred: processed.agentDeferred,
  };
}

/**
 * Collects the distinct primary-key values of the given records.
 *
 * @param spec - Dataset specification used to resolve each record's primary key.
 * @param records - Records to inspect; records without a primary key value are skipped.
 * @returns Unique primary-key values in first-seen order.
 */
export function entityKeysFromRecords(
  spec: DatasetSpec,
  records: ExtractedRecord[],
): string[] {
  const keys = new Set<string>();
  for (const record of records) {
    const pk = getPrimaryKeyValue(record, spec);
    if (pk) keys.add(pk);
  }
  return [...keys];
}
@@ +import { runWithLlmUsageScope, getCurrentLlmUsage, type LlmUsageTotals } from "../llm/usage.js"; +import { randomUUID } from "node:crypto"; +import { join } from "node:path"; +import { generateDatasetSpec } from "../agents/dataset-spec.js"; +import type { BenchmarkSpecContext } from "../agents/benchmark-spec.js"; +import { + analyzeCoverage, + type CoverageReport, +} from "../coverage/analyze.js"; +import { assertConfig, config } from "../config.js"; +import { selectVisualizationRecords } from "../export/select-results.js"; +import { + qualityMapFromReport, + writeEvidenceJsonl, + writeResultsCsv, + writeSegmentedRecordCsvs, + writeUnkeyedRecordsJsonl, +} from "../export/csv-compiler.js"; +import { mergeRecords, mergeRepairIntoExisting } from "../merge/records.js"; +import type { DatasetSpec, ExtractedRecord, RunReport } from "../models/schemas.js"; +import { + createWorkflowMemory, + loadPersistentMemory, + mergePersistentMemory, + recordCoverageGaps, + recordPhaseInMemory, + savePersistentMemory, + saveRunMemory, + snapshotExtractionSchema, + type WorkflowMemory, +} from "../memory/index.js"; +import { + agentExtractedUrls, + buildQualityReport, + buildSourcesReport, + mergeSourcesReports, + triageByUrl, +} from "../quality/index.js"; +import { entityKeysFromRecords, runAcquisitionPhase } from "./acquisition.js"; +import { runRepairLoops } from "./repair-loop.js"; +import { loadRunForRefresh, type LoadedRun } from "../storage/run-loader.js"; +import { + createRunStore, + saveDatasetSpec, + saveJson, + saveRunReport, + saveSourceCandidates, + type RunPaths, +} from "../storage/run-store.js"; +import { normalizeUrl } from "../utils/url.js"; + +export interface PipelineOptions { + prompt: string; + targetRows: number; + outputDir: string; + memoryDir?: string; + enableRepair?: boolean; + enableTriage?: boolean; + enableTinyfishAgent?: boolean; + /** Recurring refresh: baseline run to merge into (in-place by primary key). 
/** Everything a completed pipeline run hands back to its caller. */
export interface PipelineResult {
  runId: string;
  paths: RunPaths;
  report: RunReport;
  recordCount: number;
  records: ExtractedRecord[];
  visualizationRecords: ExtractedRecord[];
  llmUsage: LlmUsageTotals;
}

// Module-level log sink, printing to stdout by default.
// NOTE(review): this binding is shared by every caller of log() in this
// module — confirm that overlapping runs never need different sinks.
let pipelineLog: (stage: string, message: string) => void = (stage, message) => {
  console.log(`[${stage}] ${message}`);
};

/** Forwards a stage-tagged message to the current pipeline log sink. */
function log(stage: string, message: string): void {
  pipelineLog(stage, message);
}

/**
 * Converts an acquisition result into phase statistics.
 *
 * @param acquisition - The acquisition fields needed for the counters.
 * @param queryCount - Number of search queries executed in the phase.
 * @returns Counters for searches, selected URLs, fetched/failed pages, and
 *   extracted records, plus the acquisition's triage summary.
 */
function phaseStatsFromAcquisition(
  acquisition: {
    candidates: { length: number };
    fetchedUrls: string[];
    failedUrls: string[];
    records: ExtractedRecord[];
    pagesFetched: number;
    triage: import("../models/schemas.js").TriageSummary;
  },
  queryCount: number,
) {
  return {
    search_queries_executed: queryCount,
    search_results_collected: acquisition.candidates.length,
    unique_urls_selected: acquisition.fetchedUrls.length,
    pages_fetched: acquisition.pagesFetched,
    pages_failed: acquisition.failedUrls.length,
    raw_records_extracted: acquisition.records.length,
    triage: acquisition.triage,
  };
}

/** Returns repair statistics with every counter at zero and an empty triage summary. */
function emptyRepairStats(): RunReport["repair"]["stats"] {
  return {
    search_queries_executed: 0,
    search_results_collected: 0,
    unique_urls_selected: 0,
    pages_fetched: 0,
    pages_failed: 0,
    raw_records_extracted: 0,
    triage: {
      pages_triaged: 0,
      by_status: {},
      extract_now: 0,
      agent_candidates: 0,
      agent_dispatched: 0,
      agent_deferred: 0,
      agent_succeeded: 0,
      agent_failed: 0,
      skipped: 0,
      records_from_extract: 0,
      records_from_agent: 0,
    },
  };
}

/**
 * Sums the statistics of all repair loops into a single total.
 *
 * Besides the search/fetch/extract counters, this also sums each loop's triage
 * summary (including the per-status map) and `search_pages_paginated` when a
 * loop reports them, so the total reflects triage and agent activity during
 * repair instead of always reporting zeros.
 *
 * @param loops - Per-loop repair reports.
 * @returns Combined statistics; all zeros when `loops` is empty.
 */
function aggregateRepairStats(
  loops: RunReport["repair"]["loops"],
): RunReport["repair"]["stats"] {
  const stats = emptyRepairStats();
  const triageTotals = stats.triage;

  for (const loop of loops) {
    stats.search_queries_executed += loop.stats.search_queries_executed;
    stats.search_results_collected += loop.stats.search_results_collected;
    stats.unique_urls_selected += loop.stats.unique_urls_selected;
    stats.pages_fetched += loop.stats.pages_fetched;
    stats.pages_failed += loop.stats.pages_failed;
    stats.raw_records_extracted += loop.stats.raw_records_extracted;

    // Optional counter: only materialize it when some loop reports it.
    if (loop.stats.search_pages_paginated !== undefined) {
      stats.search_pages_paginated =
        (stats.search_pages_paginated ?? 0) + loop.stats.search_pages_paginated;
    }

    const loopTriage = loop.stats.triage;
    if (!triageTotals || !loopTriage) continue;

    triageTotals.pages_triaged += loopTriage.pages_triaged;
    triageTotals.extract_now += loopTriage.extract_now;
    triageTotals.agent_candidates += loopTriage.agent_candidates;
    triageTotals.agent_dispatched += loopTriage.agent_dispatched;
    triageTotals.agent_deferred += loopTriage.agent_deferred;
    triageTotals.agent_succeeded += loopTriage.agent_succeeded;
    triageTotals.agent_failed += loopTriage.agent_failed;
    triageTotals.skipped += loopTriage.skipped;
    triageTotals.records_from_extract += loopTriage.records_from_extract;
    triageTotals.records_from_agent += loopTriage.records_from_agent;
    for (const [status, count] of Object.entries(loopTriage.by_status)) {
      triageTotals.by_status[status] = (triageTotals.by_status[status] ?? 0) + count;
    }
  }
  return stats;
}

/**
 * Resolves the directory used for persistent workflow memory: the explicit
 * `memoryDir` option when given, otherwise a `memory` directory next to
 * `outputDir`.
 */
function memoryDirFor(options: PipelineOptions): string {
  return options.memoryDir ?? join(options.outputDir, "..", "memory");
}

/**
 * Runs the pipeline inside an LLM usage scope and attaches the usage totals
 * collected by that scope to the result.
 *
 * @param options - Pipeline configuration.
 * @returns The pipeline result with `llmUsage` set from the usage scope.
 */
export async function runPipeline(
  options: PipelineOptions,
): Promise<PipelineResult> {
  const { result, usage } = await runWithLlmUsageScope(() =>
    executeRunPipeline(options),
  );
  return { ...result, llmUsage: usage };
}
refreshSource.runId + : randomUUID().slice(0, 8); + const paths = await createRunStore(options.outputDir, runId); + const errors: string[] = []; + const fetchedUrlSet = new Set(); + if (refreshSource && !options.refetchUrls) { + for (const url of refreshSource.report.fetched_urls) { + fetchedUrlSet.add(normalizeUrl(url)); + } + } + let pageIndex = 0; + const targetRowCap = options.targetRows * 2; + + log( + "init", + refreshSource + ? `refresh run_id=${runId} from=${refreshSource.runId} in_place=${inPlaceRefresh} output=${paths.root}` + : `run_id=${runId} output=${paths.root}`, + ); + + let memory: WorkflowMemory = createWorkflowMemory(options.prompt); + if (refreshSource?.memory) { + memory = mergePersistentMemory(memory, refreshSource.memory); + log( + "memory", + `Loaded workflow memory from run ${refreshSource.runId} (${refreshSource.memory.query_stats.length} query stats)`, + ); + } + if (useMemory) { + const prior = await loadPersistentMemory( + memoryDirFor(options), + memory.prompt_fingerprint, + ); + memory = mergePersistentMemory(memory, prior); + if (prior && !refreshSource?.memory) { + log( + "memory", + `Loaded prior workflow memory (${prior.query_stats.length} query stats, ${prior.domain_stats.length} domain stats)`, + ); + } + } + + let spec: DatasetSpec; + let baselineRecords: ExtractedRecord[] = []; + + if (refreshSource) { + spec = refreshSource.spec; + baselineRecords = refreshSource.records; + memory.extraction_schema = snapshotExtractionSchema(spec); + memory.dedupe_keys = spec.dedupe_keys; + memory.repair_loop_count = 0; + await saveDatasetSpec(paths, spec); + log( + "refresh", + `Baseline ${baselineRecords.length} records — new search with prior diagnostics/memory`, + ); + } else { + log("spec", "Generating dataset specification..."); + spec = await generateDatasetSpec( + options.prompt, + options.targetRows, + useMemory ? 
memory : null, + options.benchmark, + ); + memory.extraction_schema = snapshotExtractionSchema(spec); + memory.dedupe_keys = spec.dedupe_keys; + await saveDatasetSpec(paths, spec); + } + + const initialQueries = spec.search_queries.slice(0, config.maxSearchQueries); + + const initialAcquisition = await runAcquisitionPhase({ + label: refreshSource ? "refresh" : "initial", + userPrompt: options.prompt, + spec, + queries: initialQueries, + paths, + errors, + excludeUrls: fetchedUrlSet, + maxResultsPerQuery: config.maxResultsPerQuery, + maxUrlsToFetch: config.maxUrlsToFetch, + pageIndexStart: pageIndex, + enableTriage, + enableTinyfishAgent, + agentPollTimeoutMs: options.agentPollTimeoutMs, + memory: useMemory ? memory : undefined, + log, + }); + + recordPhaseInMemory({ + memory, + spec, + phase: refreshSource ? "refresh" : "initial", + repairLoop: 0, + queries: initialQueries, + candidates: initialAcquisition.candidates, + records: initialAcquisition.records, + failedUrls: initialAcquisition.failedUrls, + agentRuns: initialAcquisition.agentRuns, + triageResults: initialAcquisition.triageResults, + }); + + if (initialAcquisition.triage.agent_dispatched > 0) { + log( + "triage", + `Initial: ${initialAcquisition.triage.extract_now} extract_now, ` + + `${initialAcquisition.triage.agent_succeeded}/${initialAcquisition.triage.agent_dispatched} agent runs succeeded`, + ); + } + + for (const url of initialAcquisition.fetchedUrls) { + fetchedUrlSet.add(normalizeUrl(url)); + } + pageIndex += initialAcquisition.pagesFetched; + + await saveSourceCandidates(paths, initialAcquisition.candidates); + + let mergeResult = refreshSource + ? 
mergeRepairIntoExisting( + spec, + baselineRecords, + initialAcquisition.records, + ) + : mergeRecords(spec, initialAcquisition.records); + let mergedRecords = mergeResult.records.slice(0, targetRowCap); + let benchmarkVisualizationRecords = mergedRecords; + let unkeyedRecords = mergeResult.unkeyed; + + let coverage: CoverageReport = analyzeCoverage(spec, mergedRecords); + recordCoverageGaps(memory, coverage); + await saveJson(join(paths.root, "coverage_initial.json"), coverage); + + const writeExports = async ( + csvPath: string, + evidencePath: string, + records: ExtractedRecord[], + qualityById?: ReturnType, + ) => { + await writeResultsCsv(csvPath, spec, records, qualityById); + await writeEvidenceJsonl(evidencePath, spec, records, qualityById); + }; + + log("export", `Writing init_results.csv (${mergedRecords.length} records)...`); + await writeExports(paths.initResultsPath, paths.initEvidencePath, mergedRecords); + + const allSearchQueries = [...initialQueries]; + const allFailedUrls = [...initialAcquisition.failedUrls]; + const recordsBeforeRepair = mergedRecords; + + let repairReport: RunReport["repair"] = { + attempted: false, + total_loops: 0, + loops: [], + missing_fields: [], + repair_queries: [], + records_before: mergedRecords.length, + records_after: mergedRecords.length, + fields_filled: {}, + stats: emptyRepairStats(), + }; + + const repairAcquisitions: typeof initialAcquisition[] = []; + + if (!enableRepair) { + repairReport.skipped_reason = "repair_disabled"; + log("repair", "Skipped (disabled)"); + } else if (!coverage.should_repair) { + repairReport.skipped_reason = "no_missing_required_fields"; + log( + "repair", + `Skipped (coverage satisfied) — required=[${coverage.required_columns.join(", ")}]`, + ); + } else { + repairReport.attempted = true; + repairReport.records_before = recordsBeforeRepair.length; + repairReport.missing_fields = coverage.field_gaps.map((gap) => gap.column); + + const repairResult = await runRepairLoops({ + ctx: { + 
userPrompt: options.prompt, + spec, + paths, + errors, + memory, + fetchedUrlSet, + allSearchQueries, + allFailedUrls, + enableTriage, + enableTinyfishAgent, + agentPollTimeoutMs: options.agentPollTimeoutMs, + targetRowCap, + log, + }, + recordsBeforeRepair, + initialCoverage: coverage, + pageIndexStart: pageIndex, + }); + + mergedRecords = repairResult.mergedRecords; + unkeyedRecords = [...unkeyedRecords, ...repairResult.unkeyedRecords]; + coverage = repairResult.coverage; + repairAcquisitions.push(...repairResult.repairAcquisitions); + + repairReport.total_loops = repairResult.loops.length; + repairReport.loops = repairResult.loops; + repairReport.last_diagnosis = repairResult.lastDiagnosis; + repairReport.records_after = mergedRecords.length; + repairReport.repair_queries = repairResult.loops.flatMap((loop) => loop.repair_queries); + repairReport.rationale = repairResult.lastDiagnosis?.summary; + repairReport.fields_filled = repairResult.loops.reduce( + (acc, loop) => { + for (const [key, value] of Object.entries(loop.fields_filled)) { + acc[key] = (acc[key] ?? 
0) + value; + } + return acc; + }, + {} as Record, + ); + repairReport.stats = aggregateRepairStats(repairResult.loops); + repairReport.missing_fields = coverage.field_gaps.map((gap) => gap.column); + + if (repairResult.loops.length > 0) { + log( + "export", + `Writing repair_results.csv (${mergedRecords.length} records after ${repairResult.loops.length} repair loop(s))...`, + ); + await writeExports( + paths.repairResultsPath, + paths.repairEvidencePath, + mergedRecords, + ); + } + } + + if (useMemory) { + await saveRunMemory(paths.root, memory); + await savePersistentMemory(memoryDirFor(options), memory); + log("memory", `Saved workflow memory (repair_loops=${memory.repair_loop_count})`); + } + + let qualityReport: RunReport["quality"]; + let sourcesReport: RunReport["sources"]; + + if (config.enableQualityScoring) { + log("quality", "Scoring records and building source outcomes..."); + + const allTriage = [ + ...initialAcquisition.triageResults, + ...repairAcquisitions.flatMap((a) => a.triageResults), + ]; + const allAgentRuns = [ + ...initialAcquisition.agentRuns, + ...repairAcquisitions.flatMap((a) => a.agentRuns), + ]; + + const scoreContext = { + triageByUrl: triageByUrl(allTriage), + agentExtractedUrls: agentExtractedUrls(allAgentRuns), + }; + + qualityReport = buildQualityReport( + spec, + mergedRecords, + scoreContext, + unkeyedRecords.length, + ); + + const initialSources = buildSourcesReport({ + phase: "initial", + fetchedPages: initialAcquisition.fetchedPages, + fetchedUrls: initialAcquisition.fetchedUrls, + triageResults: initialAcquisition.triageResults, + agentRuns: initialAcquisition.agentRuns, + agentDeferred: initialAcquisition.agentDeferred, + }); + + const repairSourcesList = repairAcquisitions.map((acquisition, index) => + buildSourcesReport({ + phase: "repair", + fetchedPages: acquisition.fetchedPages, + fetchedUrls: acquisition.fetchedUrls, + triageResults: acquisition.triageResults, + agentRuns: acquisition.agentRuns, + agentDeferred: 
acquisition.agentDeferred, + }), + ); + + sourcesReport = repairSourcesList.reduce( + (acc, report) => mergeSourcesReports(acc, report), + initialSources, + ); + + await saveJson(join(paths.root, "quality_report.json"), qualityReport); + await saveJson(join(paths.root, "sources_outcomes.json"), sourcesReport); + + if (unkeyedRecords.length > 0) { + await writeUnkeyedRecordsJsonl( + join(paths.root, "records_unkeyed.jsonl"), + unkeyedRecords, + ); + } + + await writeSegmentedRecordCsvs( + paths.root, + spec, + mergedRecords, + qualityReport.records, + ); + + const qualityById = qualityMapFromReport(qualityReport.records); + benchmarkVisualizationRecords = config.enableSelectiveResults + ? selectVisualizationRecords(spec, mergedRecords, qualityById) + : mergedRecords; + + log( + "quality", + `complete=${qualityReport.complete.count} partial=${qualityReport.partial.count} ` + + `low_confidence=${qualityReport.low_confidence.count} needs_review=${qualityReport.needs_review.count} ` + + `visualization=${benchmarkVisualizationRecords.length}`, + ); + + if (config.enableSelectiveResults) { + log( + "export", + `Writing results_full.csv (${mergedRecords.length} records)...`, + ); + await writeExports( + paths.resultsFullPath, + paths.evidenceFullPath, + mergedRecords, + qualityById, + ); + log( + "export", + `Writing results.csv (${benchmarkVisualizationRecords.length} selective records)...`, + ); + await writeExports( + paths.resultsPath, + paths.evidencePath, + benchmarkVisualizationRecords, + qualityById, + ); + } else { + log("export", `Writing results.csv (${mergedRecords.length} records)...`); + await writeExports( + paths.resultsPath, + paths.evidencePath, + mergedRecords, + qualityById, + ); + } + } else { + log("export", `Writing results.csv (${mergedRecords.length} records)...`); + await writeExports(paths.resultsPath, paths.evidencePath, mergedRecords); + } + + const finishedAt = new Date(); + const initialStats = phaseStatsFromAcquisition( + initialAcquisition, 
+ initialQueries.length, + ); + + const visualizationCount = benchmarkVisualizationRecords.length; + + const llmUsage = getCurrentLlmUsage(); + + const report: RunReport = { + run_id: runId, + ...(refreshSource + ? { + refreshed_from_run_id: refreshSource.runId, + refresh_in_place: inPlaceRefresh, + } + : {}), + prompt: options.prompt, + target_rows: options.targetRows, + started_at: startedAt.toISOString(), + finished_at: finishedAt.toISOString(), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + dataset_spec: spec, + stats: { + ...initialStats, + search_queries_executed: + initialStats.search_queries_executed + + repairReport.stats.search_queries_executed, + search_results_collected: + initialStats.search_results_collected + + repairReport.stats.search_results_collected, + unique_urls_selected: + initialStats.unique_urls_selected + + repairReport.stats.unique_urls_selected, + pages_fetched: + initialStats.pages_fetched + repairReport.stats.pages_fetched, + pages_failed: + initialStats.pages_failed + repairReport.stats.pages_failed, + raw_records_extracted: + initialStats.raw_records_extracted + + repairReport.stats.raw_records_extracted, + records_after_merge: mergedRecords.length, + visualization_records: visualizationCount, + }, + initial: { + ...initialStats, + search_queries: initialQueries, + fetched_urls: initialAcquisition.fetchedUrls, + failed_urls: initialAcquisition.failedUrls, + }, + repair: repairReport, + search_queries: allSearchQueries, + fetched_urls: [...fetchedUrlSet], + failed_urls: allFailedUrls, + errors, + quality: qualityReport, + sources: sourcesReport, + llm_usage: { + prompt_tokens: llmUsage.promptTokens, + completion_tokens: llmUsage.completionTokens, + total_tokens: llmUsage.totalTokens, + call_count: llmUsage.callCount, + }, + }; + + await saveRunReport(paths, report); + + log("done", `results → ${paths.resultsPath}`); + return { + runId, + paths, + report, + recordCount: mergedRecords.length, + records: mergedRecords, + 
/** Returns the default output directory for runs: `<cwd>/runs`. */
export function defaultRunsDir(): string {
  return join(process.cwd(), "runs");
}

/** Returns the default memory directory: `<cwd>/memory`. */
export function defaultMemoryDir(): string {
  return join(process.cwd(), "memory");
}

/**
 * Re-runs the pipeline using a previously saved run as the starting point.
 *
 * The prior run is loaded via `loadRunForRefresh`; its prompt is always
 * reused, and its `target_rows` is used unless `options.targetRows` is given.
 * The loaded run is handed to `runPipeline` as `refreshFrom`.
 *
 * The return type is derived from `runPipeline` so the two can never drift
 * apart (the original type argument was lost from this text).
 *
 * @param options.fromRunId   Id of the run to refresh from.
 * @param options.outputDir   Directory the prior run is loaded from and the
 *                            new run is written to.
 * @param options.inPlace     Forwarded to `runPipeline` as `refreshInPlace`.
 * @param options.refetchUrls Forwarded to `runPipeline` unchanged.
 * @returns Whatever `runPipeline` resolves to.
 * @throws Error when the loaded run contains no records.
 */
export async function runRefreshPipeline(options: {
  fromRunId: string;
  outputDir: string;
  memoryDir?: string;
  targetRows?: number;
  inPlace?: boolean;
  refetchUrls?: boolean;
  enableRepair?: boolean;
  enableTriage?: boolean;
  enableTinyfishAgent?: boolean;
}): Promise<Awaited<ReturnType<typeof runPipeline>>> {
  const loaded = await loadRunForRefresh(options.outputDir, options.fromRunId);
  if (loaded.records.length === 0) {
    throw new Error(
      `Run ${options.fromRunId} has no records in evidence.jsonl — cannot refresh`,
    );
  }

  return runPipeline({
    prompt: loaded.report.prompt,
    targetRows: options.targetRows ?? loaded.report.target_rows,
    outputDir: options.outputDir,
    memoryDir: options.memoryDir,
    enableRepair: options.enableRepair,
    enableTriage: options.enableTriage,
    enableTinyfishAgent: options.enableTinyfishAgent,
    refreshFrom: loaded,
    refreshInPlace: options.inPlace,
    refetchUrls: options.refetchUrls,
  });
}
/** A page whose triage status called for the agent but which was not dispatched. */
export interface AgentDeferredEntry {
  url: string;
  status: SourceStatus;
  /** `agent_budget`: over the per-phase run cap; `agent_disabled`: agent turned off. */
  reason: "agent_budget" | "agent_disabled";
}

/** Everything `processFetchedPages` produces for one batch of fetched pages. */
export interface ProcessPagesResult {
  records: ExtractedRecord[];
  triageResults: SourceTriageResult[];
  agentRuns: AgentRunRecord[];
  agentDeferred: AgentDeferredEntry[];
  summary: TriageSummary;
}

/**
 * Builds a fresh triage summary with every counter at zero and an empty
 * `by_status` map. A new object is created on each call, so callers may
 * mutate the result freely.
 */
function emptySummary(): TriageSummary {
  const blank: TriageSummary = {
    pages_triaged: 0,
    by_status: {},
    extract_now: 0,
    agent_candidates: 0,
    agent_dispatched: 0,
    agent_deferred: 0,
    agent_succeeded: 0,
    agent_failed: 0,
    skipped: 0,
    records_from_extract: 0,
    records_from_agent: 0,
  };
  return blank;
}
/** Increments the counter for `status` in `summary.by_status`, starting from zero. */
function bumpStatus(summary: TriageSummary, status: SourceStatus): void {
  summary.by_status[status] = (summary.by_status[status] ?? 0) + 1;
}

/**
 * Decides whether a page that triage marked as needing navigation or a
 * detail-page follow-up should still be extracted directly (used when the
 * agent is disabled and the prompt requires an official source).
 *
 * Returns `true` only when all of the following hold:
 *  - `status` is `requires_navigation` or `requires_detail_page_followup`;
 *  - `url` parses as a URL and is not the site root;
 *  - the lower-cased path plus query does not look like a login/sign-up/home
 *    landing page;
 *  - the lower-cased path plus query contains one of the intent keywords
 *    (pricing, billing, docs, earnings, news, blog, ...).
 *
 * Unparseable URLs yield `false`.
 */
function shouldFallbackExtractOfficialNavigation(
  url: string,
  status: SourceStatus,
): boolean {
  if (
    status !== "requires_navigation" &&
    status !== "requires_detail_page_followup"
  ) {
    return false;
  }

  try {
    const parsed = new URL(url);
    // The root check looks at the pathname alone: a tracking query such as
    // "/?utm_source=news" must not turn the home page into an "intent" page.
    if (parsed.pathname === "/") {
      return false;
    }
    const path = `${parsed.pathname}${parsed.search}`.toLowerCase();
    if (/(?:login|signin|signup|default\.aspx|home)(?:\/|$|\?)/.test(path)) {
      return false;
    }
    return /(?:pricing|billing|docs|documentation|mcp|model-context-protocol|earnings|press-release|quarterly|results|news|blog)/.test(
      path,
    );
  } catch {
    return false;
  }
}

/**
 * Turns a batch of fetched pages into extracted records.
 *
 * Flow:
 *  1. Drop pages that have an error or only whitespace text.
 *  2. If triage is disabled, extract every remaining page directly and return.
 *  3. Otherwise triage each page (a triage failure falls back to
 *     `extract_now`), write `triage_<label>.json`, and route each page to
 *     direct extraction, the agent queue, a deferred list, or skip.
 *  4. Run direct extraction, then dispatch up to
 *     `config.maxAgentRunsPerPhase` agent jobs and extract from their results.
 *  5. Write `agent_runs_<label>.json` when any agent run was recorded.
 *
 * Failures of individual pages are appended to `options.errors` instead of
 * being thrown by this function's own handlers.
 *
 * @param options.enableTriage        Overrides `config.enableTriage` when set.
 * @param options.enableTinyfishAgent Overrides `config.enableTinyfishAgent` when set.
 * @param options.knownEntityKeys     Seed for the known-key set passed to triage.
 * @returns Records, triage results, agent run log, deferred entries and the
 *          counters summary for this batch.
 */
export async function processFetchedPages(options: {
  label: string;
  userPrompt: string;
  spec: DatasetSpec;
  pages: FetchedPage[];
  paths: RunPaths;
  errors: string[];
  focusFields?: string[];
  knownEntityKeys?: string[];
  enableTriage?: boolean;
  enableTinyfishAgent?: boolean;
  agentPollTimeoutMs?: number;
  memory?: WorkflowMemory;
  log: (stage: string, message: string) => void;
}): Promise<ProcessPagesResult> {
  const triageEnabled = options.enableTriage ?? config.enableTriage;
  const agentEnabled = options.enableTinyfishAgent ?? config.enableTinyfishAgent;
  const summary = emptySummary();
  const records: ExtractedRecord[] = [];
  const agentRuns: AgentRunRecord[] = [];
  const knownKeys = new Set<string>(options.knownEntityKeys ?? []);
  const sourcePolicy = derivePromptSourcePolicy(options.userPrompt);

  const successfulPages = options.pages.filter(
    (page) => !page.error && page.text.trim().length > 0,
  );

  if (successfulPages.length === 0) {
    return {
      records: [],
      triageResults: [],
      agentRuns: [],
      agentDeferred: [],
      summary,
    };
  }

  const extractionQueue = createExtractionQueue();

  // Shared by both direct-extraction paths: a failing page is recorded in
  // `options.errors` and contributes no records.
  const extractOrRecordError = async (
    page: FetchedPage,
  ): Promise<ExtractedRecord[]> => {
    try {
      return await extractFromPage(options.spec, page, {
        focusFields: options.focusFields,
        memory: options.memory,
      });
    } catch (error) {
      const msg = `Extraction failed for ${page.final_url || page.url}: ${
        error instanceof Error ? error.message : String(error)
      }`;
      options.errors.push(msg);
      return [];
    }
  };

  if (!triageEnabled) {
    options.log(
      options.label,
      `Triage disabled — extracting all pages (parallel, concurrency=${config.extractionConcurrency})...`,
    );
    const extracted = await extractionQueue.runAll(
      successfulPages,
      (page) => extractOrRecordError(page),
      (page) => [getDomain(page.final_url || page.url)],
    );
    const flat = extracted.flat();
    summary.pages_triaged = successfulPages.length;
    summary.extract_now = successfulPages.length;
    summary.records_from_extract = flat.length;
    return {
      records: flat,
      triageResults: [],
      agentRuns: [],
      agentDeferred: [],
      summary,
    };
  }

  const triageQueue = createTriageQueue();

  options.log(
    options.label,
    `Triaging ${successfulPages.length} pages (parallel, concurrency=${config.triageConcurrency})...`,
  );

  const triageResults = await triageQueue.runAll(
    successfulPages,
    async (page) => {
      try {
        return await triagePage({
          userPrompt: options.userPrompt,
          spec: options.spec,
          page,
          knownEntityKeys: [...knownKeys],
          memory: options.memory,
        });
      } catch (error) {
        const pageUrl = page.final_url || page.url;
        const msg = `Triage failed for ${pageUrl}: ${
          error instanceof Error ? error.message : String(error)
        }`;
        options.errors.push(msg);
        options.log(options.label, `WARN ${msg}`);
        // A triage failure must not lose the page: fall back to direct
        // extraction with low confidence values.
        return {
          url: page.url,
          final_url: pageUrl,
          title: page.title,
          status: "extract_now" as const,
          confidence: 0.3,
          source_data_confidence: 0.35,
          expected_yield: "partial" as const,
          reasoning: "Triage failed; falling back to direct extraction.",
        };
      }
    },
    (page) => [getDomain(page.final_url || page.url)],
  );

  summary.pages_triaged = triageResults.length;
  await saveJson(
    join(options.paths.root, `triage_${options.label}.json`),
    triageResults,
  );

  const pageByUrl = new Map<string, FetchedPage>(
    successfulPages.map((page) => [page.final_url || page.url, page]),
  );

  const extractPages: { page: FetchedPage; triage: SourceTriageResult }[] = [];
  const agentQueue: { page: FetchedPage; triage: SourceTriageResult }[] = [];
  const agentDisabledDeferredEntries: AgentDeferredEntry[] = [];

  // Route every triaged page: direct extraction, agent queue, deferred, or skip.
  for (const triage of triageResults) {
    bumpStatus(summary, triage.status);

    const page = pageByUrl.get(triage.final_url) ?? pageByUrl.get(triage.url);
    if (!page) continue;

    if (triage.status === "extract_now") {
      summary.extract_now += 1;
      extractPages.push({ page, triage });
    } else if (statusNeedsAgent(triage.status)) {
      summary.agent_candidates += 1;
      if (agentEnabled) {
        agentQueue.push({ page, triage });
      } else if (
        sourcePolicy.requiresOfficialSource &&
        shouldFallbackExtractOfficialNavigation(triage.final_url, triage.status)
      ) {
        options.log(
          options.label,
          `Agent disabled — intent-path fallback extract for ${triage.final_url} [${triage.status}]`,
        );
        extractPages.push({ page, triage });
      } else if (sourcePolicy.requiresOfficialSource) {
        summary.skipped += 1;
        agentDisabledDeferredEntries.push({
          url: triage.final_url || page.url,
          status: triage.status,
          reason: "agent_disabled",
        });
        options.log(
          options.label,
          `Agent disabled — skip navigation-only official source ${triage.final_url} [${triage.status}]`,
        );
      } else {
        options.log(
          options.label,
          `Agent disabled — fallback extract for ${triage.final_url} [${triage.status}]`,
        );
        extractPages.push({ page, triage });
      }
    } else {
      summary.skipped += 1;
      options.log(
        options.label,
        `Skip ${triage.final_url} [${triage.status}]: ${triage.reasoning.slice(0, 80)}`,
      );
    }
  }

  if (extractPages.length > 0) {
    options.log(
      options.label,
      `Direct extraction on ${extractPages.length} pages (parallel, concurrency=${config.extractionConcurrency})...`,
    );
    const extracted = await extractionQueue.runAll(
      extractPages,
      ({ page }) => extractOrRecordError(page),
      ({ page }) => [getDomain(page.final_url || page.url)],
    );
    for (const batch of extracted) {
      for (const record of batch) {
        records.push(record);
        const pk = getPrimaryKeyValue(record, options.spec);
        if (pk) knownKeys.add(pk);
      }
    }
    // Only direct-extraction records exist at this point.
    summary.records_from_extract = records.length;
  }

  const agentBudget = agentEnabled ? config.maxAgentRunsPerPhase : 0;
  const toRun = agentQueue.slice(0, agentBudget);
  const deferredEntries: AgentDeferredEntry[] = [
    ...agentDisabledDeferredEntries,
    ...agentQueue
      .slice(agentBudget)
      .map(({ page, triage }) => ({
        url: triage.final_url || page.url,
        status: triage.status,
        reason: "agent_budget" as const,
      })),
  ];

  if (deferredEntries.length > 0) {
    options.log(
      options.label,
      `Agent capability: running ${toRun.length}/${agentQueue.length} (${deferredEntries.length} deferred)`,
    );
  }

  summary.agent_dispatched = toRun.length;
  summary.agent_deferred = deferredEntries.length;

  if (toRun.length > 0) {
    options.log(
      options.label,
      `Tinyfish Agent on ${toRun.length} pages (async queue + poll, queue=${config.agentQueueConcurrency}, poll=${config.agentPollConcurrency})...`,
    );

    const agentGoalQueue = createAgentQueue();

    const jobsWithGoals = await agentGoalQueue.runAll(
      toRun,
      async ({ page, triage }) => {
        const pageUrl = triage.final_url || page.url;
        try {
          const agentGoal = await generateAgentGoal({
            userPrompt: options.userPrompt,
            spec: options.spec,
            triage,
            focusFields: options.focusFields,
            memory: options.memory,
          });
          return {
            page,
            triage,
            pageUrl,
            goal: agentGoal.goal,
            goalError: null as string | null,
          };
        } catch (error) {
          const msg = error instanceof Error ? error.message : String(error);
          options.errors.push(`Agent goal failed for ${pageUrl}: ${msg}`);
          return { page, triage, pageUrl, goal: "", goalError: msg };
        }
      },
      ({ page }) => [getDomain(page.final_url || page.url)],
    );

    // Jobs whose goal generation failed are logged as FAILED and never queued.
    const queuedJobs: typeof jobsWithGoals = [];
    for (const job of jobsWithGoals) {
      if (job.goalError) {
        summary.agent_failed += 1;
        agentRuns.push({
          url: job.pageUrl,
          status: job.triage.status,
          run_id: null,
          agent_status: "FAILED",
          goal: "",
          records_extracted: 0,
          error: job.goalError,
        });
        continue;
      }
      queuedJobs.push(job);
    }

    // Skip the batch call entirely when every goal failed.
    if (queuedJobs.length > 0) {
      const agentRunResults = await runTinyfishAgentsBatch(
        queuedJobs.map((job) => ({ url: job.pageUrl, goal: job.goal })),
        { pollTimeoutMs: options.agentPollTimeoutMs },
      );

      // Results are paired with jobs by position. A missing entry is handled
      // as a failed run below rather than asserted to exist.
      const jobsToExtract = queuedJobs.map((job, batchIndex) => ({
        job,
        run: agentRunResults[batchIndex],
      }));

      await extractionQueue.runAll(
        jobsToExtract,
        async ({ job, run }) => {
          const pageUrl = job.pageUrl;

          if (!run || run.error || !run.result) {
            summary.agent_failed += 1;
            agentRuns.push({
              url: pageUrl,
              status: job.triage.status,
              run_id: run?.run_id ?? null,
              agent_status: run?.status ?? "FAILED",
              goal: job.goal,
              records_extracted: 0,
              error: run?.error ?? "No result returned",
            });
            options.log(
              options.label,
              `WARN Agent failed ${pageUrl}: ${run?.error ?? "no result"}`,
            );
            return;
          }

          try {
            const agentRecords = await extractFromAgentResult({
              spec: options.spec,
              pageUrl,
              agentResult: run.result,
              focusFields: options.focusFields,
              memory: options.memory,
            });

            summary.agent_succeeded += 1;
            for (const record of agentRecords) {
              records.push(record);
              const pk = getPrimaryKeyValue(record, options.spec);
              if (pk) knownKeys.add(pk);
            }
            summary.records_from_agent += agentRecords.length;

            agentRuns.push({
              url: pageUrl,
              status: job.triage.status,
              run_id: run.run_id,
              agent_status: run.status,
              goal: job.goal,
              records_extracted: agentRecords.length,
            });

            options.log(
              options.label,
              `Agent OK ${pageUrl} → ${agentRecords.length} records`,
            );
          } catch (error) {
            summary.agent_failed += 1;
            const msg = error instanceof Error ? error.message : String(error);
            options.errors.push(`Agent extract failed for ${pageUrl}: ${msg}`);
            agentRuns.push({
              url: pageUrl,
              status: job.triage.status,
              run_id: run.run_id,
              agent_status: run.status,
              goal: job.goal,
              records_extracted: 0,
              error: msg,
            });
          }
        },
        ({ job }) => [getDomain(job.pageUrl)],
      );
    }
  }

  if (agentRuns.length > 0) {
    await saveJson(
      join(options.paths.root, `agent_runs_${options.label}.json`),
      agentRuns,
    );
  }

  return {
    records,
    triageResults,
    agentRuns,
    agentDeferred: deferredEntries,
    summary,
  };
}
/**
 * Shared state handed to the repair loops. Several members are mutated in
 * place by `runRepairLoops`: `memory`, `fetchedUrlSet`, `allSearchQueries`
 * and `allFailedUrls`.
 */
export interface RepairLoopContext {
  userPrompt: string;
  spec: DatasetSpec;
  paths: RunPaths;
  errors: string[];
  memory: WorkflowMemory;
  /** Normalized URLs already fetched; excluded from repair fetches and extended per loop. */
  fetchedUrlSet: Set<string>;
  allSearchQueries: string[];
  allFailedUrls: string[];
  enableTriage: boolean;
  enableTinyfishAgent: boolean;
  agentPollTimeoutMs?: number;
  /** Upper bound applied to the merged record list after every loop. */
  targetRowCap: number;
  log: (stage: string, message: string) => void;
}

/** Outcome of `runRepairLoops`. */
export interface RepairLoopRunResult {
  mergedRecords: ExtractedRecord[];
  unkeyedRecords: ExtractedRecord[];
  coverage: CoverageReport;
  loops: RepairLoopReport[];
  lastDiagnosis?: import("../memory/types.js").RepairDiagnosis;
  repairAcquisitions: AcquisitionResult[];
  sourcesReports: SourcesReport[];
}

/**
 * Runs diagnose → query → acquire → merge loops until coverage no longer
 * asks for repair or `ctx.memory.repair_loop_count` reaches
 * `config.maxRepairLoops`.
 *
 * Each loop:
 *  1. generates a diagnosis and writes `repair_diagnosis_<n>.json`;
 *  2. generates repair queries, plans searches and writes `repair_queries_<n>.json`;
 *  3. runs an acquisition phase labelled `repair_<n>`, focused on the missing columns;
 *  4. records the phase in memory and extends the fetched-URL, query and failed-URL lists on `ctx`;
 *  5. merges new records into the existing ones, capped at `ctx.targetRowCap`;
 *  6. re-analyses coverage and writes `coverage_repair_<n>.json`.
 *
 * NOTE(review): the loop counter lives on `ctx.memory`, so a memory object
 * that arrives with a non-zero `repair_loop_count` gets fewer loops — confirm
 * that callers reset it when that is not intended.
 *
 * @param options.recordsBeforeRepair Records to repair; not mutated here.
 * @param options.initialCoverage     Coverage computed for those records.
 * @param options.pageIndexStart      Page index passed to the first acquisition.
 * @returns Final merged records, unkeyed records collected during repair,
 *          last coverage, one report per loop, and the per-loop acquisitions.
 */
export async function runRepairLoops(options: {
  ctx: RepairLoopContext;
  recordsBeforeRepair: ExtractedRecord[];
  initialCoverage: CoverageReport;
  pageIndexStart: number;
}): Promise<RepairLoopRunResult> {
  const { ctx } = options;
  let mergedRecords = options.recordsBeforeRepair;
  let unkeyedRecords: ExtractedRecord[] = [];
  let coverage = options.initialCoverage;
  let pageIndex = options.pageIndexStart;

  const loops: RepairLoopReport[] = [];
  const repairAcquisitions: AcquisitionResult[] = [];
  const sourcesReports: SourcesReport[] = [];
  let lastDiagnosis: import("../memory/types.js").RepairDiagnosis | undefined;

  recordCoverageGaps(ctx.memory, coverage);

  if (!coverage.should_repair) {
    return {
      mergedRecords,
      unkeyedRecords,
      coverage,
      loops,
      repairAcquisitions,
      sourcesReports,
    };
  }

  while (
    coverage.should_repair &&
    ctx.memory.repair_loop_count < config.maxRepairLoops
  ) {
    const loopIndex = ctx.memory.repair_loop_count + 1;
    ctx.memory.repair_loop_count = loopIndex;

    const recordsBeforeLoop = mergedRecords;
    const partialBefore = coverage.partial_count;
    // Columns still missing at the START of this loop; `coverage` is only
    // replaced at the very end of the iteration.
    const missingFields = coverage.field_gaps.map((gap) => gap.column);

    ctx.log(
      "repair",
      `Loop ${loopIndex}/${config.maxRepairLoops} — missing: ${missingFields.join(", ")}`,
    );

    const diagnosis = await generateRepairDiagnosis({
      userPrompt: ctx.userPrompt,
      spec: ctx.spec,
      coverage,
      memory: ctx.memory,
      repairLoop: loopIndex,
      maxRepairLoops: config.maxRepairLoops,
    });
    lastDiagnosis = diagnosis;
    recordDiagnosis(ctx.memory, loopIndex, diagnosis);

    await saveJson(
      join(ctx.paths.root, `repair_diagnosis_${loopIndex}.json`),
      diagnosis,
    );

    const repairPlan = await generateRepairQueries({
      userPrompt: ctx.userPrompt,
      spec: ctx.spec,
      coverage,
      priorSearchQueries: ctx.allSearchQueries,
      maxQueries: config.maxRepairQueries,
      memory: ctx.memory,
      diagnosis,
      repairLoop: loopIndex,
    });

    const repairSearches = planRepairSearches(
      ctx.memory,
      repairPlan.repair_queries,
    );
    const paginatedCount = repairSearches.filter((plan) => plan.page > 0).length;

    await saveJson(join(ctx.paths.root, `repair_queries_${loopIndex}.json`), {
      ...repairPlan,
      repair_searches: repairSearches,
    });

    ctx.log(
      "repair",
      `Loop ${loopIndex}: ${repairSearches.length} searches (${repairPlan.repair_queries.length} new, ${paginatedCount} paginated) — ${diagnosis.summary.slice(0, 100)}`,
    );

    // The agent is only forced when the diagnosis asks for it AND it is enabled.
    const preferAgent =
      diagnosis.prefer_tinyfish_agent && ctx.enableTinyfishAgent;

    const acquisition = await runAcquisitionPhase({
      label: `repair_${loopIndex}`,
      userPrompt: ctx.userPrompt,
      spec: ctx.spec,
      queries: repairSearches.map((plan) => plan.query),
      searches: repairSearches,
      paths: ctx.paths,
      errors: ctx.errors,
      excludeUrls: ctx.fetchedUrlSet,
      maxResultsPerQuery: config.maxRepairResultsPerQuery,
      maxUrlsToFetch: config.maxRepairUrlsToFetch,
      pageIndexStart: pageIndex,
      focusFields: missingFields,
      knownEntityKeys: entityKeysFromRecords(ctx.spec, recordsBeforeLoop),
      enableTriage: ctx.enableTriage,
      enableTinyfishAgent: ctx.enableTinyfishAgent,
      agentPollTimeoutMs: ctx.agentPollTimeoutMs,
      memory: ctx.memory,
      forceAgent: preferAgent,
      enableLinkFollow: config.enableRepairLinkFollow,
      log: ctx.log,
    });

    markSearchPagesUsed(
      ctx.memory,
      repairSearches,
      `repair_${loopIndex}`,
      loopIndex,
    );

    repairAcquisitions.push(acquisition);
    pageIndex += acquisition.pagesFetched;

    recordPhaseInMemory({
      memory: ctx.memory,
      spec: ctx.spec,
      phase: `repair_${loopIndex}`,
      repairLoop: loopIndex,
      queries: repairSearches.map((plan) => plan.query),
      candidates: acquisition.candidates,
      records: acquisition.records,
      failedUrls: acquisition.failedUrls,
      agentRuns: acquisition.agentRuns,
      triageResults: acquisition.triageResults,
    });

    for (const url of acquisition.fetchedUrls) {
      ctx.fetchedUrlSet.add(normalizeUrl(url));
    }
    ctx.allSearchQueries.push(...repairPlan.repair_queries);
    ctx.allFailedUrls.push(...acquisition.failedUrls);

    sourcesReports.push(
      buildSourcesReport({
        phase: "repair",
        fetchedPages: acquisition.fetchedPages,
        fetchedUrls: acquisition.fetchedUrls,
        triageResults: acquisition.triageResults,
        agentRuns: acquisition.agentRuns,
        agentDeferred: acquisition.agentDeferred,
      }),
    );

    const mergeResult = mergeRepairIntoExisting(
      ctx.spec,
      recordsBeforeLoop,
      acquisition.records,
    );
    mergedRecords = mergeResult.records.slice(0, ctx.targetRowCap);
    unkeyedRecords = [...unkeyedRecords, ...mergeResult.unkeyed];

    const coverageAfter = analyzeCoverage(ctx.spec, mergedRecords);
    await saveJson(
      join(ctx.paths.root, `coverage_repair_${loopIndex}.json`),
      coverageAfter,
    );

    const fieldsFilled = countFilledGaps(
      ctx.spec,
      recordsBeforeLoop,
      mergedRecords,
      missingFields,
    );

    loops.push({
      loop_index: loopIndex,
      diagnosis_summary: diagnosis.summary,
      repair_queries: repairPlan.repair_queries,
      rationale: repairPlan.rationale,
      missing_fields: missingFields,
      records_before: recordsBeforeLoop.length,
      records_after: mergedRecords.length,
      fields_filled: fieldsFilled,
      partial_count_before: partialBefore,
      partial_count_after: coverageAfter.partial_count,
      stats: {
        search_queries_executed: repairSearches.length,
        search_pages_paginated: paginatedCount,
        search_results_collected: acquisition.candidates.length,
        unique_urls_selected: acquisition.fetchedUrls.length,
        pages_fetched: acquisition.pagesFetched,
        pages_failed: acquisition.failedUrls.length,
        raw_records_extracted: acquisition.records.length,
        triage: acquisition.triage,
      },
    });

    ctx.log(
      "repair",
      `Loop ${loopIndex} done — ${mergedRecords.length} records, partial ${partialBefore} → ${coverageAfter.partial_count}`,
    );

    coverage = coverageAfter;
    recordCoverageGaps(ctx.memory, coverage);
  }

  if (coverage.should_repair && ctx.memory.repair_loop_count >= config.maxRepairLoops) {
    ctx.log(
      "repair",
      `Stopped after ${config.maxRepairLoops} repair loops (gaps remain)`,
    );
  }

  return {
    mergedRecords,
    unkeyedRecords,
    coverage,
    loops,
    lastDiagnosis,
    repairAcquisitions,
    sourcesReports,
  };
}
/** Wraps a list of record ids together with its length. The array is stored as-is, not copied. */
function bucket(recordIds: string[]): QualityBucket {
  const count = recordIds.length;
  return { count, record_ids: recordIds };
}

/**
 * Scores `records` with `scoreRecords` and groups the resulting record ids.
 *
 * A record lands in at most one of the `complete`, `partial` and
 * `low_confidence` buckets, chosen by its `record_status`. Independently of
 * that, it is added to `needs_review` whenever its `needs_review` flag is set.
 *
 * @param unkeyedCount Reported verbatim as `unkeyed_records`.
 * @returns Bucketed ids plus the full list of scored records; `total_records`
 *          is the length of the input `records`.
 */
export function buildQualityReport(
  spec: DatasetSpec,
  records: ExtractedRecord[],
  context: ScoreRecordContext,
  unkeyedCount: number,
): QualityReport {
  const scored = scoreRecords(spec, records, context);

  const complete: string[] = [];
  const partial: string[] = [];
  const lowConfidence: string[] = [];
  const flaggedForReview: string[] = [];

  for (const { record_id, record_status, needs_review } of scored) {
    switch (record_status) {
      case "complete":
        complete.push(record_id);
        break;
      case "partial":
        partial.push(record_id);
        break;
      case "low_confidence":
        lowConfidence.push(record_id);
        break;
      default:
        // Any other status belongs to no status bucket.
        break;
    }
    if (needs_review) {
      flaggedForReview.push(record_id);
    }
  }

  return {
    total_records: records.length,
    unkeyed_records: unkeyedCount,
    complete: bucket(complete),
    partial: bucket(partial),
    low_confidence: bucket(lowConfidence),
    needs_review: bucket(flaggedForReview),
    records: scored,
  };
}
} + return map; +} + +export function agentExtractedUrls( + agentRuns: AgentRunRecord[], +): Set { + return new Set( + agentRuns + .filter((run) => run.records_extracted > 0 && !run.error) + .map((run) => normalizeUrl(run.url)), + ); +} + +const SKIPPED_STATUSES = new Set([ + "irrelevant", + "duplicate", + "blocked", + "low_value", +]); + +export interface BuildSourcesOptions { + phase: "initial" | "repair"; + fetchedPages: FetchedPage[]; + fetchedUrls: string[]; + triageResults: SourceTriageResult[]; + agentRuns: AgentRunRecord[]; + agentDeferred: { + url: string; + status: string; + reason?: "agent_budget" | "agent_disabled"; + }[]; +} + +export function buildSourcesReport( + options: BuildSourcesOptions, +): SourcesReport { + const outcomes: SourceOutcome[] = []; + const triageMap = triageByUrl(options.triageResults); + + for (const page of options.fetchedPages) { + const url = normalizeUrl(page.final_url || page.url); + const triage = triageMap.get(url); + + if (page.error) { + outcomes.push({ + url, + phase: options.phase, + outcome: "fetch_failed", + error: page.error, + triage_status: triage?.status, + triage_confidence: triage?.confidence, + source_data_confidence: triage?.source_data_confidence, + expected_yield: triage?.expected_yield, + }); + continue; + } + + if (triage && SKIPPED_STATUSES.has(triage.status)) { + outcomes.push({ + url, + phase: options.phase, + outcome: "skipped", + triage_status: triage.status, + triage_confidence: triage.confidence, + source_data_confidence: triage.source_data_confidence, + expected_yield: triage.expected_yield, + error: triage.reasoning.slice(0, 200), + }); + } + } + + for (const deferred of options.agentDeferred) { + outcomes.push({ + url: normalizeUrl(deferred.url), + phase: options.phase, + outcome: "agent_deferred", + triage_status: deferred.status, + error: deferred.reason === "agent_disabled" + ? 
"TinyFish Agent disabled for browser/form/detail follow-up" + : "Exceeded MAX_AGENT_RUNS_PER_PHASE budget", + }); + } + + for (const run of options.agentRuns) { + const url = normalizeUrl(run.url); + if (run.error || run.agent_status === "FAILED" || run.agent_status === "TIMEOUT") { + outcomes.push({ + url, + phase: options.phase, + outcome: "agent_failed", + triage_status: run.status, + error: run.error ?? run.agent_status, + records_extracted: run.records_extracted, + }); + } else if (run.records_extracted === 0) { + outcomes.push({ + url, + phase: options.phase, + outcome: "no_records", + triage_status: run.status, + records_extracted: 0, + }); + } else { + outcomes.push({ + url, + phase: options.phase, + outcome: "success", + triage_status: run.status, + records_extracted: run.records_extracted, + }); + } + } + + const outcomeUrls = new Set(outcomes.map((item) => item.url)); + for (const triage of options.triageResults) { + const url = normalizeUrl(triage.final_url); + if (outcomeUrls.has(url)) continue; + + if (triage.status === "extract_now") { + outcomes.push({ + url, + phase: options.phase, + outcome: "success", + triage_status: triage.status, + triage_confidence: triage.confidence, + source_data_confidence: triage.source_data_confidence, + expected_yield: triage.expected_yield, + }); + } else if (statusNeedsAgent(triage.status)) { + outcomes.push({ + url, + phase: options.phase, + outcome: "no_records", + triage_status: triage.status, + triage_confidence: triage.confidence, + source_data_confidence: triage.source_data_confidence, + expected_yield: triage.expected_yield, + error: "Agent path did not yield records", + }); + } + } + + const byOutcome: Record = {}; + for (const item of outcomes) { + byOutcome[item.outcome] = (byOutcome[item.outcome] ?? 
/**
 * Combines the initial-phase report with the optional repair-phase report.
 *
 * Outcomes are concatenated (initial first, then repair) and the totals,
 * per-outcome counts and failed list are recomputed from the combined list.
 *
 * @param initial - Report from the initial phase.
 * @param repair - Report from the repair phase, or `null` when no repair ran.
 * @returns A report describing both phases together.
 */
export function mergeSourcesReports(
  initial: SourcesReport,
  repair: SourcesReport | null,
): SourcesReport {
  const nonSuccessOutcomes = new Set<string>([
    "fetch_failed",
    "skipped",
    "agent_failed",
    "agent_deferred",
    "no_records",
  ]);

  const outcomes = initial.outcomes.concat(repair?.outcomes ?? []);
  const byOutcome: Record<string, number> = {};
  const failed: typeof outcomes = [];

  // One pass: tally every outcome and keep the non-success ones in order.
  for (const item of outcomes) {
    byOutcome[item.outcome] = (byOutcome[item.outcome] ?? 0) + 1;
    if (nonSuccessOutcomes.has(item.outcome)) {
      failed.push(item);
    }
  }

  return {
    total: outcomes.length,
    failed,
    by_outcome: byOutcome,
    outcomes,
  };
}
/**
 * Averages per-field confidences into a single record-level score.
 *
 * Columns without an entry in `fieldConfidences` are left out of the mean.
 *
 * @param spec - Dataset specification whose columns are considered.
 * @param fieldConfidences - Confidence per column name.
 * @param requiredOnly - When true (default) only required columns count.
 * @returns The mean rounded to three decimals, or 0 when nothing was scored.
 */
export function aggregateRecordConfidence(
  spec: DatasetSpec,
  fieldConfidences: Record<string, number>,
  requiredOnly = true,
): number {
  let total = 0;
  let counted = 0;

  for (const col of spec.columns) {
    if (requiredOnly && !col.required) continue;
    const score = fieldConfidences[col.name];
    if (score === undefined) continue;
    total += score;
    counted += 1;
  }

  if (counted === 0) return 0;
  return Math.round((total / counted) * 1000) / 1000;
}
/**
 * Returns the lowest `source_data_confidence` among the record's source URLs
 * that have a triage entry, or 0.65 when none of them has one.
 *
 * URLs are looked up exactly as stored on the record.
 * NOTE(review): confirm the map keys use the same URL form as `source_urls`.
 */
function minSourceConfidence(
  record: ExtractedRecord,
  triageByUrl: Map<string, SourceTriageResult>,
): number {
  let lowest = Infinity;
  let found = false;

  for (const url of record.source_urls) {
    const confidence = triageByUrl.get(url)?.source_data_confidence;
    if (confidence === undefined) continue;
    found = true;
    lowest = Math.min(lowest, confidence);
  }

  return found ? lowest : 0.65;
}
0.85; + const fieldConfidences = computeFieldConfidences(spec, record, context); + + const requiredFieldConfidence = aggregateRecordConfidence( + spec, + fieldConfidences, + true, + ); + const legacyBlend = Math.min( + 1, + Math.max( + 0, + completenessPct * 0.35 + + sourceConfidence * 0.25 + + extractionConfidence * 0.25 + + evidenceRatio * 0.15, + ), + ); + const confidenceScore = + requiredColumns.length > 0 && Object.keys(fieldConfidences).length > 0 + ? requiredFieldConfidence + : legacyBlend; + + const reviewReasons: string[] = []; + if (missingRequired.length > 0) { + reviewReasons.push( + `missing required fields: ${missingRequired.join(", ")}`, + ); + } + if (fieldsWithoutEvidence.length > 0) { + reviewReasons.push( + `fields without evidence: ${fieldsWithoutEvidence.join(", ")}`, + ); + } + if (sourceConfidence < config.qualitySourceConfidenceThreshold) { + reviewReasons.push( + `low source data confidence (${sourceConfidence.toFixed(2)})`, + ); + } + if (extractionConfidence < config.qualityExtractionConfidenceThreshold) { + reviewReasons.push( + `low extraction confidence (${extractionConfidence.toFixed(2)})`, + ); + } + + const fromAgent = record.source_urls.some((url) => + context.agentExtractedUrls.has(url), + ); + if (fromAgent && extractionConfidence < 0.8) { + reviewReasons.push("browser agent extraction — verify manually"); + } + + let recordStatus: RecordStatus; + if (missingRequired.length > 0) { + recordStatus = "partial"; + } else if ( + confidenceScore < config.qualityLowConfidenceThreshold || + fieldsWithoutEvidence.length > 0 + ) { + recordStatus = "low_confidence"; + } else { + recordStatus = "complete"; + } + + const needsReview = + recordStatus === "partial" || + recordStatus === "low_confidence" || + confidenceScore < config.qualityReviewThreshold; + + return { + record_id: recordId, + record_status: recordStatus, + needs_review: needsReview, + completeness_pct: Math.round(completenessPct * 1000) / 1000, + confidence_score: 
/**
 * Limits concurrent work per domain (e.g. max 2 fetches on yelp.com at once).
 *
 * Slots are handed out in arrival order per domain. A limit that is zero,
 * negative or NaN is treated as 1 so that callers are serialized instead of
 * waiting forever for a slot that can never open.
 */
export class DomainThrottle {
  /** Number of slots currently held, keyed by domain. */
  private readonly active = new Map<string, number>();
  /** Pending acquire attempts, keyed by domain, in arrival order. */
  private readonly waiters = new Map<string, Array<() => void>>();
  /** Effective per-domain limit; always at least 1. */
  private readonly maxPerDomain: number;

  /**
   * @param maxPerDomain - Maximum number of concurrent holders per domain.
   */
  constructor(maxPerDomain: number) {
    this.maxPerDomain = Number.isNaN(maxPerDomain)
      ? 1
      : Math.max(1, maxPerDomain);
  }

  /**
   * Waits for a free slot on `domain` and takes it.
   *
   * @param domain - Domain to throttle on; an empty string is not throttled.
   * @returns A release function. Calling it more than once has no extra effect.
   */
  async acquire(domain: string): Promise<() => void> {
    if (!domain) {
      return () => undefined;
    }

    await new Promise<void>((resolve) => {
      const tryAcquire = (): void => {
        const count = this.active.get(domain) ?? 0;
        if (count < this.maxPerDomain) {
          this.active.set(domain, count + 1);
          resolve();
          return;
        }
        // No slot available: park this attempt until a holder releases.
        const queue = this.waiters.get(domain) ?? [];
        queue.push(tryAcquire);
        this.waiters.set(domain, queue);
      };
      tryAcquire();
    });

    let released = false;
    return () => {
      if (released) return;
      released = true;

      const count = (this.active.get(domain) ?? 1) - 1;
      if (count <= 0) {
        this.active.delete(domain);
      } else {
        this.active.set(domain, count);
      }

      const queue = this.waiters.get(domain);
      const next = queue?.shift();
      // Drop drained queues so the map does not keep one entry per domain forever.
      if (queue && queue.length === 0) {
        this.waiters.delete(domain);
      }
      next?.();
    };
  }

  /**
   * Runs `fn` while holding one slot on each distinct, non-empty domain.
   *
   * Domains are acquired in sorted order so that two callers needing the same
   * domains request them in the same sequence. All slots are released when
   * `fn` settles, including when it throws.
   *
   * @param domains - Domains to hold; duplicates and empty strings are ignored.
   * @param fn - Work to run while the slots are held.
   * @returns Whatever `fn` resolves to.
   */
  async withDomains<T>(domains: string[], fn: () => Promise<T>): Promise<T> {
    const unique = [...new Set(domains.filter(Boolean))].sort();
    const releases: Array<() => void> = [];

    try {
      for (const domain of unique) {
        releases.push(await this.acquire(domain));
      }
      return await fn();
    } finally {
      for (const release of releases.reverse()) {
        release();
      }
    }
  }
}
"fetch", + concurrency: config.fetchConcurrency, + rateLimiter: new RateLimiter(config.tinyfishFetchRpm, 60_000), + domainThrottle: getSharedDomainThrottle(), + ...defaultRetry, + }); +} + +export function createTriageQueue(): TaskQueue { + return new TaskQueue({ + name: "triage", + concurrency: config.triageConcurrency, + rateLimiter: getOpenRouterLimiter(), + ...defaultRetry, + }); +} + +export function createExtractionQueue(): TaskQueue { + return new TaskQueue({ + name: "extract", + concurrency: config.extractionConcurrency, + rateLimiter: getOpenRouterLimiter(), + ...defaultRetry, + }); +} + +export function createAgentQueue(): TaskQueue { + return new TaskQueue({ + name: "agent", + concurrency: config.agentConcurrency, + rateLimiter: new RateLimiter(config.tinyfishAgentRpm, 60_000), + domainThrottle: getSharedDomainThrottle(), + ...defaultRetry, + }); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts b/backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts new file mode 100644 index 0000000..a3c46af --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts @@ -0,0 +1,41 @@ +import { sleep } from "./retry.js"; + +/** + * Token-bucket style limiter: at most `maxRequests` starts per `intervalMs`. 
/** Message fragments that mark a failure as transient. */
const RETRYABLE_MESSAGE_PATTERN =
  /429|502|503|504|timeout|timed out|ECONNRESET|ETIMEDOUT|rate limit|temporarily unavailable/i;

/**
 * Produces searchable text for an arbitrary thrown value without ever throwing.
 *
 * `JSON.stringify` throws on circular structures and BigInt values; in that
 * case the value's own `message` (when it is a string) or its type tag is used.
 */
function describeThrownValue(error: unknown): string {
  if (error instanceof Error) return error.message;
  if (typeof error === "string") return error;
  try {
    // JSON.stringify yields undefined for undefined/functions/symbols.
    return JSON.stringify(error) ?? "undefined";
  } catch {
    const message = (error as { message?: unknown } | null | undefined)?.message;
    return typeof message === "string"
      ? message
      : Object.prototype.toString.call(error);
  }
}

/**
 * Decides whether a failure looks transient and is worth retrying.
 *
 * A value is retryable when it carries a numeric `status` of 429, 502, 503 or
 * 504, or when its textual form mentions one of those codes, a timeout, a
 * connection reset, or rate limiting.
 *
 * @param error - Any thrown value; it does not have to be an `Error`.
 * @returns `true` when the failure should be retried. Never throws.
 */
export function isRetryableError(error: unknown): boolean {
  if (typeof error === "object" && error !== null && "status" in error) {
    const status = (error as { status: unknown }).status;
    if (status === 429 || status === 502 || status === 503 || status === 504) {
      return true;
    }
  }

  return RETRYABLE_MESSAGE_PATTERN.test(describeThrownValue(error));
}
/**
 * Tells whether a column name suggests that the column holds a URL.
 *
 * Matching is case-insensitive. A name qualifies when it contains "url"
 * anywhere, or when it is exactly "website"/"homepage" or ends with
 * "_website"/"_homepage".
 *
 * NOTE(review): the "url" test is a plain substring match, so names such as
 * `hourly_rate` also qualify.
 *
 * @param name - Column name to inspect.
 * @returns `true` when the name looks URL-like.
 */
export function isUrlLikeColumnName(name: string): boolean {
  const normalized = name.toLowerCase();

  // Containing "url" already covers the exact name "url" and the "_url" suffix.
  if (normalized.includes("url")) {
    return true;
  }

  return ["website", "homepage"].some(
    (word) => normalized === word || normalized.endsWith(`_${word}`),
  );
}
(/\/(?:financial-info|financial-reports|annual-reports)\/(?:default\.aspx)?$/.test(normalized)) { + score -= 2; + } + return score; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts b/backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts new file mode 100644 index 0000000..e857630 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts @@ -0,0 +1,90 @@ +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { workflowMemorySchema, type WorkflowMemory } from "../memory/types.js"; +import { + datasetSpecSchema, + extractedRecordSchema, + runReportSchema, + type DatasetSpec, + type ExtractedRecord, + type RunReport, +} from "../models/schemas.js"; + +export interface LoadedRun { + runId: string; + root: string; + spec: DatasetSpec; + report: RunReport; + records: ExtractedRecord[]; + memory: WorkflowMemory | null; +} + +export function runRoot(baseDir: string, runId: string): string { + return join(baseDir, runId); +} + +export async function loadRunForRefresh( + baseDir: string, + runId: string, +): Promise { + const root = runRoot(baseDir, runId); + const spec = datasetSpecSchema.parse( + JSON.parse(await readFile(join(root, "dataset_spec.json"), "utf8")), + ); + const report = runReportSchema.parse( + JSON.parse(await readFile(join(root, "run_report.json"), "utf8")), + ); + + let memory: WorkflowMemory | null = null; + try { + memory = workflowMemorySchema.parse( + JSON.parse(await readFile(join(root, "workflow_memory.json"), "utf8")), + ); + } catch { + memory = null; + } + + const records = await loadRecordsFromEvidence(join(root, "evidence.jsonl")); + const fallback = + records.length > 0 + ? 
/**
 * Reads a JSONL evidence file and validates every non-blank line as an
 * extracted record.
 *
 * This is best-effort: if the file cannot be read, or any line fails to parse
 * or validate, the whole call resolves to an empty array.
 *
 * @param path - Path of the `.jsonl` file to read.
 * @returns The validated records in file order, or `[]` on any failure.
 */
export async function loadRecordsFromEvidence(
  path: string,
): Promise<ExtractedRecord[]> {
  try {
    const raw = await readFile(path, "utf8");
    return raw
      .split("\n")
      .filter((line) => line.trim().length > 0)
      .map((line) => {
        const { row, evidence, source_urls, extraction_confidence } =
          JSON.parse(line) as {
            row: ExtractedRecord["row"];
            evidence: ExtractedRecord["evidence"];
            source_urls: string[];
            extraction_confidence?: number;
          };
        // Only the known fields are forwarded; absent lists default to empty.
        return extractedRecordSchema.parse({
          row,
          evidence: evidence ?? [],
          source_urls: source_urls ?? [],
          extraction_confidence,
        });
      });
  } catch {
    return [];
  }
}
*/ + repairResultsPath: string; + repairEvidencePath: string; + reportPath: string; +} + +export async function createRunStore( + baseDir: string, + runId: string, +): Promise { + const root = join(baseDir, runId); + const pagesDir = join(root, "pages"); + await mkdir(pagesDir, { recursive: true }); + + return { + runId, + root, + pagesDir, + specPath: join(root, "dataset_spec.json"), + candidatesPath: join(root, "source_candidates.json"), + resultsPath: join(root, "results.csv"), + resultsFullPath: join(root, "results_full.csv"), + evidencePath: join(root, "evidence.jsonl"), + evidenceFullPath: join(root, "evidence_full.jsonl"), + initResultsPath: join(root, "init_results.csv"), + initEvidencePath: join(root, "init_evidence.jsonl"), + repairResultsPath: join(root, "repair_results.csv"), + repairEvidencePath: join(root, "repair_evidence.jsonl"), + reportPath: join(root, "run_report.json"), + }; +} + +export async function saveJson(path: string, data: unknown): Promise { + await writeFile(path, `${JSON.stringify(data, null, 2)}\n`, "utf8"); +} + +export async function saveDatasetSpec( + paths: RunPaths, + spec: DatasetSpec, +): Promise { + await saveJson(paths.specPath, spec); +} + +export async function saveSourceCandidates( + paths: RunPaths, + candidates: SourceCandidate[], +): Promise { + await saveJson(paths.candidatesPath, candidates); +} + +export async function saveFetchedPage( + paths: RunPaths, + page: FetchedPage, + index: number, +): Promise { + const slug = String(index).padStart(3, "0"); + const metaPath = join(paths.pagesDir, `${slug}.meta.json`); + const textPath = join(paths.pagesDir, `${slug}.md`); + + await saveJson(metaPath, { + url: page.url, + final_url: page.final_url, + title: page.title, + description: page.description, + error: page.error, + }); + await writeFile(textPath, page.text || "", "utf8"); +} + +export async function saveRunReport( + paths: RunPaths, + report: RunReport, +): Promise { + await saveJson(paths.reportPath, report); +} 
/**
 * Maps `items` through an async function with a bounded number of calls in
 * flight, returning results in input order.
 *
 * A fixed pool of workers pulls the next unclaimed index until the list is
 * exhausted. The pool always has at least one worker: a `concurrency` that is
 * zero, negative, fractional below 1 or NaN is treated as 1, so every item is
 * processed instead of the call resolving to an array of unfilled slots.
 *
 * @param items - Values to process.
 * @param concurrency - Maximum number of `fn` calls running at once.
 * @param fn - Async mapper; receives the item and its index.
 * @returns Results positioned at the index of their input item.
 * @throws Whatever `fn` rejects with; the first rejection rejects the call.
 */
export async function mapWithConcurrency<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  if (items.length === 0) return [];

  const results = new Array<R>(items.length);
  // `|| 1` covers both 0 and NaN after flooring.
  const requested = Math.floor(concurrency) || 1;
  const workerCount = Math.min(items.length, Math.max(1, requested));
  let nextIndex = 0;

  const worker = async (): Promise<void> => {
    while (nextIndex < items.length) {
      // Claim the index before awaiting so no two workers take the same item.
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await fn(items[index]!, index);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
-- `POST /populate` — protected. Accepts a `DatasetContext` (datasetId, name, description, columns). Triggers the populate workflow which clears existing rows, then uses an AI agent to search the web and insert real data. +- `POST /populate` — protected. Accepts a `DatasetContext` (datasetId, name, description, columns). Runs the self-healing populate layer, validates the active/candidate recipe output, then atomically replaces rows only after validation passes. To add a new protected route, register it inside the scoped plugin in `src/index.ts` that has `requireAuth` as a preHandler. Use `req.auth.userId` for the authenticated user — never trust user-supplied IDs in the body. @@ -19,13 +19,27 @@ To add a new protected route, register it inside the scoped plugin in `src/index The pipeline is a pure function (`inferSchema(prompt) → DatasetSchema`). It is called by both Fastify (for the HTTP API) and Mastra (for workflow orchestration). +## Populate And Self-Healing + +`src/pipeline/populate-runtime.ts` — direct callable runtime around the Mastra populate agent. It uses in-memory row capture and returns rows, validation issues, usage, metrics, and debug artifacts without writing Convex rows. + +`src/pipeline/populate-self-healing.ts` — recipe runtime/service/store layer. It reruns the active recipe, generates the first recipe, repairs failed active recipes, validates candidate output, promotes healthy candidates, and rejects unsafe candidates. + +`src/pipeline/populate-self-healing-runner.ts` — shared route/CLI runner. HTTP populate uses a durable filesystem store and `ConvexPopulateDatasetRowWriter`; benchmark/dry-run paths can inject an in-memory store and skip row commits. + +`npm --silent run populate:self-heal -- --dataset-id ` — operator/cron-friendly dry run. It loads live dataset context with system Convex auth, emits one JSON summary to stdout, and does not persist recipe history or commit rows. 
+ +`npm --silent run populate:self-heal -- --dataset-id <dataset-id> --commit` — commits validated rows through the atomic Convex replace mutation. Requires `CONVEX_URL`, `CONVEX_SELF_HOSTED_ADMIN_KEY`, `OPENROUTER_API_KEY`, and `TINYFISH_API_KEY`. + +`npm --silent run populate:self-heal -- --context context.json` — dev harness dry run for a pasted `DatasetContext`. It uses an isolated in-memory recipe store; `--recipe-store-dir` is rejected unless `--commit` is set. + ## Mastra (Workflow Orchestration) `src/mastra/` — wraps pipelines into Mastra workflows. Runs as a separate Docker service on :4111 with `mastra dev`, which provides a Studio UI for inspecting and testing workflows. - `src/mastra/index.ts` — registers agents and workflows with the `Mastra` instance - `src/mastra/workflows/infer-schema.ts` — `inferSchemaWorkflow`, a single-step workflow wrapping `inferSchema()` -- `src/mastra/workflows/populate.ts` — `populateWorkflow`, 3-step workflow: clear rows → build prompt → run populate agent +- `src/mastra/workflows/populate.ts` — legacy Mastra workflow: clear rows → build prompt → run populate agent. HTTP `/populate` no longer uses this destructive pre-clear path. 
- `src/mastra/agents/populate.ts` — `populateAgent`, an AI agent (Claude Sonnet 4.6 via OpenRouter) with 7 tools for database CRUD and web access - `src/mastra/tools/dataset-tools.ts` — 5 Convex-backed tools: `insert_row`, `list_rows`, `get_row`, `update_row`, `delete_row` - `src/mastra/tools/web-tools.ts` — 2 TinyFish API tools: `search_web`, `fetch_page` diff --git a/backend/package-lock.json b/backend/package-lock.json index 16bad4d..ea6aa1c 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -12,6 +12,7 @@ "@fastify/cors": "^11.0.0", "@mastra/core": "^1.36.0", "@openrouter/ai-sdk-provider": "^2.9.0", + "@tiny-fish/sdk": "^0.0.8", "ai": "^6.0.0", "convex": "^1.39.1", "dotenv": "^16.4.0", @@ -2544,6 +2545,18 @@ "url": "https://github.com/sponsors/tannerlinsley" } }, + "node_modules/@tiny-fish/sdk": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/@tiny-fish/sdk/-/sdk-0.0.8.tgz", + "integrity": "sha512-GTIpIDcwYuCbtd1xcgf0JD81wbPWGY0mxiab9VepT1allNUfVvjWCKT1n8RypsrzXne39j5Ez3ILDBE4ZwlApQ==", + "dependencies": { + "p-retry": "^7.1.1", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/@types/babel__traverse": { "version": "7.28.0", "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", diff --git a/backend/package.json b/backend/package.json index 6903fbd..f7784a9 100644 --- a/backend/package.json +++ b/backend/package.json @@ -5,15 +5,18 @@ "private": true, "scripts": { "dev": "tsx watch src/index.ts", + "test": "node --import tsx --test test/*.test.ts", "build": "tsc", "start": "node dist/index.js", - "mastra:dev": "mastra dev" + "mastra:dev": "mastra dev", + "populate:self-heal": "tsx src/pipeline/populate-self-healing-cli.ts" }, "dependencies": { "@clerk/backend": "^3.4.11", "@fastify/cors": "^11.0.0", "@mastra/core": "^1.36.0", "@openrouter/ai-sdk-provider": "^2.9.0", + "@tiny-fish/sdk": "^0.0.8", "ai": "^6.0.0", "convex": "^1.39.1", "dotenv": "^16.4.0", 
diff --git a/backend/src/convex.ts b/backend/src/convex.ts index 2b7e267..ad07fcc 100644 --- a/backend/src/convex.ts +++ b/backend/src/convex.ts @@ -27,5 +27,7 @@ export const internal = anyApi; export const convex = new ConvexHttpClient(env.CONVEX_URL); if (env.CONVEX_ADMIN_KEY) { - convex.setAdminAuth(env.CONVEX_ADMIN_KEY); + (convex as unknown as { + setAdminAuth(adminKey: string): void; + }).setAdminAuth(env.CONVEX_ADMIN_KEY); } diff --git a/backend/src/env.ts b/backend/src/env.ts index cbd44cf..475994b 100644 --- a/backend/src/env.ts +++ b/backend/src/env.ts @@ -24,4 +24,10 @@ export const env = { CLERK_PUBLISHABLE_KEY: process.env.CLERK_PUBLISHABLE_KEY, OPENROUTER_API_KEY: process.env.OPENROUTER_API_KEY, + TINYFISH_API_KEY: process.env.TINYFISH_API_KEY, + + // Durable recipe manifests for the self-healing populate layer. In Docker + // dev this points at a named volume; locally it defaults under the repo. + POPULATE_RECIPE_STORE_DIR: + process.env.POPULATE_RECIPE_STORE_DIR || ".bigset/populate-recipes", }; diff --git a/backend/src/index.ts b/backend/src/index.ts index 330ade1..b73b1ae 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -1,93 +1,15 @@ -import Fastify from "fastify"; -import fastifyCors from "@fastify/cors"; - import { env } from "./env.js"; import clerkAuthPlugin, { requireAuth } from "./clerk-auth.js"; -import { inferSchema } from "./pipeline/schema-inference.js"; -import { datasetContextSchema } from "./pipeline/populate.js"; -import { populateWorkflow } from "./mastra/workflows/populate.js"; +import { ConvexPopulateDatasetRowWriter } from "./pipeline/populate-convex-writer.js"; import { convex, api } from "./convex.js"; - -const fastify = Fastify({ logger: true }); - -await fastify.register(fastifyCors, { - origin: env.CLIENT_ORIGIN, - methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"], - allowedHeaders: ["Content-Type", "Authorization", "Cookie"], - credentials: true, - maxAge: 86400, -}); - -// Make `fastify.clerk` available 
and warn on missing CLERK_SECRET_KEY. -// `requireAuth` (also exported from ./clerk-auth) is the preHandler for -// protected routes — see the example block below. -await fastify.register(clerkAuthPlugin); - -// ──────────────────────────────────────────────────────────────────────── -// Public routes -// ──────────────────────────────────────────────────────────────────────── - -fastify.get("/health", async () => ({ status: "ok" })); - -// ──────────────────────────────────────────────────────────────────────── -// Protected routes — gated by Clerk JWT verification -// ──────────────────────────────────────────────────────────────────────── - -await fastify.register(async (instance) => { - instance.addHook("preHandler", requireAuth); - - instance.post("/infer-schema", async (req, reply) => { - const body = req.body as { prompt?: string }; - if (!body?.prompt || typeof body.prompt !== "string" || !body.prompt.trim()) { - return reply.code(400).send({ error: "prompt is required" }); - } - - try { - const schema = await inferSchema(body.prompt.trim()); - return schema; - } catch (err) { - req.log.error(err, "Schema inference failed"); - return reply.code(502).send({ error: "Schema inference failed. Please try again." 
}); - } - }); - - instance.post("/populate", async (req, reply) => { - const parsed = datasetContextSchema.safeParse(req.body); - if (!parsed.success) { - return reply.code(400).send({ - error: "Invalid request", - details: parsed.error.flatten().fieldErrors, - }); - } - - try { - const dataset = await convex.query(api.datasets.get, { id: parsed.data.datasetId }); - if (!dataset) { - return reply.code(404).send({ error: "Dataset not found" }); - } - if (dataset.ownerId !== req.auth.userId) { - return reply.code(403).send({ error: "Not authorized to populate this dataset" }); - } - - const run = await populateWorkflow.createRun(); - const result = await run.start({ inputData: parsed.data }); - - req.log.info({ workflowStatus: result.status, steps: JSON.stringify(result.steps).slice(0, 2000) }, "Populate workflow completed"); - - if (result.status !== "success") { - throw new Error(`Workflow ended with status: ${result.status}`); - } - - return { success: true, result: result.result }; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - if (msg.includes("validator") || msg.includes("Invalid")) { - return reply.code(400).send({ error: "Invalid datasetId" }); - } - req.log.error(err, "Populate failed"); - return reply.code(502).send({ error: "Failed to populate dataset. Please try again." 
}); - } - }); +import { createBigSetServer } from "./server.js"; + +const fastify = await createBigSetServer({ + env, + authPlugin: clerkAuthPlugin, + authPreHandler: requireAuth, + getDatasetById: (datasetId) => convex.query(api.datasets.get, { id: datasetId }), + populateRowWriter: new ConvexPopulateDatasetRowWriter(), }); try { diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 2da84d0..3d09812 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -8,29 +8,38 @@ import { deleteRowTool, } from "../tools/dataset-tools.js"; import { searchWebTool, fetchPageTool } from "../tools/web-tools.js"; +import { populateAgentInstructions } from "../../pipeline/populate-prompt.js"; -const openrouter = createOpenRouter({ - apiKey: process.env.OPENROUTER_API_KEY!, -}); +type PopulateAgentOptions = ConstructorParameters[0]; -export const populateAgent = new Agent({ - id: "populate-agent", - name: "Dataset Populate Agent", - instructions: `You fill datasets with real data. Here's how: +const defaultPopulateTools = { + insert_row: insertRowTool, + list_rows: listRowsTool, + get_row: getRowTool, + update_row: updateRowTool, + delete_row: deleteRowTool, + search_web: searchWebTool, + fetch_page: fetchPageTool, +}; -1. Search the web for data that fits the dataset topic. -2. Fetch 1-2 pages to get details. -3. Call insert_row for each row using what you found. Don't stop until you've inserted all the rows asked for. +export function createPopulateAgent(input: { + model?: PopulateAgentOptions["model"]; + tools?: PopulateAgentOptions["tools"]; +} = {}) { + return new Agent({ + id: "populate-agent", + name: "Dataset Populate Agent", + instructions: populateAgentInstructions, + model: input.model ?? defaultPopulateModel(), + tools: input.tools ?? defaultPopulateTools, + }); +} -If you can't find enough real data, make up realistic data to fill the rest. 
Every row must be inserted with insert_row.`, - model: openrouter("anthropic/claude-sonnet-4-6"), - tools: { - insert_row: insertRowTool, - list_rows: listRowsTool, - get_row: getRowTool, - update_row: updateRowTool, - delete_row: deleteRowTool, - search_web: searchWebTool, - fetch_page: fetchPageTool, - }, -}); +export const populateAgent = createPopulateAgent(); + +function defaultPopulateModel(): PopulateAgentOptions["model"] { + const openrouter = createOpenRouter({ + apiKey: process.env.OPENROUTER_API_KEY!, + }); + return openrouter("anthropic/claude-sonnet-4-6"); +} diff --git a/backend/src/mastra/tools/web-tools.ts b/backend/src/mastra/tools/web-tools.ts index f0f112e..3e0b35a 100644 --- a/backend/src/mastra/tools/web-tools.ts +++ b/backend/src/mastra/tools/web-tools.ts @@ -26,7 +26,7 @@ export const searchWebTool = createTool({ const apiKey = process.env.TINYFISH_API_KEY; if (!apiKey) - return { error: "TINYFISH_API_KEY is not configured. Web search is unavailable — use synthetic data instead." }; + return { error: "TINYFISH_API_KEY is not configured. Web search is unavailable; insert only rows supported by available sources." }; const url = `https://api.search.tinyfish.ai?query=${encodeURIComponent(query)}`; console.log(`[search_web] Searching: "${query}"`); @@ -44,10 +44,10 @@ export const searchWebTool = createTool({ const body = await res.text(); console.error(`[search_web] API error ${res.status}:`, body.slice(0, 200)); if (res.status === 429) - return { error: "Search rate limit hit. Wait a moment, or skip web search and use synthetic data." }; + return { error: "Search rate limit hit. Wait a moment, or insert only rows supported by already available sources." }; if (res.status === 401) - return { error: "Invalid TINYFISH_API_KEY. Web search unavailable — use synthetic data." }; - return { error: `Search API returned HTTP ${res.status}. Try a different query or use synthetic data.` }; + return { error: "Invalid TINYFISH_API_KEY. 
Web search unavailable." }; + return { error: `Search API returned HTTP ${res.status}. Try a different query.` }; } const data = await res.json(); @@ -59,15 +59,15 @@ export const searchWebTool = createTool({ console.log(`[search_web] Got ${results.length} results`); if (results.length === 0) - return { results: [], error: "No results found for this query. Try a broader search or use synthetic data." }; + return { results: [], error: "No results found for this query. Try a broader search." }; return { results }; } catch (err) { clearTimeout(timeout); if (err instanceof Error && err.name === "AbortError") - return { error: "Search timed out. Skip web search and use synthetic data." }; + return { error: "Search timed out. Try a narrower query or use already available sources only." }; const msg = err instanceof Error ? err.message : String(err); console.error(`[search_web] Failed:`, msg); - return { error: `Search failed: ${msg}. Skip web search and use synthetic data.` }; + return { error: `Search failed: ${msg}. Use already available sources only.` }; } }, }); @@ -92,7 +92,7 @@ export const fetchPageTool = createTool({ const apiKey = process.env.TINYFISH_API_KEY; if (!apiKey) - return { error: "TINYFISH_API_KEY is not configured. Page fetch is unavailable — use data from search snippets instead." }; + return { error: "TINYFISH_API_KEY is not configured. Page fetch is unavailable; use source-backed search snippets only." }; console.log(`[fetch_page] Fetching: ${targetUrl}`); @@ -114,7 +114,7 @@ export const fetchPageTool = createTool({ const body = await res.text(); console.error(`[fetch_page] API error ${res.status}:`, body.slice(0, 200)); if (res.status === 429) - return { error: "Fetch rate limit hit. Use data from search snippets instead." }; + return { error: "Fetch rate limit hit. Use source-backed search snippets only." }; if (res.status === 401) return { error: "Invalid TINYFISH_API_KEY. Page fetch unavailable." 
}; return { error: `Fetch API returned HTTP ${res.status}. Try a different URL or use search snippet data.` }; diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index 03e8d3c..436079d 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -1,6 +1,7 @@ import { createStep, createWorkflow } from "@mastra/core/workflows"; import { z } from "zod"; import { datasetContextSchema } from "../../pipeline/populate.js"; +import { buildPopulatePrompt } from "../../pipeline/populate-prompt.js"; import { convex, internal } from "../../convex.js"; import { populateAgent } from "../agents/populate.js"; @@ -23,28 +24,7 @@ const buildPromptStep = createStep({ inputSchema: datasetContextSchema, outputSchema: z.object({ prompt: z.string() }), execute: async ({ inputData }) => { - const columnNames = inputData.columns.map((c) => c.name); - const columnsDesc = inputData.columns - .map( - (c) => - `- "${c.name}" (${c.type})${c.description ? `: ${c.description}` : ""}`, - ) - .join("\n"); - - const prompt = `Dataset ID: ${inputData.datasetId} -Dataset: ${inputData.datasetName} -Description: ${inputData.description} - -Columns: -${columnsDesc} - -When calling insert_row, the data object keys MUST be exactly these strings (no backticks, no extra quotes): -${JSON.stringify(columnNames)} - -Example insert_row call: -insert_row({ datasetId: "${inputData.datasetId}", data: { ${columnNames.map((n) => `"${n}": `).join(", ")} } }) - -Search the web for real data about this topic. Then call insert_row to fill in 10 rows. Use real data from your search. 
Fill in any gaps with realistic fake data.`; + const prompt = buildPopulatePrompt(inputData); console.log(`[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns)`); return { prompt }; diff --git a/backend/src/pipeline/collection-agent-runner.ts b/backend/src/pipeline/collection-agent-runner.ts new file mode 100644 index 0000000..9321a06 --- /dev/null +++ b/backend/src/pipeline/collection-agent-runner.ts @@ -0,0 +1,407 @@ +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; +import { pathToFileURL } from "node:url"; + +import type { + CollectionPopulatePipelineInput, + CollectionPopulatePipelineRunner, +} from "./populate-collection-runtime.js"; +import type { + PopulateCellValue, + PopulateRuntimeResult, +} from "./populate-runtime.js"; + +type CollectionPipelineModule = { + runPipeline(input: CollectionPipelineOptions): Promise; +}; + +interface CollectionPipelineOptions { + prompt: string; + targetRows: number; + outputDir: string; + memoryDir?: string; + enableRepair?: boolean; + enableTriage?: boolean; + enableTinyfishAgent?: boolean; + agentPollTimeoutMs?: number; + benchmark?: { + promptId?: string; + promptQuality?: string; + persona?: string; + expectedStress?: string; + requiredColumns: string[]; + }; + onLog?: (stage: string, message: string) => void; +} + +interface CollectionPipelineResult { + report: { + errors?: string[]; + dataset_spec?: CollectionDatasetSpec; + stats?: CollectionPhaseStats; + initial?: CollectionPhaseStats; + repair?: { + stats?: CollectionPhaseStats; + }; + quality?: { + records?: CollectionRecordQuality[]; + }; + sources?: CollectionSourcesReport; + llm_usage?: { + prompt_tokens?: number; + completion_tokens?: number; + total_tokens?: number; + }; + }; + records?: CollectionExtractedRecord[]; + visualizationRecords?: CollectionExtractedRecord[]; + llmUsage?: { + promptTokens?: number; + completionTokens?: number; + 
totalTokens?: number; + }; +} + +interface CollectionDatasetSpec { + columns?: Array<{ name: string }>; + dedupe_keys?: string[]; +} + +interface CollectionPhaseStats { + search_queries_executed?: number; + pages_fetched?: number; + triage?: { + agent_dispatched?: number; + agent_succeeded?: number; + agent_failed?: number; + }; +} + +interface CollectionExtractedRecord { + row?: Record; + source_urls?: string[]; + evidence?: Array<{ + field?: string; + url?: string; + quote?: string; + }>; +} + +interface CollectionRecordQuality { + record_id?: string; + needs_review?: boolean; +} + +interface CollectionSourcesReport { + outcomes?: CollectionSourceOutcome[]; +} + +interface CollectionSourceOutcome { + outcome?: string; + triage_status?: string; +} + +const AGENT_REQUIRED_TRIAGE_STATUSES = new Set([ + "requires_navigation", + "requires_form_submission", + "requires_detail_page_followup", +]); + +const DEFAULT_COLLECTION_AGENT_POLL_TIMEOUT_MS = 480_000; + +export const runCollectionPopulatePipeline: CollectionPopulatePipelineRunner = + async (input) => { + const outputDir = await mkdtemp(join(tmpdir(), "bigset-collection-")); + const enableTinyfishAgent = boolEnv("COLLECTION_AGENT_ENABLE_AGENT", false); + const pipeline = await loadCollectionPipelineModule(); + const result = await pipeline.runPipeline({ + prompt: input.prompt, + targetRows: input.targetRows, + outputDir, + memoryDir: join(outputDir, "memory"), + enableRepair: boolEnv("COLLECTION_AGENT_ENABLE_REPAIR", false), + enableTriage: boolEnv("COLLECTION_AGENT_ENABLE_TRIAGE", true), + enableTinyfishAgent, + agentPollTimeoutMs: enableTinyfishAgent + ? 
collectionAgentPollTimeoutMs() + : undefined, + benchmark: benchmarkContextFromInput(input), + onLog: (stage, message) => { + console.error(`[collection:${stage}] ${message}`); + }, + }); + + return collectionPipelineResultToPopulateRuntimeResult({ + pipeline: result, + requiredColumns: input.requiredColumns, + enableTinyfishAgent, + }); + }; + +async function loadCollectionPipelineModule(): Promise { + const moduleSpecifier = process.env.COLLECTION_AGENT_PIPELINE_MODULE; + if (!moduleSpecifier) { + throw new Error( + "COLLECTION_AGENT_PIPELINE_MODULE must point to the collection pipeline module exporting runPipeline(options)." + ); + } + const moduleUrl = moduleSpecifier.startsWith(".") || moduleSpecifier.startsWith("/") + ? pathToFileURL(resolve(moduleSpecifier)).href + : moduleSpecifier; + const loadedModule = await import(moduleUrl); + if (typeof loadedModule.runPipeline !== "function") { + throw new Error( + `${moduleSpecifier} must export runPipeline(options).` + ); + } + return loadedModule as CollectionPipelineModule; +} + +function benchmarkContextFromInput(input: CollectionPopulatePipelineInput) { + if (input.requiredColumns.length === 0) { + return undefined; + } + return { + promptId: input.promptId, + promptQuality: input.promptQuality, + persona: input.persona, + expectedStress: input.expectedStress, + requiredColumns: input.requiredColumns, + }; +} + +function collectionPipelineResultToPopulateRuntimeResult(input: { + pipeline: CollectionPipelineResult; + requiredColumns: string[]; + enableTinyfishAgent: boolean; +}): PopulateRuntimeResult { + const records = selectOutputRecords(input.pipeline); + const qualityById = qualityByRecordId(input.pipeline.report.quality?.records); + const rows = records.map((record) => + collectionRecordToPopulateRow({ + record, + spec: input.pipeline.report.dataset_spec, + requiredColumns: input.requiredColumns, + qualityById, + }) + ); + const capabilityDiagnostics = capabilityDiagnosticsFromReport({ + report: 
input.pipeline.report, + enableTinyfishAgent: input.enableTinyfishAgent, + }); + + return { + rows, + validationIssues: [ + ...(input.pipeline.report.errors ?? []), + ...capabilityDiagnostics, + ...(rows.length === 0 ? ["No rows returned from collection pipeline."] : []), + ], + usage: usageFromPipeline(input.pipeline), + metrics: metricsFromReport(input.pipeline.report), + }; +} + +function capabilityDiagnosticsFromReport(input: { + report: CollectionPipelineResult["report"]; + enableTinyfishAgent: boolean; +}): string[] { + if (input.enableTinyfishAgent) { + return []; + } + const agentRequiredOutcomes = (input.report.sources?.outcomes ?? []).filter( + isAgentRequiredSourceOutcome + ); + if (agentRequiredOutcomes.length === 0) { + return []; + } + + const statusCounts = new Map(); + for (const outcome of agentRequiredOutcomes) { + const status = outcome.triage_status as string; + statusCounts.set(status, (statusCounts.get(status) ?? 0) + 1); + } + const statusSummary = Array.from(statusCounts.entries()) + .map(([status, count]) => `${status}=${count}`) + .join(", "); + + return [ + `Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for ${agentRequiredOutcomes.length} page(s) (${statusSummary}). Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation.`, + ]; +} + +function isAgentRequiredSourceOutcome(outcome: CollectionSourceOutcome): boolean { + return ( + typeof outcome.triage_status === "string" && + AGENT_REQUIRED_TRIAGE_STATUSES.has(outcome.triage_status) && + outcome.outcome !== "success" + ); +} + +function selectOutputRecords( + pipeline: CollectionPipelineResult +): CollectionExtractedRecord[] { + if (pipeline.visualizationRecords && pipeline.visualizationRecords.length > 0) { + return pipeline.visualizationRecords; + } + return pipeline.records ?? 
[]; +} + +function collectionRecordToPopulateRow(input: { + record: CollectionExtractedRecord; + spec?: CollectionDatasetSpec; + requiredColumns: string[]; + qualityById: Map; +}) { + const cells: Record = { + ...(input.record.row ?? {}), + }; + for (const columnName of input.requiredColumns) { + if (cells[columnName] === undefined) { + cells[columnName] = null; + } + } + + const sourceUrls = uniqueHttpUrls(input.record.source_urls ?? []); + const evidence = (input.record.evidence ?? []) + .map((item) => ({ + columnName: item.field ?? "", + sourceUrl: item.url || sourceUrls[0] || "", + quote: item.quote ?? "", + })) + .filter((item) => item.columnName && item.quote); + const recordId = canonicalRecordId(input.record, input.spec); + const quality = recordId ? input.qualityById.get(recordId) : undefined; + + return { + cells, + sourceUrls, + evidence, + needsReview: quality?.needs_review ?? false, + }; +} + +function qualityByRecordId( + records: CollectionRecordQuality[] = [] +): Map { + return new Map( + records + .filter((record) => record.record_id) + .map((record) => [record.record_id as string, record]) + ); +} + +function canonicalRecordId( + record: CollectionExtractedRecord, + spec?: CollectionDatasetSpec +): string | undefined { + const primaryKey = + spec?.dedupe_keys?.[0] ?? + spec?.columns?.find((column) => + /(name|title|company|organization|entity)/i.test(column.name) + )?.name ?? + spec?.columns?.[0]?.name; + if (!primaryKey) { + return undefined; + } + const value = normalizePrimaryKey(record.row?.[primaryKey]); + return value ? `pk:${value}` : undefined; +} + +function usageFromPipeline(pipeline: CollectionPipelineResult) { + const scopedUsage = pipeline.llmUsage; + if (scopedUsage?.totalTokens) { + return { + promptTokens: scopedUsage.promptTokens ?? 0, + completionTokens: scopedUsage.completionTokens ?? 0, + totalTokens: scopedUsage.totalTokens ?? 
0, + }; + } + const reportUsage = pipeline.report.llm_usage; + return { + promptTokens: reportUsage?.prompt_tokens ?? 0, + completionTokens: reportUsage?.completion_tokens ?? 0, + totalTokens: reportUsage?.total_tokens ?? 0, + }; +} + +function metricsFromReport(report: CollectionPipelineResult["report"]) { + const stats = report.stats ?? {}; + const initialTriage = report.initial?.triage ?? {}; + const repairTriage = report.repair?.stats?.triage ?? {}; + const agentDispatched = + numberValue(initialTriage.agent_dispatched) + + numberValue(repairTriage.agent_dispatched); + + return { + searchCalls: numberValue(stats.search_queries_executed), + fetchCalls: numberValue(stats.pages_fetched), + browserCalls: agentDispatched, + agentRuns: agentDispatched, + agentSteps: + numberValue(initialTriage.agent_succeeded) + + numberValue(initialTriage.agent_failed) + + numberValue(repairTriage.agent_succeeded) + + numberValue(repairTriage.agent_failed), + }; +} + +function uniqueHttpUrls(urls: string[]): string[] { + return Array.from( + new Set( + urls.filter((url) => typeof url === "string" && /^https?:\/\//i.test(url)) + ) + ); +} + +function normalizePrimaryKey(value: unknown): string { + if (value === null || value === undefined) { + return ""; + } + return String(value).trim().toLowerCase().replace(/\s+/g, " "); +} + +function numberValue(value: unknown): number { + return typeof value === "number" && Number.isFinite(value) ? 
value : 0; +} + +function boolEnv(name: string, fallback: boolean): boolean { + const raw = process.env[name]; + if (raw === undefined || raw === "") { + return fallback; + } + return ["1", "true", "yes", "on"].includes(raw.toLowerCase()); +} + +function intEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw === "") { + return fallback; + } + const value = Number.parseInt(raw, 10); + if (!Number.isFinite(value) || value <= 0) { + throw new Error(`Invalid ${name}: expected positive integer, got "${raw}"`); + } + return value; +} + +function optionalIntEnv(name: string): number | undefined { + const raw = process.env[name]; + if (raw === undefined || raw === "") { + return undefined; + } + const value = Number.parseInt(raw, 10); + if (!Number.isFinite(value) || value <= 0) { + throw new Error(`Invalid ${name}: expected positive integer, got "${raw}"`); + } + return value; +} + +function collectionAgentPollTimeoutMs(): number { + return optionalIntEnv("AGENT_POLL_TIMEOUT_MS") ?? 
+ intEnv( + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + DEFAULT_COLLECTION_AGENT_POLL_TIMEOUT_MS + ); +} diff --git a/backend/src/pipeline/populate-collection-runtime.ts b/backend/src/pipeline/populate-collection-runtime.ts new file mode 100644 index 0000000..455fafb --- /dev/null +++ b/backend/src/pipeline/populate-collection-runtime.ts @@ -0,0 +1,135 @@ +import type { DatasetContext, PopulateColumn } from "./populate.js"; +import type { PopulateRuntimeResult } from "./populate-runtime.js"; +import { + emptyPopulateRuntimeResult, + populateRecipeRunResultFromRuntimeResult, + type PopulateRecipe, + type PopulateRecipeRunResult, + type PopulateRecipeRuntime, +} from "./populate-self-healing.js"; + +export interface CollectionPopulatePipelineColumn { + name: string; + type: PopulateColumn["type"]; + description?: string; +} + +export interface CollectionPopulateBenchmarkMetadata { + promptId?: string; + promptQuality?: string; + persona?: string; + expectedStress?: string; +} + +export interface CollectionPopulatePipelineInput + extends CollectionPopulateBenchmarkMetadata { + datasetId: string; + datasetName: string; + description: string; + columns: CollectionPopulatePipelineColumn[]; + requiredColumns: string[]; + prompt: string; + recipeInstructions: string; + targetRows: number; +} + +export type CollectionPopulatePipelineRunner = ( + input: CollectionPopulatePipelineInput +) => Promise; + +export interface CollectionPopulateRecipeRuntimeOptions { + runPipeline: CollectionPopulatePipelineRunner; + targetRows?: number; + benchmarkMetadata?: CollectionPopulateBenchmarkMetadata; +} + +export class CollectionPopulateRecipeRuntime implements PopulateRecipeRuntime { + constructor(private readonly input: CollectionPopulateRecipeRuntimeOptions) {} + + async runRecipe(input: { + recipe: PopulateRecipe; + context: DatasetContext; + }): Promise { + const startedAtMs = Date.now(); + const startedAt = new Date(startedAtMs).toISOString(); + let result: PopulateRuntimeResult; + let 
failureMessage: string | undefined; + + try { + result = await this.input.runPipeline( + collectionPipelineInputFromRecipe({ + recipe: input.recipe, + context: input.context, + targetRows: this.input.targetRows ?? 10, + benchmarkMetadata: this.input.benchmarkMetadata, + }) + ); + } catch (error) { + failureMessage = error instanceof Error ? error.message : String(error); + result = emptyPopulateRuntimeResult([failureMessage]); + } + + return populateRecipeRunResultFromRuntimeResult({ + recipe: input.recipe, + context: input.context, + result, + failureMessage, + startedAt, + startedAtMs, + }); + } +} + +export function collectionPipelineInputFromRecipe(input: { + recipe: PopulateRecipe; + context: DatasetContext; + targetRows: number; + benchmarkMetadata?: CollectionPopulateBenchmarkMetadata; +}): CollectionPopulatePipelineInput { + const recipeInstructions = input.recipe.runtimeInstructions.trim(); + return { + ...input.benchmarkMetadata, + datasetId: input.context.datasetId, + datasetName: input.context.datasetName, + description: input.context.description, + columns: input.context.columns.map((column) => ({ + name: column.name, + type: column.type, + description: column.description, + })), + requiredColumns: input.context.columns.map((column) => column.name), + prompt: buildCollectionPopulatePrompt({ + context: input.context, + recipeInstructions, + }), + recipeInstructions, + targetRows: input.targetRows, + }; +} + +function buildCollectionPopulatePrompt(input: { + context: DatasetContext; + recipeInstructions: string; +}): string { + const columnLines = input.context.columns.map((column) => { + const description = column.description ? 
` - ${column.description}` : ""; + return `- ${column.name} (${column.type})${description}`; + }); + const parts = [ + `Dataset: ${input.context.datasetName}`, + `Task: ${input.context.description}`, + "", + "Requested columns:", + ...columnLines, + ]; + + if (input.recipeInstructions) { + parts.push( + "", + "Durable recipe instructions:", + input.recipeInstructions + ); + } + + return parts.join("\n"); +} diff --git a/backend/src/pipeline/populate-convex-writer.ts b/backend/src/pipeline/populate-convex-writer.ts new file mode 100644 index 0000000..78335a0 --- /dev/null +++ b/backend/src/pipeline/populate-convex-writer.ts @@ -0,0 +1,71 @@ +import { env } from "../env.js"; +import { convex, internal } from "../convex.js"; +import type { + PopulateDatasetRowWriter, + PopulateDatasetWriteResult, +} from "./populate-self-healing-runner.js"; + +interface ConvexMutationClient { + mutation(functionReference: unknown, args: unknown): Promise; +} + +export class ConvexPopulateDatasetRowWriter implements PopulateDatasetRowWriter { + constructor( + private readonly input: { + convexClient?: ConvexMutationClient; + internalApi?: typeof internal; + } = {} + ) {} + + async replaceRows(input: Parameters[0]): + Promise { + if (!env.CONVEX_ADMIN_KEY) { + throw new Error( + "CONVEX_SELF_HOSTED_ADMIN_KEY is required to commit self-healed populate rows." + ); + } + + const convexClient = this.input.convexClient ?? convex; + const internalApi = this.input.internalApi ?? 
internal; + const replacement = await convexClient.mutation( + internalApi.datasetRows.replaceByDataset, + { + datasetId: input.datasetId, + rows: input.rows.map((row) => ({ + data: row.cells, + sources: row.sourceUrls, + })), + } + ); + + return normalizeReplacementResult(replacement, input.rows.length); + } +} + +function normalizeReplacementResult( + value: unknown, + fallbackInsertedRowCount: number +): PopulateDatasetWriteResult { + if ( + typeof value === "object" && + value !== null && + "insertedRowCount" in value + ) { + const replacement = value as { + clearedRowCount?: unknown; + insertedRowCount?: unknown; + }; + return { + clearedRowCount: typeof replacement.clearedRowCount === "number" + ? replacement.clearedRowCount + : undefined, + insertedRowCount: typeof replacement.insertedRowCount === "number" + ? replacement.insertedRowCount + : fallbackInsertedRowCount, + }; + } + + return { + insertedRowCount: fallbackInsertedRowCount, + }; +} diff --git a/backend/src/pipeline/populate-dataset-context-loader.ts b/backend/src/pipeline/populate-dataset-context-loader.ts new file mode 100644 index 0000000..f306e7a --- /dev/null +++ b/backend/src/pipeline/populate-dataset-context-loader.ts @@ -0,0 +1,56 @@ +import { ConvexHttpClient } from "convex/browser"; +import { anyApi } from "convex/server"; + +import { + datasetContextSchema, + type DatasetContext, +} from "./populate.js"; + +export interface PopulateDatasetContextQueryClient { + query(functionReference: unknown, args: unknown): Promise; +} + +export class ConvexPopulateDatasetContextLoader { + constructor( + private readonly input: { + convexClient: PopulateDatasetContextQueryClient; + internalApi?: typeof anyApi; + } + ) {} + + async loadContext(datasetId: string): Promise { + const internalApi = this.input.internalApi ?? 
anyApi; + const dataset = await this.input.convexClient.query( + internalApi.datasets.getForSystemPopulate, + { id: datasetId } + ); + + if (!dataset || typeof dataset !== "object") { + throw new Error(`Dataset ${datasetId} not found.`); + } + const record = dataset as { + name?: unknown; + description?: unknown; + columns?: unknown; + }; + + return datasetContextSchema.parse({ + datasetId, + datasetName: record.name, + description: record.description, + columns: record.columns, + }); + } +} + +export function createConvexPopulateDatasetContextLoader(input: { + convexUrl: string; + convexAdminKey: string; +}): ConvexPopulateDatasetContextLoader { + const convexClient = new ConvexHttpClient(input.convexUrl); + (convexClient as unknown as { + setAdminAuth(adminKey: string): void; + }).setAdminAuth(input.convexAdminKey); + + return new ConvexPopulateDatasetContextLoader({ convexClient }); +} diff --git a/backend/src/pipeline/populate-prompt.ts b/backend/src/pipeline/populate-prompt.ts new file mode 100644 index 0000000..7248cbb --- /dev/null +++ b/backend/src/pipeline/populate-prompt.ts @@ -0,0 +1,44 @@ +import type { DatasetContext } from "./populate.js"; + +export const populateAgentInstructions = `You fill datasets with real data. Here's how: + +1. Search the web for data that fits the dataset topic. +2. Fetch 1-2 pages to get details. +3. Call insert_row only for rows supported by search or fetched page content. +4. Also return structured rows with cells, sourceUrls, evidence, and needsReview. + +Never make up rows or missing cell values. If you can't find enough real data, insert fewer rows and explain the gap in your final response.`; + +export function buildPopulatePrompt(inputData: DatasetContext): string { + const columnNames = inputData.columns.map((c) => c.name); + const columnsDesc = inputData.columns + .map( + (c) => + `- "${c.name}" (${c.type})${c.description ? 
`: ${c.description}` : ""}`, + ) + .join("\n"); + + return `Dataset ID: ${inputData.datasetId} +Dataset: ${inputData.datasetName} +Description: ${inputData.description} + +Columns: +${columnsDesc} + +When calling insert_row, the data object keys MUST be exactly these strings (no backticks, no extra quotes): +${JSON.stringify(columnNames)} + +Example insert_row call: +insert_row({ datasetId: "${inputData.datasetId}", data: { ${columnNames.map((n) => `"${n}": `).join(", ")} } }) + +Search the web for real data about this topic. Then call insert_row for up to 10 source-backed rows. + +Important: +- The dataset should be populated by insert_row tool calls whenever possible. +- Also return structured rows using this shape: { rows: [{ cells, sourceUrls, evidence, needsReview }] }. +- Every structured row cells object must contain exactly the requested column keys above. +- Every structured row must include sourceUrls and evidence quotes copied from search_web or fetch_page results. +- For every verified row, call insert_row with the exact datasetId above. +- Never invent rows or cell values. +- If sources only support fewer than 10 rows, insert only the verified rows and explain what was missing.`; +} diff --git a/backend/src/pipeline/populate-runtime-prerequisites.ts b/backend/src/pipeline/populate-runtime-prerequisites.ts new file mode 100644 index 0000000..7292f13 --- /dev/null +++ b/backend/src/pipeline/populate-runtime-prerequisites.ts @@ -0,0 +1,36 @@ +export interface PopulateRuntimePrerequisites { + convexUrl?: string; + convexAdminKey?: string; + openRouterApiKey?: string; + tinyFishApiKey?: string; + shouldCommitRows?: boolean; + shouldLoadDatasetContext?: boolean; +} + +export function missingPopulateRuntimePrerequisites( + input: PopulateRuntimePrerequisites +): string[] { + const requiredKeys: Array<[string, string | undefined]> = []; + if ((input.shouldCommitRows ?? 
true) || input.shouldLoadDatasetContext) { + requiredKeys.push(["CONVEX_URL", input.convexUrl]); + requiredKeys.push(["CONVEX_SELF_HOSTED_ADMIN_KEY", input.convexAdminKey]); + } + requiredKeys.push( + ["OPENROUTER_API_KEY", input.openRouterApiKey], + ["TINYFISH_API_KEY", input.tinyFishApiKey] + ); + + return requiredKeys + .filter(([, value]) => !value) + .map(([name]) => name); +} + +export function populateRuntimePrerequisiteError( + input: PopulateRuntimePrerequisites +): string | undefined { + const missingNames = missingPopulateRuntimePrerequisites(input); + if (missingNames.length === 0) { + return undefined; + } + return `Backend is missing required populate runtime keys: ${missingNames.join(", ")}.`; +} diff --git a/backend/src/pipeline/populate-runtime-selection.ts b/backend/src/pipeline/populate-runtime-selection.ts new file mode 100644 index 0000000..62c6656 --- /dev/null +++ b/backend/src/pipeline/populate-runtime-selection.ts @@ -0,0 +1,93 @@ +import { resolve } from "node:path"; +import { pathToFileURL } from "node:url"; + +import { + CollectionPopulateRecipeRuntime, + type CollectionPopulateBenchmarkMetadata, + type CollectionPopulatePipelineRunner, +} from "./populate-collection-runtime.js"; +import { + MastraPopulateRecipeRuntime, + type PopulateRecipeRuntime, +} from "./populate-self-healing.js"; + +export type PopulateAgentRuntimeName = "mastra" | "collection"; + +export interface CreatePopulateRecipeRuntimeInput { + env: NodeJS.ProcessEnv; + maxRows?: number; + collectionRunner?: CollectionPopulatePipelineRunner; +} + +export function selectedPopulateRuntimeName( + env: NodeJS.ProcessEnv +): PopulateAgentRuntimeName { + const rawRuntimeName = ( + env.POPULATE_AGENT_RUNTIME ?? + env.DATASET_AGENT_RUNTIME ?? 
+ "mastra" + ).trim().toLowerCase(); + + if (rawRuntimeName === "mastra" || rawRuntimeName === "mastra-populate") { + return "mastra"; + } + if (rawRuntimeName === "collection") { + return "collection"; + } + throw new Error( + `Unsupported POPULATE_AGENT_RUNTIME: ${rawRuntimeName || "(empty)"}.` + ); +} + +export async function createPopulateRecipeRuntime( + input: CreatePopulateRecipeRuntimeInput +): Promise { + const runtimeName = selectedPopulateRuntimeName(input.env); + if (runtimeName === "mastra") { + return new MastraPopulateRecipeRuntime({ maxRows: input.maxRows }); + } + const collectionRunner = + input.collectionRunner ?? await loadCollectionRunnerFromEnv(input.env); + if (!collectionRunner) { + throw new Error( + "POPULATE_AGENT_RUNTIME=collection requires a collection pipeline runner or POPULATE_COLLECTION_RUNNER_MODULE." + ); + } + return new CollectionPopulateRecipeRuntime({ + runPipeline: collectionRunner, + targetRows: input.maxRows, + benchmarkMetadata: collectionBenchmarkMetadataFromEnv(input.env), + }); +} + +async function loadCollectionRunnerFromEnv( + env: NodeJS.ProcessEnv +): Promise { + const moduleSpecifier = env.POPULATE_COLLECTION_RUNNER_MODULE; + if (!moduleSpecifier) { + return undefined; + } + + const moduleUrl = moduleSpecifier.startsWith(".") || moduleSpecifier.startsWith("/") + ? pathToFileURL(resolve(moduleSpecifier)).href + : moduleSpecifier; + const loadedModule = await import(moduleUrl); + const runner = loadedModule.runCollectionPopulatePipeline ?? 
loadedModule.default; + if (typeof runner !== "function") { + throw new Error( + `${moduleSpecifier} must export runCollectionPopulatePipeline(input) or a default runner.` + ); + } + return runner as CollectionPopulatePipelineRunner; +} + +function collectionBenchmarkMetadataFromEnv( + env: NodeJS.ProcessEnv +): CollectionPopulateBenchmarkMetadata { + return { + promptId: env.BIGSET_BENCHMARK_PROMPT_ID, + promptQuality: env.BIGSET_BENCHMARK_PROMPT_QUALITY, + persona: env.BIGSET_BENCHMARK_PERSONA, + expectedStress: env.BIGSET_BENCHMARK_EXPECTED_STRESS, + }; +} diff --git a/backend/src/pipeline/populate-runtime.ts b/backend/src/pipeline/populate-runtime.ts new file mode 100644 index 0000000..a91dbe3 --- /dev/null +++ b/backend/src/pipeline/populate-runtime.ts @@ -0,0 +1,1248 @@ +import { createTool } from "@mastra/core/tools"; +import { Agent } from "@mastra/core/agent"; +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; +import { z } from "zod"; + +import { + buildPopulatePrompt, + populateAgentInstructions, +} from "./populate-prompt.js"; +import { + datasetContextSchema, + type DatasetContext, +} from "./populate.js"; + +export type PopulateCellValue = + | string + | number + | boolean + | null + | Record + | unknown[]; + +export interface PopulateRuntimeRow { + cells: Record; + sourceUrls: string[]; + evidence: Array<{ + columnName: string; + sourceUrl: string; + quote: string; + }>; + needsReview: boolean; +} + +export interface PopulateRuntimeCapturedInsertedRow { + datasetId: string; + data: Record; +} + +export interface PopulateRuntimeCapturedSource { + url: string; + text: string; +} + +export interface PopulateRuntimeDebug { + capturedRows: PopulateRuntimeCapturedInsertedRow[]; + capturedSources: PopulateRuntimeCapturedSource[]; + selectedRowSource: "insert_row" | "structured_recovery" | "none"; + notes: string[]; +} + +export interface PopulateRuntimeResult { + rows: PopulateRuntimeRow[]; + validationIssues: string[]; + usage: { + 
promptTokens: number; + completionTokens: number; + totalTokens: number; + }; + metrics: { + searchCalls: number; + fetchCalls: number; + browserCalls: number; + agentRuns: number; + agentSteps: number; + }; + debug?: PopulateRuntimeDebug; +} + +export interface PopulateWebSearchResult { + title: string; + snippet?: string; + url: string; +} + +export interface PopulateFetchedPage { + title?: string; + text?: string; +} + +export interface PopulateRuntimeWebTools { + search(input: { query: string }): Promise; + fetch(input: { url: string }): Promise; +} + +export type PopulateRuntimeAgentRunner = (input: { + prompt: string; + tools: Record; +}) => Promise; + +const structuredPopulateEvidenceSchema = z.object({ + columnName: z.string().optional(), + sourceUrl: z.string().optional(), + quote: z.string(), +}); + +const structuredPopulateOutputSchema = z.object({ + rows: z.array(z.object({ + cells: z.record(z.string(), z.any()), + sourceUrls: z.array(z.string()).optional(), + evidence: z.array(structuredPopulateEvidenceSchema).optional(), + needsReview: z.boolean().optional(), + })).default([]), + validationIssues: z.array(z.string()).default([]), +}); + +type StructuredPopulateOutput = z.infer; + +export async function runPopulateRuntime(input: { + context: DatasetContext; + webTools?: PopulateRuntimeWebTools; + agentRunner?: PopulateRuntimeAgentRunner; + maxRows?: number; +}): Promise { + const parsedContext = datasetContextSchema.parse(input.context); + const clarificationResult = clarificationResultForContext(parsedContext); + if (clarificationResult) { + return clarificationResult; + } + + const capturedRows: PopulateRuntimeCapturedInsertedRow[] = []; + const capturedSources: PopulateRuntimeCapturedSource[] = []; + const validationIssues: string[] = []; + const debugNotes: string[] = []; + const metrics = emptyMetrics(); + const webTools = input.webTools ?? 
createTinyFishWebTools(); + const tools = createPopulateRuntimeTools({ + datasetId: parsedContext.datasetId, + capturedRows, + capturedSources, + validationIssues, + metrics, + webTools, + maxRows: input.maxRows ?? 10, + }); + const prompt = buildPopulatePrompt(parsedContext); + let agentOutput: unknown; + + if (input.agentRunner) { + try { + agentOutput = await input.agentRunner({ prompt, tools }); + metrics.agentRuns += 1; + } catch (error) { + validationIssues.push(populateAgentFailureMessage(error)); + } + } else { + try { + const agent = createRuntimePopulateAgent({ tools }); + agentOutput = await agent.generate(prompt); + metrics.agentRuns += 1; + } catch (error) { + validationIssues.push(populateAgentFailureMessage(error)); + } + + } + + const insertedRows = capturedRows.map((row) => benchmarkRowFromInsertedData(row.data)); + const insertedRowIssues = validateRuntimeRows(insertedRows); + if ( + !input.agentRunner && + capturedSources.length > 0 && + shouldRecoverFromInsertedRows(insertedRowIssues) + ) { + await enrichCapturedSourcesForStructuredFallback({ + context: parsedContext, + capturedSources, + validationIssues, + metrics, + webTools, + }); + try { + agentOutput = await generateStructuredRowsFromCapturedSources({ + context: parsedContext, + capturedSources, + }); + metrics.agentRuns += 1; + } catch (error) { + validationIssues.push( + `Structured row generation failed: ${ + error instanceof Error ? error.message : String(error) + }` + ); + } + } + + const structuredRows = benchmarkRowsFromStructuredOutput({ + output: structuredOutputFromAgentResult(agentOutput), + maxRows: input.maxRows ?? 
10, + context: parsedContext, + requestedColumns: parsedContext.columns.map((column) => column.name), + capturedSources, + validationIssues, + debugNotes, + }); + const structuredRowIssues = validateRuntimeRows(structuredRows); + if ( + insertedRows.length > 0 && + insertedRowIssues.length === 0 && + structuredRows.length > 0 && + hasContradictingStructuredRows(insertedRows, structuredRows) + ) { + validationIssues.push( + "Structured populate rows differed from insert_row rows and were ignored." + ); + } + const rows = selectBestRuntimeRows({ + insertedRows, + insertedRowIssues, + structuredRows, + structuredRowIssues, + debugNotes, + }); + const selectedRowSource = selectedRowSourceForRows({ + rows, + insertedRows, + structuredRows, + }); + validationIssues.push(...validateRuntimeRows(rows)); + + return { + rows, + validationIssues: Array.from(new Set(validationIssues)), + usage: emptyUsage(), + metrics, + debug: { + capturedRows, + capturedSources, + selectedRowSource, + notes: debugNotes, + }, + }; +} + +function createRuntimePopulateAgent(input: { tools: Record }) { + const openrouter = createOpenRouter({ + apiKey: requiredEnv("OPENROUTER_API_KEY"), + }); + + return new Agent({ + id: "populate-agent", + name: "Dataset Populate Agent", + instructions: populateAgentInstructions, + model: openrouter("anthropic/claude-sonnet-4-6"), + tools: input.tools as ConstructorParameters[0]["tools"], + }); +} + +function clarificationResultForContext( + context: DatasetContext +): PopulateRuntimeResult | undefined { + const text = context.description.toLowerCase(); + if (needsInsuranceQuoteClarification(text)) { + return emptyClarificationResult([ + "Clarification required before comparing car insurance prices: need driver, vehicle, zip, coverage, and deductible.", + ]); + } + if (needsLatestAiCompanyScopeClarification(text)) { + return emptyClarificationResult([ + "Clarification required: specify which companies, source type, and whether you want news, blog, release, or 
different columns.", + ]); + } + return undefined; +} + +function needsInsuranceQuoteClarification(text: string): boolean { + return /\bcar insurance\b/.test(text) && + /\b(price|prices|quote|quotes|best bang|best)\b/.test(text); +} + +function needsLatestAiCompanyScopeClarification(text: string): boolean { + return /\blatest stuff\b/.test(text) && /\bbig ai companies\b/.test(text); +} + +function emptyClarificationResult(validationIssues: string[]): PopulateRuntimeResult { + return { + rows: [], + validationIssues, + usage: emptyUsage(), + metrics: emptyMetrics(), + debug: { + capturedRows: [], + capturedSources: [], + selectedRowSource: "none", + notes: [], + }, + }; +} + +async function enrichCapturedSourcesForStructuredFallback(input: { + context: DatasetContext; + capturedSources: PopulateRuntimeCapturedSource[]; + validationIssues: string[]; + metrics: PopulateRuntimeResult["metrics"]; + webTools: PopulateRuntimeWebTools; +}) { + const entities = entityCandidatesFromDescription(input.context.description); + const newSources: PopulateRuntimeCapturedSource[] = []; + for (const entity of entities.slice(0, 4)) { + let results: PopulateWebSearchResult[] = []; + for (const query of searchQueriesForEntity(entity, input.context)) { + input.metrics.searchCalls += 1; + try { + results = uniqueSearchResults([ + ...results, + ...await input.webTools.search({ query }), + ]); + } catch (error) { + input.validationIssues.push( + `Structured fallback search failed for ${entity}: ${ + error instanceof Error ? 
error.message : String(error) + }` + ); + } + } + + const officialPath = officialContentPathForEntity(entity, input.context); + if (officialPath) { + await captureDirectOfficialSource({ + entity, + url: urlFromOfficialPath(officialPath), + input, + newSources, + }); + } + + const rankedResults = rankSearchResultsForEntity(results, entity).slice(0, 4); + for (const result of rankedResults) { + newSources.push({ + url: result.url, + text: [result.title, result.snippet].filter(Boolean).join("\n"), + }); + input.metrics.fetchCalls += 1; + try { + const page = await input.webTools.fetch({ url: result.url }); + newSources.push({ + url: result.url, + text: [page.title, page.text].filter(Boolean).join("\n"), + }); + } catch (error) { + input.validationIssues.push( + `Structured fallback fetch failed for ${result.url}: ${ + error instanceof Error ? error.message : String(error) + }` + ); + } + } + } + input.capturedSources.unshift(...newSources); +} + +async function captureDirectOfficialSource(input: { + entity: string; + url: string; + input: { + validationIssues: string[]; + metrics: PopulateRuntimeResult["metrics"]; + webTools: PopulateRuntimeWebTools; + }; + newSources: PopulateRuntimeCapturedSource[]; +}) { + input.newSources.push({ + url: input.url, + text: `${input.entity} official source\n${input.url}`, + }); + input.input.metrics.fetchCalls += 1; + try { + const page = await input.input.webTools.fetch({ url: input.url }); + input.newSources.push({ + url: input.url, + text: [page.title, page.text].filter(Boolean).join("\n"), + }); + } catch (error) { + input.input.validationIssues.push( + `Structured fallback fetch failed for ${input.url}: ${ + error instanceof Error ? error.message : String(error) + }` + ); + } +} + +function urlFromOfficialPath(officialPath: string): string { + return officialPath.startsWith("http") ? 
officialPath : `https://${officialPath}`; +} + +function searchQueriesForEntity(entity: string, context: DatasetContext): string[] { + const searchPhrase = taskSearchPhrase(context); + const queries = [ + `${entity} ${searchPhrase} official source`, + ...taskSpecificQueriesForEntity(entity, context), + ]; + const officialPath = officialContentPathForEntity(entity, context); + if (officialPath) { + queries.push(`site:${officialPath} ${entity} ${searchPhrase}`); + } + return Array.from(new Set(queries)); +} + +function taskSpecificQueriesForEntity( + entity: string, + context: DatasetContext +): string[] { + const taskText = contextText(context); + const queries: string[] = []; + if (/\b(mcp|docs?|server|setup)\b/i.test(taskText)) { + queries.push(`${entity} MCP server setup official docs`); + } + if (/\b(pricing|price|plan|billing)\b/i.test(taskText)) { + queries.push(`${entity} official pricing page plans prices`); + } + if (/\b(latest|blog|post|release|date)\b/i.test(taskText)) { + queries.push(`${entity} latest official blog post publish date`); + } + return queries; +} + +function officialContentPathForEntity( + entity: string, + context: DatasetContext +): string | undefined { + const taskText = contextText(context); + if (/\b(mcp|docs?|server|setup)\b/i.test(taskText)) { + if (/openai/i.test(entity)) { + return "developers.openai.com/api/docs/mcp"; + } + if (/anthropic/i.test(entity)) { + return "docs.anthropic.com/en/docs/agents-and-tools/mcp-connector"; + } + if (/cloudflare/i.test(entity)) { + return "developers.cloudflare.com/agents/model-context-protocol"; + } + } + if (/\b(pricing|price|plan|billing)\b/i.test(taskText)) { + if (/stripe/i.test(entity)) { + return "stripe.com/pricing"; + } + if (/paddle/i.test(entity)) { + return "paddle.com/billing"; + } + if (/chargebee/i.test(entity)) { + return "chargebee.com/pricing"; + } + } + if (/openai/i.test(entity)) { + return "openai.com/index"; + } + if (/anthropic/i.test(entity)) { + return 
"anthropic.com/news"; + } + if (/deepmind|google/i.test(entity)) { + return "deepmind.google/blog"; + } + return undefined; +} + +function taskSearchPhrase(context: DatasetContext): string { + const taskText = contextText(context); + if (/\b(mcp|docs?|server|setup)\b/i.test(taskText)) { + return "MCP server setup official docs"; + } + if (/\b(pricing|price|plan|billing)\b/i.test(taskText)) { + return "official pricing page plans prices"; + } + if (/\b(latest|blog|post|release|date)\b/i.test(taskText)) { + return "latest official source title date URL"; + } + return truncateForPrompt(context.description, 120); +} + +function contextText(context: DatasetContext): string { + return [ + context.description, + ...context.columns.map((column) => `${column.name} ${column.description ?? ""}`), + ].join(" "); +} + +function uniqueSearchResults(results: PopulateWebSearchResult[]): PopulateWebSearchResult[] { + const byUrl = new Map(); + for (const result of results) { + if (!byUrl.has(result.url)) { + byUrl.set(result.url, result); + } + } + return [...byUrl.values()]; +} + +function entityCandidatesFromDescription(description: string): string[] { + const fromSegment = description.match(/\bfrom\s+([^?.]+)/i)?.[1]; + const rawCandidates = fromSegment + ? fromSegment.split(/,|\band\b/i) + : description.match(/\b[A-Z][A-Za-z0-9.-]*(?:\s+[A-Z][A-Za-z0-9.-]*){0,3}\b/g) ?? 
[]; + + return Array.from(new Set(rawCandidates + .map((candidate) => candidate.replace(/\b(and|or|the|a|an)\b/gi, " ").trim()) + .map((candidate) => candidate.replace(/\bfor\b/gi, " ").trim()) + .map((candidate) => candidate.replace(/\s+/g, " ")) + .filter((candidate) => + candidate.length >= 2 && + candidate.length <= 60 && + !/^(can|could|would|table|title|url|date|latest)$/i.test(candidate) + ))); +} + +function rankSearchResultsForEntity( + results: PopulateWebSearchResult[], + entity: string +): PopulateWebSearchResult[] { + const entityTokens = entity.toLowerCase().split(/\s+/).filter((token) => token.length > 2); + return [...results].sort((a, b) => + searchResultScore(b, entityTokens) - searchResultScore(a, entityTokens) + ); +} + +function searchResultScore( + result: PopulateWebSearchResult, + entityTokens: string[] +): number { + const haystack = `${result.title} ${result.snippet ?? ""} ${result.url}`.toLowerCase(); + let score = 0; + for (const token of entityTokens) { + if (haystack.includes(token)) { + score += 1; + } + } + if (/official|blog|news|post/i.test(haystack)) { + score += 1; + } + if (/\.com|\.google|\.ai/i.test(result.url)) { + score += 0.5; + } + return score; +} + +async function generateStructuredRowsFromCapturedSources(input: { + context: DatasetContext; + capturedSources: PopulateRuntimeCapturedSource[]; +}): Promise { + const openrouter = createOpenRouter({ + apiKey: requiredEnv("OPENROUTER_API_KEY"), + }); + const agent = new Agent({ + id: "populate-structured-row-agent", + name: "Dataset Populate Structured Row Agent", + instructions: [ + "Convert captured search/fetch source text into benchmark rows.", + "Only use facts directly present in the source transcript.", + "Every evidence quote must be copied from source text.", + ].join("\n"), + model: openrouter("anthropic/claude-sonnet-4-6"), + }); + const output = await agent.generate(buildStructuredRowsPrompt(input), { + structuredOutput: { + schema: structuredPopulateOutputSchema, 
+ jsonPromptInjection: true, + errorStrategy: "fallback", + fallbackValue: { + rows: [], + validationIssues: ["Structured row generation produced no valid rows."], + }, + }, + }); + return structuredPopulateOutputSchema.parse(output.object); +} + +function buildStructuredRowsPrompt(input: { + context: DatasetContext; + capturedSources: PopulateRuntimeCapturedSource[]; +}): string { + const columnNames = input.context.columns.map((column) => column.name); + const entities = entityCandidatesFromDescription(input.context.description); + const officialHints = Object.fromEntries( + entities.map((entity) => [ + entity, + officialContentPathForEntity(entity, input.context) ?? "official source", + ]) + ); + const sourceTranscript = input.capturedSources + .slice(0, 30) + .map((source, index) => [ + `SOURCE ${index + 1}`, + `URL: ${source.url}`, + "TEXT:", + truncateForPrompt(source.text, 3_000), + ].join("\n")) + .join("\n\n"); + + return `Dataset description: +${input.context.description} + +Required columns: +${JSON.stringify(columnNames)} + +Named entities, when present: +${JSON.stringify(entities)} + +Official source hints: +${JSON.stringify(officialHints)} + +Captured source transcript: +${sourceTranscript} + +Return rows using this exact shape: +{ "rows": [{ "cells": {}, "sourceUrls": [], "evidence": [{ "columnName": "", "sourceUrl": "", "quote": "" }], "needsReview": true }], "validationIssues": [] } + +Rules: +- cells must contain exactly the required columns. +- sourceUrls must contain exact URLs from the captured source transcript. +- evidence.sourceUrl must exactly match one captured source URL. +- evidence.quote must be copied verbatim from that source text. +- needsReview must be true. +- If named entities are present, return at most one best row per named entity. +- Prefer official docs, pricing, or product pages over blogs, announcements, directories, or reviews unless the prompt asks for news/blog posts. 
+- Return fewer rows rather than inventing missing values.`; +} + +function truncateForPrompt(value: string, maxLength: number): string { + if (value.length <= maxLength) { + return value; + } + return `${value.slice(0, maxLength)}\n[truncated]`; +} + +function populateAgentFailureMessage(error: unknown): string { + return `Populate agent failed: ${ + error instanceof Error ? error.message : String(error) + }`; +} + +function structuredOutputFromAgentResult( + agentOutput: unknown +): StructuredPopulateOutput | undefined { + const candidates = [ + objectProperty(agentOutput, "object"), + agentOutput, + ]; + for (const candidate of candidates) { + const parsed = structuredPopulateOutputSchema.safeParse(candidate); + if (parsed.success) { + return parsed.data; + } + } + return undefined; +} + +function objectProperty(input: unknown, key: string): unknown { + if (typeof input !== "object" || input === null) { + return undefined; + } + return (input as Record)[key]; +} + +function shouldRecoverFromInsertedRows(issues: string[]): boolean { + return issues.some((issue) => + /returned no rows|no source url|evidence quotes/i.test(issue) + ); +} + +function selectBestRuntimeRows(input: { + insertedRows: PopulateRuntimeRow[]; + insertedRowIssues: string[]; + structuredRows: PopulateRuntimeRow[]; + structuredRowIssues: string[]; + debugNotes: string[]; +}): PopulateRuntimeRow[] { + if (input.insertedRows.length > 0 && input.insertedRowIssues.length === 0) { + return input.insertedRows; + } + if (input.structuredRows.length > 0 && input.structuredRowIssues.length === 0) { + if (input.insertedRows.length > 0) { + input.debugNotes.push( + "Structured row recovery replaced insert_row rows without enough source/evidence support." + ); + } + return input.structuredRows; + } + return input.insertedRows.length > 0 ? 
input.insertedRows : input.structuredRows; +} + +function selectedRowSourceForRows(input: { + rows: PopulateRuntimeRow[]; + insertedRows: PopulateRuntimeRow[]; + structuredRows: PopulateRuntimeRow[]; +}): PopulateRuntimeDebug["selectedRowSource"] { + if (input.rows.length === 0) { + return "none"; + } + if (input.rows === input.insertedRows) { + return "insert_row"; + } + if (input.rows === input.structuredRows) { + return "structured_recovery"; + } + return "none"; +} + +function createPopulateRuntimeTools(input: { + datasetId: string; + capturedRows: PopulateRuntimeCapturedInsertedRow[]; + capturedSources: PopulateRuntimeCapturedSource[]; + validationIssues: string[]; + metrics: PopulateRuntimeResult["metrics"]; + webTools: PopulateRuntimeWebTools; + maxRows: number; +}) { + return { + insert_row: createTool({ + id: "insert_row", + description: "Capture one source-backed row for this populate run.", + inputSchema: z.object({ + datasetId: z.string(), + data: z.record(z.string(), z.any()), + }), + outputSchema: z.object({ + success: z.boolean(), + error: z.string().optional(), + }), + execute: async ({ datasetId, data }) => { + if (datasetId !== input.datasetId) { + return { + success: false, + error: `datasetId must be ${input.datasetId}.`, + }; + } + if (input.capturedRows.length >= input.maxRows) { + return { + success: false, + error: `Row cap reached for this benchmark run (${input.maxRows}).`, + }; + } + input.capturedRows.push({ datasetId, data }); + return { success: true }; + }, + }), + search_web: createTool({ + id: "search_web", + description: "Search the web for source-backed dataset rows.", + inputSchema: z.object({ query: z.string() }), + outputSchema: z.object({ + results: z.array(z.object({ + title: z.string(), + snippet: z.string().optional(), + url: z.string(), + })).optional(), + error: z.string().optional(), + }), + execute: async ({ query }) => { + input.metrics.searchCalls += 1; + try { + const results = await input.webTools.search({ query }); 
+ input.capturedSources.push( + ...results.map((result) => ({ + url: result.url, + text: [result.title, result.snippet].filter(Boolean).join("\n"), + })) + ); + return { results }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + input.validationIssues.push(`search_web failed: ${message}`); + return { error: message }; + } + }, + }), + fetch_page: createTool({ + id: "fetch_page", + description: "Fetch a source page for row details.", + inputSchema: z.object({ url: z.string() }), + outputSchema: z.object({ + title: z.string().optional(), + text: z.string().optional(), + error: z.string().optional(), + }), + execute: async ({ url }) => { + input.metrics.fetchCalls += 1; + try { + const page = await input.webTools.fetch({ url }); + input.capturedSources.push({ + url, + text: [page.title, page.text].filter(Boolean).join("\n"), + }); + return page; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + input.validationIssues.push(`fetch_page failed: ${message}`); + return { error: message }; + } + }, + }), + list_rows: createTool({ + id: "list_rows", + description: "List rows captured in this in-memory populate run.", + inputSchema: z.object({ datasetId: z.string() }), + outputSchema: z.object({ rows: z.array(z.any()) }), + execute: async () => ({ rows: input.capturedRows }), + }), + }; +} + +function createTinyFishWebTools(): PopulateRuntimeWebTools { + return { + async search({ query }) { + const apiKey = requiredEnv("TINYFISH_API_KEY"); + const response = await fetch( + `https://api.search.tinyfish.ai?query=${encodeURIComponent(query)}`, + { headers: { "X-API-Key": apiKey } } + ); + if (!response.ok) { + throw new Error(`TinyFish search returned HTTP ${response.status}.`); + } + const payload = await response.json() as { + results?: Array<{ title?: string; snippet?: string; url?: string }>; + }; + return (payload.results ?? 
[]) + .filter((result) => result.title && result.url) + .map((result) => ({ + title: result.title!, + snippet: result.snippet, + url: result.url!, + })); + }, + async fetch({ url }) { + const apiKey = requiredEnv("TINYFISH_API_KEY"); + const response = await fetch("https://api.fetch.tinyfish.ai", { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-API-Key": apiKey, + }, + body: JSON.stringify({ urls: [url], format: "markdown" }), + }); + if (!response.ok) { + throw new Error(`TinyFish fetch returned HTTP ${response.status}.`); + } + const payload = await response.json() as { + results?: Array<{ title?: string; text?: string }>; + errors?: Array<{ error?: string }>; + }; + const page = payload.results?.[0]; + if (!page && payload.errors?.[0]) { + throw new Error(payload.errors[0].error ?? "TinyFish fetch failed."); + } + return { + title: page?.title, + text: page?.text, + }; + }, + }; +} + +function benchmarkRowFromInsertedData( + data: Record +): PopulateRuntimeRow { + const cells = normalizeCells(data); + const sourceUrls = sourceUrlsFromData(cells); + return { + cells, + sourceUrls, + evidence: evidenceFromData(cells, sourceUrls), + needsReview: true, + }; +} + +function benchmarkRowsFromStructuredOutput(input: { + output: StructuredPopulateOutput | undefined; + maxRows: number; + context: DatasetContext; + requestedColumns: string[]; + capturedSources: PopulateRuntimeCapturedSource[]; + validationIssues: string[]; + debugNotes: string[]; +}): PopulateRuntimeRow[] { + if (!input.output) { + return []; + } + const rows: PopulateRuntimeRow[] = []; + input.output.validationIssues.forEach((issue) => { + input.validationIssues.push(`Populate agent reported: ${issue}`); + }); + + input.output.rows.slice(0, input.maxRows).forEach((row, index) => { + const cells = normalizeCells(row.cells); + const columnIssue = validateStructuredRowColumns(cells, input.requestedColumns); + if (columnIssue) { + input.validationIssues.push(`Structured row ${index 
+ 1}: ${columnIssue}`); + return; + } + + const sourceUrls = uniqueHttpUrls([ + ...(row.sourceUrls ?? []), + ...sourceUrlsFromData(cells), + ...(row.evidence ?? []).map((item) => item.sourceUrl ?? ""), + ]); + const evidence = repairStructuredEvidence({ + evidence: normalizeStructuredEvidence(row.evidence ?? []), + cells, + sourceUrls, + capturedSources: input.capturedSources, + context: input.context, + debugNotes: input.debugNotes, + rowNumber: index + 1, + }); + if (sourceUrls.length === 0) { + input.validationIssues.push( + `Structured row ${index + 1}: missing sourceUrls.` + ); + return; + } + if (evidence.length === 0) { + input.validationIssues.push( + `Structured row ${index + 1}: missing evidence.` + ); + return; + } + const unmatchedEvidence = evidence.find( + (item) => !isEvidenceBackedByCapturedSource(item, input.capturedSources) + ); + if (unmatchedEvidence) { + input.validationIssues.push( + `Structured row ${index + 1}: evidence quote not found in captured source ${unmatchedEvidence.sourceUrl}.` + ); + return; + } + + rows.push({ + cells, + sourceUrls, + evidence, + needsReview: true, + }); + }); + + return selectRepresentativeRows(rows, input.context); +} + +function validateStructuredRowColumns( + cells: Record, + requestedColumns: string[] +): string | undefined { + const actualColumns = Object.keys(cells).sort(); + const expectedColumns = [...requestedColumns].sort(); + if (JSON.stringify(actualColumns) !== JSON.stringify(expectedColumns)) { + return `cells must contain exactly requested columns ${JSON.stringify(requestedColumns)}.`; + } + return undefined; +} + +function normalizeStructuredEvidence( + evidence: Array> +): PopulateRuntimeRow["evidence"] { + return evidence + .map((item) => ({ + columnName: item.columnName?.trim() || "entity_name", + sourceUrl: item.sourceUrl?.trim() ?? 
"", + quote: item.quote.trim(), + })) + .filter((item) => item.sourceUrl && item.quote); +} + +function repairStructuredEvidence(input: { + evidence: PopulateRuntimeRow["evidence"]; + cells: Record; + sourceUrls: string[]; + capturedSources: PopulateRuntimeCapturedSource[]; + context: DatasetContext; + debugNotes: string[]; + rowNumber: number; +}): PopulateRuntimeRow["evidence"] { + return input.evidence.map((item) => { + if (isEvidenceBackedByCapturedSource(item, input.capturedSources)) { + return item; + } + const repairedQuote = quoteFromCapturedSources({ + cells: input.cells, + sourceUrls: input.sourceUrls, + capturedSources: input.capturedSources, + context: input.context, + }); + if (!repairedQuote) { + return item; + } + input.debugNotes.push( + `Structured row ${input.rowNumber}: replaced evidence quote with captured source text.` + ); + return { + ...item, + sourceUrl: repairedQuote.sourceUrl, + quote: repairedQuote.quote, + }; + }); +} + +function quoteFromCapturedSources(input: { + cells: Record; + sourceUrls: string[]; + capturedSources: PopulateRuntimeCapturedSource[]; + context: DatasetContext; +}): { sourceUrl: string; quote: string } | undefined { + const sourceUrlSet = new Set(input.sourceUrls); + const candidateValues = Object.entries(input.cells) + .filter(([columnName]) => !/(^entity_name$|^source_url$|url$|website|link)/i.test(columnName)) + .flatMap(([, value]) => stringCandidatesFromCellValue(value)) + .filter((value) => value.length >= 5) + .sort((a, b) => b.length - a.length); + const sources = input.capturedSources.filter((source) => sourceUrlSet.has(source.url)); + for (const source of sources) { + const normalizedSourceText = normalizeEvidenceText(source.text); + for (const candidate of candidateValues) { + if (normalizedSourceText.includes(normalizeEvidenceText(candidate))) { + return { + sourceUrl: source.url, + quote: sourceQuoteForCandidate(source.text, candidate), + }; + } + } + const taskFallbackQuote = 
taskSpecificSourceQuote(source.text, input.context); + if (taskFallbackQuote) { + return { + sourceUrl: source.url, + quote: taskFallbackQuote, + }; + } + } + return undefined; +} + +function taskSpecificSourceQuote( + sourceText: string, + context: DatasetContext +): string | undefined { + const taskText = contextText(context); + const lineMatcher = /\b(pricing|price|plan|billing|starter|performance|enterprise|merchant|transaction|\$|%)\b/i; + if (!/\b(pricing|price|plan|billing)\b/i.test(taskText)) { + return undefined; + } + return sourceText + .split(/\r?\n/) + .map((line) => line.trim()) + .find((line) => lineMatcher.test(line)) + ?.slice(0, 240); +} + +function stringCandidatesFromCellValue(value: PopulateCellValue): string[] { + if (typeof value === "string") { + return [value]; + } + if (typeof value === "number" || typeof value === "boolean") { + return [String(value)]; + } + return []; +} + +function sourceQuoteForCandidate(sourceText: string, candidate: string): string { + const lines = sourceText.split(/\r?\n/).map((line) => line.trim()).filter(Boolean); + return lines.find((line) => + normalizeEvidenceText(line).includes(normalizeEvidenceText(candidate)) + ) ?? 
candidate; +} + +function isEvidenceBackedByCapturedSource( + evidence: PopulateRuntimeRow["evidence"][number], + capturedSources: PopulateRuntimeCapturedSource[] +): boolean { + const normalizedQuote = normalizeEvidenceText(evidence.quote); + return capturedSources.some((source) => { + if (source.url !== evidence.sourceUrl) { + return false; + } + return normalizeEvidenceText(source.text).includes(normalizedQuote); + }); +} + +function selectRepresentativeRows( + rows: PopulateRuntimeRow[], + context: DatasetContext +): PopulateRuntimeRow[] { + const entities = entityCandidatesFromDescription(context.description); + if (entities.length < 2 || rows.length <= entities.length) { + return rows; + } + const selectedRows = entities + .map((entity) => bestRowForEntity(rows, entity, context)) + .filter((row): row is PopulateRuntimeRow => Boolean(row)); + + return selectedRows.length > 0 ? selectedRows : rows; +} + +function bestRowForEntity( + rows: PopulateRuntimeRow[], + entity: string, + context: DatasetContext +): PopulateRuntimeRow | undefined { + const candidates = rows.filter((row) => + normalizeEvidenceText(String(row.cells.entity_name ?? "")).includes( + normalizeEvidenceText(entity) + ) || + normalizeEvidenceText(entity).includes( + normalizeEvidenceText(String(row.cells.entity_name ?? 
"")) + ) + ); + return candidates.sort((a, b) => + representativeRowScore(b, entity, context) - + representativeRowScore(a, entity, context) + )[0]; +} + +function representativeRowScore( + row: PopulateRuntimeRow, + entity: string, + context: DatasetContext +): number { + const rowText = JSON.stringify(row).toLowerCase(); + const officialPath = officialContentPathForEntity(entity, context); + let score = row.evidence.length * 2 + row.sourceUrls.length; + if (officialPath && rowText.includes(officialPath.toLowerCase())) { + score += 10; + } + if (/\bdocs?\b|developers\./i.test(rowText)) { + score += 3; + } + if (/\bpricing\b|\/pricing/i.test(rowText)) { + score += 3; + } + if (/\bblog\b|reddit|capterra|review/i.test(rowText)) { + score -= 4; + } + return score; +} + +function hasContradictingStructuredRows( + insertedRows: PopulateRuntimeRow[], + structuredRows: PopulateRuntimeRow[] +): boolean { + if (structuredRows.length === 0) { + return false; + } + return rowFingerprint(insertedRows) !== rowFingerprint(structuredRows); +} + +function rowFingerprint(rows: PopulateRuntimeRow[]): string { + return JSON.stringify(rows.map((row) => row.cells)); +} + +function normalizeCells( + data: Record +): Record { + return Object.fromEntries( + Object.entries(data).map(([key, value]) => [key, normalizeCellValue(value)]) + ); +} + +function normalizeCellValue(value: unknown): PopulateCellValue { + if ( + typeof value === "string" || + typeof value === "number" || + typeof value === "boolean" || + value === null || + Array.isArray(value) + ) { + return value; + } + if (typeof value === "object" && value !== null) { + return value as Record; + } + return null; +} + +function evidenceFromData( + data: Record, + sourceUrls: string[] +): PopulateRuntimeRow["evidence"] { + const quote = + stringValue(data.evidence_quote) ?? + stringValue(data.evidence) ?? 
+ stringValue(data.quote); + if (!quote) { + return []; + } + return [{ + columnName: firstPresentColumn(data), + sourceUrl: sourceUrls[0] ?? "", + quote, + }]; +} + +function sourceUrlsFromData(data: Record): string[] { + const urls = []; + for (const [key, value] of Object.entries(data)) { + if (!/(url|website|source|link|page)/i.test(key)) { + continue; + } + if (typeof value === "string" && /^https?:\/\//i.test(value)) { + urls.push(value); + } + } + return Array.from(new Set(urls)); +} + +function uniqueHttpUrls(values: string[]): string[] { + return Array.from(new Set( + values.filter((value) => /^https?:\/\//i.test(value)) + )); +} + +function normalizeEvidenceText(value: string): string { + return value.toLowerCase().replace(/\s+/g, " ").trim(); +} + +function validateRuntimeRows(rows: PopulateRuntimeRow[]): string[] { + const issues = []; + if (rows.length === 0) { + issues.push("Mastra populate runtime returned no rows."); + } + if (rows.some((row) => row.sourceUrls.length === 0)) { + issues.push("One or more Mastra populate rows have no source URL."); + } + if (rows.some((row) => row.evidence.length === 0)) { + issues.push("Mastra populate rows do not include per-row evidence quotes yet."); + } + return issues; +} + +function firstPresentColumn(data: Record): string { + return Object.keys(data)[0] ?? "entity_name"; +} + +function stringValue(value: unknown): string | undefined { + return typeof value === "string" && value.trim() ? 
value.trim() : undefined; +} + +function emptyUsage(): PopulateRuntimeResult["usage"] { + return { promptTokens: 0, completionTokens: 0, totalTokens: 0 }; +} + +function emptyMetrics(): PopulateRuntimeResult["metrics"] { + return { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }; +} + +function requiredEnv(name: string): string { + const value = process.env[name]; + if (!value) { + throw new Error(`Missing required environment variable: ${name}`); + } + return value; +} diff --git a/backend/src/pipeline/populate-self-healing-cli.ts b/backend/src/pipeline/populate-self-healing-cli.ts new file mode 100644 index 0000000..ddec693 --- /dev/null +++ b/backend/src/pipeline/populate-self-healing-cli.ts @@ -0,0 +1,6 @@ +import { runPopulateSelfHealingCli } from "./populate-self-healing-command.js"; + +process.exitCode = await runPopulateSelfHealingCli({ + argv: process.argv.slice(2), + env: process.env, +}); diff --git a/backend/src/pipeline/populate-self-healing-command.ts b/backend/src/pipeline/populate-self-healing-command.ts new file mode 100644 index 0000000..3436017 --- /dev/null +++ b/backend/src/pipeline/populate-self-healing-command.ts @@ -0,0 +1,249 @@ +import { readFile } from "node:fs/promises"; + +import { + populateRuntimePrerequisiteError, + type PopulateRuntimePrerequisites, +} from "./populate-runtime-prerequisites.js"; +import { datasetContextSchema, type DatasetContext } from "./populate.js"; +import { InMemoryPopulateRecipeStore } from "./populate-self-healing.js"; +import { + runSelfHealingPopulate, + type PopulateDatasetRowWriter, + type RunSelfHealingPopulateResult, +} from "./populate-self-healing-runner.js"; +import { + createPopulateRecipeRuntime, + type CreatePopulateRecipeRuntimeInput, +} from "./populate-runtime-selection.js"; + +export interface PopulateSelfHealingCliOptions { + datasetId?: string; + contextPath?: string; + shouldReadStdin: boolean; + shouldCommitRows: boolean; + recipeStoreDirectory?: 
string; + maxRows?: number; +} + +export interface PopulateSelfHealingCliDependencies { + argv: string[]; + env: NodeJS.ProcessEnv; + readFileText?: (path: string) => Promise; + readStdinText?: () => Promise; + writeStdout?: (text: string) => void; + writeStderr?: (text: string) => void; + runSelfHealing?: typeof runSelfHealingPopulate; + createRuntime?: ( + input: CreatePopulateRecipeRuntimeInput + ) => Promise>>; + loadDatasetContextById?: (datasetId: string) => Promise; + createRowWriter?: () => Promise; +} + +export async function runPopulateSelfHealingCli( + input: PopulateSelfHealingCliDependencies +): Promise { + const writeStdout = input.writeStdout ?? ((text) => console.log(text)); + const writeStderr = input.writeStderr ?? ((text) => console.error(text)); + + try { + const options = parsePopulateSelfHealingCliArgs(input.argv); + const prerequisiteError = populateRuntimePrerequisiteError( + prerequisitesFromEnv({ + env: input.env, + shouldCommitRows: options.shouldCommitRows, + shouldLoadDatasetContext: Boolean(options.datasetId), + }) + ); + if (prerequisiteError) { + writeStdout(JSON.stringify({ + success: false, + error: prerequisiteError, + dryRun: !options.shouldCommitRows, + })); + return 1; + } + + const context = await resolveDatasetContext({ + options, + readFileText: input.readFileText ?? ((path) => readFile(path, "utf8")), + readStdinText: input.readStdinText ?? readProcessStdin, + loadDatasetContextById: + input.loadDatasetContextById ?? + ((datasetId) => defaultLoadDatasetContextById(datasetId, input.env)), + }); + const runtime = await (input.createRuntime ?? createPopulateRecipeRuntime)({ + env: input.env, + maxRows: options.maxRows, + }); + const rowWriter = options.shouldCommitRows + ? await (input.createRowWriter ?? defaultCreateRowWriter)() + : undefined; + const result = await (input.runSelfHealing ?? runSelfHealingPopulate)({ + context, + store: options.shouldCommitRows + ? 
undefined + : new InMemoryPopulateRecipeStore(), + recipeStoreDirectory: options.shouldCommitRows + ? options.recipeStoreDirectory ?? input.env.POPULATE_RECIPE_STORE_DIR + : undefined, + rowWriter, + shouldCommitRows: options.shouldCommitRows, + runtime, + }); + + writeStdout(JSON.stringify(summaryForResult(result, !options.shouldCommitRows))); + return result.success ? 0 : 2; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + writeStderr(message); + writeStdout(JSON.stringify({ success: false, error: message })); + return 1; + } +} + +export function parsePopulateSelfHealingCliArgs( + argv: string[] +): PopulateSelfHealingCliOptions { + const options: PopulateSelfHealingCliOptions = { + shouldReadStdin: false, + shouldCommitRows: false, + }; + const contextSources: string[] = []; + + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + if (arg === "--context" || arg === "--context-file") { + const value = argv[index + 1]; + if (!value) { + throw new Error(`${arg} requires a file path or "-".`); + } + options.contextPath = value; + options.shouldReadStdin = value === "-"; + contextSources.push(arg); + index += 1; + } else if (arg === "--stdin") { + options.shouldReadStdin = true; + options.contextPath = "-"; + contextSources.push(arg); + } else if (arg === "--dataset-id") { + const value = argv[index + 1]; + if (!value) { + throw new Error("--dataset-id requires a dataset id."); + } + options.datasetId = value; + contextSources.push(arg); + index += 1; + } else if (arg === "--commit") { + options.shouldCommitRows = true; + } else if (arg === "--recipe-store-dir") { + const value = argv[index + 1]; + if (!value) { + throw new Error("--recipe-store-dir requires a directory path."); + } + options.recipeStoreDirectory = value; + index += 1; + } else if (arg === "--max-rows") { + const value = argv[index + 1]; + const parsed = Number(value); + if (!Number.isInteger(parsed) || parsed <= 0) { + 
throw new Error("--max-rows requires a positive integer."); + } + options.maxRows = parsed; + index += 1; + } else { + throw new Error(`Unknown argument: ${arg}`); + } + } + + if (contextSources.length === 0) { + throw new Error("Missing --dataset-id , --context , or --stdin."); + } + if (contextSources.length > 1) { + throw new Error( + `Choose exactly one context source: ${contextSources.join(", ")}.` + ); + } + if (!options.shouldCommitRows && options.recipeStoreDirectory) { + throw new Error("--recipe-store-dir requires --commit."); + } + return options; +} + +async function resolveDatasetContext(input: { + options: PopulateSelfHealingCliOptions; + readFileText: (path: string) => Promise; + readStdinText: () => Promise; + loadDatasetContextById: (datasetId: string) => Promise; +}): Promise { + if (input.options.datasetId) { + return input.loadDatasetContextById(input.options.datasetId); + } + const text = input.options.shouldReadStdin + ? await input.readStdinText() + : await input.readFileText(input.options.contextPath!); + return datasetContextSchema.parse(JSON.parse(text)); +} + +function prerequisitesFromEnv(input: { + env: NodeJS.ProcessEnv; + shouldCommitRows: boolean; + shouldLoadDatasetContext: boolean; +}): PopulateRuntimePrerequisites { + return { + convexUrl: input.env.CONVEX_URL, + convexAdminKey: input.env.CONVEX_SELF_HOSTED_ADMIN_KEY, + openRouterApiKey: input.env.OPENROUTER_API_KEY, + tinyFishApiKey: input.env.TINYFISH_API_KEY, + shouldCommitRows: input.shouldCommitRows, + shouldLoadDatasetContext: input.shouldLoadDatasetContext, + }; +} + +async function defaultLoadDatasetContextById( + datasetId: string, + env: NodeJS.ProcessEnv +): Promise { + const { createConvexPopulateDatasetContextLoader } = await import( + "./populate-dataset-context-loader.js" + ); + const loader = createConvexPopulateDatasetContextLoader({ + convexUrl: env.CONVEX_URL!, + convexAdminKey: env.CONVEX_SELF_HOSTED_ADMIN_KEY!, + }); + return loader.loadContext(datasetId); +} 
+ +async function defaultCreateRowWriter(): Promise { + const { ConvexPopulateDatasetRowWriter } = await import( + "./populate-convex-writer.js" + ); + return new ConvexPopulateDatasetRowWriter(); +} + +function summaryForResult( + result: RunSelfHealingPopulateResult, + isDryRun: boolean +) { + const diagnosticRun = result.selectedRun ?? result.diagnosticRun; + return { + success: result.success, + dryRun: isDryRun, + action: result.action, + datasetId: result.datasetId, + committedRows: result.committedRows, + rowCount: diagnosticRun?.rows.length ?? 0, + validationIssues: result.validationIssues, + rejectionReasons: result.rejectionReasons, + productionValidation: diagnosticRun?.productionValidation, + metrics: diagnosticRun?.metrics, + }; +} + +async function readProcessStdin(): Promise { + let text = ""; + for await (const chunk of process.stdin) { + text += String(chunk); + } + return text; +} diff --git a/backend/src/pipeline/populate-self-healing-runner.ts b/backend/src/pipeline/populate-self-healing-runner.ts new file mode 100644 index 0000000..3e3347d --- /dev/null +++ b/backend/src/pipeline/populate-self-healing-runner.ts @@ -0,0 +1,128 @@ +import { join } from "node:path"; + +import type { DatasetContext } from "./populate.js"; +import { + DefaultPopulateRecipeAuthor, + FileSystemPopulateRecipeStore, + MastraPopulateRecipeRuntime, + SelfHealingPopulateRecipeService, + type PopulateRecipeAuthor, + type PopulateRecipeRunResult, + type PopulateRecipeRuntime, + type PopulateRecipeStore, + type SelfHealingPopulateTickResult, +} from "./populate-self-healing.js"; + +export interface PopulateDatasetRowWriter { + replaceRows(input: { + datasetId: string; + rows: PopulateRecipeRunResult["rows"]; + }): Promise; +} + +export interface PopulateDatasetWriteResult { + clearedRowCount?: number; + insertedRowCount: number; +} + +export interface RunSelfHealingPopulateInput { + context: DatasetContext; + store?: PopulateRecipeStore; + runtime?: PopulateRecipeRuntime; + 
author?: PopulateRecipeAuthor; + rowWriter?: PopulateDatasetRowWriter; + shouldCommitRows?: boolean; + recipeStoreDirectory?: string; +} + +export interface RunSelfHealingPopulateResult { + success: boolean; + action: SelfHealingPopulateTickResult["action"]; + datasetId: string; + selectedRun?: PopulateRecipeRunResult; + diagnosticRun?: PopulateRecipeRunResult; + committedRows?: PopulateDatasetWriteResult; + rejectionReasons: string[]; + validationIssues: string[]; + tick: SelfHealingPopulateTickResult; +} + +export async function runSelfHealingPopulate( + input: RunSelfHealingPopulateInput +): Promise { + if (input.shouldCommitRows && !input.rowWriter) { + throw new Error("rowWriter is required when shouldCommitRows is true."); + } + const rowWriter = input.rowWriter; + + const store = input.store ?? new FileSystemPopulateRecipeStore( + input.recipeStoreDirectory ?? defaultPopulateRecipeStoreDirectory() + ); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: input.runtime ?? new MastraPopulateRecipeRuntime(), + author: input.author ?? 
new DefaultPopulateRecipeAuthor(), + }); + const tick = await service.tick({ + datasetId: input.context.datasetId, + context: input.context, + }); + const selectedRun = successfulRunForTick(tick); + const diagnosticRun = diagnosticRunForTick(tick); + let committedRows: PopulateDatasetWriteResult | undefined; + + if (input.shouldCommitRows && selectedRun && rowWriter) { + committedRows = await rowWriter.replaceRows({ + datasetId: input.context.datasetId, + rows: selectedRun.rows, + }); + } + + return { + success: Boolean(selectedRun), + action: tick.action, + datasetId: input.context.datasetId, + selectedRun, + diagnosticRun, + committedRows, + rejectionReasons: tick.rejectionReasons, + validationIssues: validationIssuesForSelfHealingTick(tick), + tick, + }; +} + +export function successfulRunForTick( + tick: SelfHealingPopulateTickResult +): PopulateRecipeRunResult | undefined { + if (tick.action === "active_rerun_succeeded") { + return tick.activeRun; + } + if ( + tick.action === "generated_initial_recipe" || + tick.action === "repaired_active_recipe" + ) { + return tick.candidateRun; + } + return undefined; +} + +export function diagnosticRunForTick( + tick: SelfHealingPopulateTickResult +): PopulateRecipeRunResult | undefined { + return successfulRunForTick(tick) ?? tick.candidateRun ?? tick.activeRun; +} + +export function validationIssuesForSelfHealingTick( + tick: SelfHealingPopulateTickResult +): string[] { + const run = diagnosticRunForTick(tick); + return Array.from(new Set([ + ...(run?.validationIssues ?? []), + ...(run?.productionValidation.criticalIssues ?? 
[]), + ...tick.rejectionReasons, + ])); +} + +function defaultPopulateRecipeStoreDirectory(): string { + return join(process.cwd(), ".bigset", "populate-recipes"); +} diff --git a/backend/src/pipeline/populate-self-healing.ts b/backend/src/pipeline/populate-self-healing.ts new file mode 100644 index 0000000..b5f89e2 --- /dev/null +++ b/backend/src/pipeline/populate-self-healing.ts @@ -0,0 +1,948 @@ +import { mkdir, readFile, writeFile } from "node:fs/promises"; +import { join } from "node:path"; + +import { + type PopulateRuntimeAgentRunner, + type PopulateRuntimeResult, + type PopulateRuntimeRow, + type PopulateRuntimeWebTools, + runPopulateRuntime, +} from "./populate-runtime.js"; +import { + datasetContextSchema, + type DatasetContext, +} from "./populate.js"; + +export type PopulateRecipeStatus = + | "active" + | "candidate" + | "retired" + | "rejected"; + +export type PopulateRecipeRunStatus = "succeeded" | "failed"; + +export type PopulateRecipeArtifactKind = + | "text" + | "stderr" + | "source-transcript" + | "captured-rows"; + +const MAX_ARTIFACT_TEXT_LENGTH = 20_000; + +export interface PopulateRecipe { + recipeId: string; + datasetId: string; + version: number; + status: PopulateRecipeStatus; + runtimeInstructions: string; + sourceDescription: string; + requestedColumns: string[]; + createdAt: string; + createdBy: "agent" | "human" | "system"; + lastSuccessfulRunAt?: string; + lastValidationScore?: number; +} + +export interface PopulateRecipeArtifact { + kind: PopulateRecipeArtifactKind; + label: string; + content: string; +} + +export interface PopulateRecipeProductionValidation { + isValid: boolean; + score: number; + rowCount: number; + requestedCellCompletenessRatio: number; + sourceUrlCoverageRatio: number; + evidenceCoverageRatio: number; + expectedEntityCoverageRatio: number; + expectedEntities: string[]; + missingExpectedEntities: string[]; + criticalIssues: string[]; + warnings: string[]; +} + +export interface PopulateRecipeRunResult extends 
PopulateRuntimeResult { + recipeId: string; + recipeVersion: number; + runStatus: PopulateRecipeRunStatus; + startedAt: string; + completedAt: string; + runtimeMs: number; + productionValidation: PopulateRecipeProductionValidation; + artifacts: PopulateRecipeArtifact[]; +} + +export interface PopulateRecipeRuntime { + runRecipe(input: { + recipe: PopulateRecipe; + context: DatasetContext; + }): Promise; +} + +export interface PopulateRecipeAuthorGenerateInput { + context: DatasetContext; + nextVersion: number; +} + +export interface PopulateRecipeAuthorRepairInput + extends PopulateRecipeAuthorGenerateInput { + activeRecipe: PopulateRecipe; + failedRun: PopulateRecipeRunResult; +} + +export interface PopulateRecipeAuthor { + generateRecipe(input: PopulateRecipeAuthorGenerateInput): Promise; + repairRecipe(input: PopulateRecipeAuthorRepairInput): Promise; +} + +export interface StoredPopulateRecipeRunRecord { + recipeId: string; + recipeVersion: number; + runStatus: PopulateRecipeRunStatus; + completedAt: string; + productionValidation: PopulateRecipeProductionValidation; +} + +export interface PopulateRecipeStoreSnapshot { + datasetId: string; + recipes: PopulateRecipe[]; + runRecords: StoredPopulateRecipeRunRecord[]; +} + +export interface PopulateRecipeStore { + loadSnapshot(datasetId: string): Promise; + saveRecipe(recipe: PopulateRecipe): Promise; + saveRunResult(datasetId: string, runResult: PopulateRecipeRunResult): Promise; + getActiveRecipe(datasetId: string): Promise; +} + +export type SelfHealingPopulateAction = + | "active_rerun_succeeded" + | "generated_initial_recipe" + | "repaired_active_recipe" + | "candidate_rejected"; + +export interface SelfHealingPopulateTickResult { + datasetId: string; + action: SelfHealingPopulateAction; + activeRecipe?: PopulateRecipe; + candidateRecipe?: PopulateRecipe; + activeRun?: PopulateRecipeRunResult; + candidateRun?: PopulateRecipeRunResult; + rejectionReasons: string[]; +} + +export class MastraPopulateRecipeRuntime 
implements PopulateRecipeRuntime { + constructor( + private readonly input: { + runPopulate?: typeof runPopulateRuntime; + webTools?: PopulateRuntimeWebTools; + agentRunner?: PopulateRuntimeAgentRunner; + maxRows?: number; + } = {} + ) {} + + async runRecipe(input: { + recipe: PopulateRecipe; + context: DatasetContext; + }): Promise { + const startedAtMs = Date.now(); + const startedAt = new Date(startedAtMs).toISOString(); + const runtime = this.input.runPopulate ?? runPopulateRuntime; + const context = contextWithRecipeInstructions(input.context, input.recipe); + let result: PopulateRuntimeResult; + let failureMessage: string | undefined; + + try { + result = await runtime({ + context, + webTools: this.input.webTools, + agentRunner: this.input.agentRunner, + maxRows: this.input.maxRows, + }); + } catch (error) { + failureMessage = error instanceof Error ? error.message : String(error); + result = emptyPopulateRuntimeResult([failureMessage]); + } + + return populateRecipeRunResultFromRuntimeResult({ + recipe: input.recipe, + context: input.context, + result, + failureMessage, + startedAt, + startedAtMs, + }); + } +} + +export function populateRecipeRunResultFromRuntimeResult(input: { + recipe: PopulateRecipe; + context: DatasetContext; + result: PopulateRuntimeResult; + failureMessage?: string; + startedAt: string; + startedAtMs: number; +}): PopulateRecipeRunResult { + const productionValidation = validatePopulateRuntimeResult({ + result: input.result, + context: input.context, + }); + const artifacts = artifactsForRun({ + result: input.result, + failureMessage: input.failureMessage, + validationIssues: input.result.validationIssues, + productionValidation, + }); + const completedAt = new Date().toISOString(); + + return { + ...input.result, + recipeId: input.recipe.recipeId, + recipeVersion: input.recipe.version, + runStatus: productionValidation.isValid ? 
"succeeded" : "failed", + startedAt: input.startedAt, + completedAt, + runtimeMs: Date.now() - input.startedAtMs, + productionValidation, + artifacts, + }; +} + +export class DefaultPopulateRecipeAuthor implements PopulateRecipeAuthor { + async generateRecipe( + input: PopulateRecipeAuthorGenerateInput + ): Promise { + return createPopulateRecipe({ + recipeId: populateRecipeId(input.context.datasetId, input.nextVersion), + datasetId: input.context.datasetId, + version: input.nextVersion, + sourceDescription: input.context.description, + requestedColumns: requestedColumnNames(input.context), + runtimeInstructions: initialRuntimeInstructions(input.context), + createdBy: "system", + }); + } + + async repairRecipe( + input: PopulateRecipeAuthorRepairInput + ): Promise { + return createPopulateRecipe({ + recipeId: populateRecipeId(input.context.datasetId, input.nextVersion), + datasetId: input.context.datasetId, + version: input.nextVersion, + sourceDescription: input.context.description, + requestedColumns: requestedColumnNames(input.context), + runtimeInstructions: repairRuntimeInstructions(input), + createdBy: "system", + }); + } +} + +export class SelfHealingPopulateRecipeService { + constructor( + private readonly input: { + store: PopulateRecipeStore; + runtime: PopulateRecipeRuntime; + author: PopulateRecipeAuthor; + } + ) {} + + async tick(input: { + datasetId: string; + context: DatasetContext; + }): Promise { + const context = { + ...datasetContextSchema.parse(input.context), + datasetId: input.datasetId, + }; + const activeRecipe = await this.input.store.getActiveRecipe(input.datasetId); + + if (!activeRecipe) { + return this.generateInitialRecipe({ datasetId: input.datasetId, context }); + } + + const activeRun = await this.input.runtime.runRecipe({ + recipe: activeRecipe, + context, + }); + await this.input.store.saveRunResult(input.datasetId, activeRun); + + if (isHealthyRun(activeRun)) { + const updatedRecipe = successfulRecipe(activeRecipe, activeRun); + 
await this.input.store.saveRecipe(updatedRecipe); + return { + datasetId: input.datasetId, + action: "active_rerun_succeeded", + activeRecipe: updatedRecipe, + activeRun, + rejectionReasons: [], + }; + } + + const nextVersion = await this.nextVersion(input.datasetId); + const candidateRecipe = normalizeCandidateRecipe({ + recipe: await this.input.author.repairRecipe({ + context, + activeRecipe, + failedRun: activeRun, + nextVersion, + }), + datasetId: input.datasetId, + context, + version: nextVersion, + }); + const candidateRun = await this.runCandidate({ + recipe: candidateRecipe, + context, + datasetId: input.datasetId, + }); + + if (shouldPromoteCandidate({ activeRecipe, activeRun, candidateRun })) { + const retiredRecipe = { ...activeRecipe, status: "retired" as const }; + const promotedRecipe = successfulRecipe(candidateRecipe, candidateRun); + await this.input.store.saveRecipe(retiredRecipe); + await this.input.store.saveRecipe(promotedRecipe); + return { + datasetId: input.datasetId, + action: "repaired_active_recipe", + activeRecipe: promotedRecipe, + candidateRecipe, + activeRun, + candidateRun, + rejectionReasons: [], + }; + } + + const rejectedRecipe = { ...candidateRecipe, status: "rejected" as const }; + await this.input.store.saveRecipe(rejectedRecipe); + return { + datasetId: input.datasetId, + action: "candidate_rejected", + activeRecipe, + candidateRecipe: rejectedRecipe, + activeRun, + candidateRun, + rejectionReasons: rejectionReasonsForCandidate({ + activeRecipe, + activeRun, + candidateRun, + }), + }; + } + + private async generateInitialRecipe(input: { + datasetId: string; + context: DatasetContext; + }): Promise { + const nextVersion = await this.nextVersion(input.datasetId); + const candidateRecipe = normalizeCandidateRecipe({ + recipe: await this.input.author.generateRecipe({ + context: input.context, + nextVersion, + }), + datasetId: input.datasetId, + context: input.context, + version: nextVersion, + }); + const candidateRun = await 
this.runCandidate({ + recipe: candidateRecipe, + context: input.context, + datasetId: input.datasetId, + }); + + if (candidateRun.productionValidation.isValid) { + const activeRecipe = successfulRecipe(candidateRecipe, candidateRun); + await this.input.store.saveRecipe(activeRecipe); + return { + datasetId: input.datasetId, + action: "generated_initial_recipe", + activeRecipe, + candidateRecipe, + candidateRun, + rejectionReasons: [], + }; + } + + const rejectedRecipe = { ...candidateRecipe, status: "rejected" as const }; + await this.input.store.saveRecipe(rejectedRecipe); + return { + datasetId: input.datasetId, + action: "candidate_rejected", + candidateRecipe: rejectedRecipe, + candidateRun, + rejectionReasons: candidateRun.productionValidation.criticalIssues, + }; + } + + private async runCandidate(input: { + recipe: PopulateRecipe; + context: DatasetContext; + datasetId: string; + }): Promise { + await this.input.store.saveRecipe(input.recipe); + const runResult = await this.input.runtime.runRecipe({ + recipe: input.recipe, + context: input.context, + }); + await this.input.store.saveRunResult(input.datasetId, runResult); + return runResult; + } + + private async nextVersion(datasetId: string): Promise { + const snapshot = await this.input.store.loadSnapshot(datasetId); + return snapshot.recipes.reduce( + (version, recipe) => Math.max(version, recipe.version), + 0 + ) + 1; + } +} + +export class InMemoryPopulateRecipeStore implements PopulateRecipeStore { + private readonly snapshotsByDatasetId = new Map(); + + async loadSnapshot(datasetId: string): Promise { + return this.snapshotFor(datasetId); + } + + async saveRecipe(recipe: PopulateRecipe): Promise { + const snapshot = this.snapshotFor(recipe.datasetId); + const existingIndex = snapshot.recipes.findIndex( + (storedRecipe) => storedRecipe.recipeId === recipe.recipeId + ); + if (existingIndex >= 0) { + snapshot.recipes[existingIndex] = recipe; + } else { + snapshot.recipes.push(recipe); + } + 
snapshot.recipes.sort((left, right) => left.version - right.version); + } + + async saveRunResult( + datasetId: string, + runResult: PopulateRecipeRunResult + ): Promise { + this.snapshotFor(datasetId).runRecords.push(runRecordFromRunResult(runResult)); + } + + async getActiveRecipe(datasetId: string): Promise { + const snapshot = this.snapshotFor(datasetId); + return snapshot.recipes + .filter((recipe) => recipe.status === "active") + .sort((left, right) => right.version - left.version)[0]; + } + + private snapshotFor(datasetId: string): PopulateRecipeStoreSnapshot { + let snapshot = this.snapshotsByDatasetId.get(datasetId); + if (!snapshot) { + snapshot = { datasetId, recipes: [], runRecords: [] }; + this.snapshotsByDatasetId.set(datasetId, snapshot); + } + return snapshot; + } +} + +export class FileSystemPopulateRecipeStore implements PopulateRecipeStore { + constructor(private readonly rootDirectory: string) {} + + async loadSnapshot(datasetId: string): Promise { + try { + const manifestText = await readFile(this.manifestPath(datasetId), "utf8"); + const parsed = JSON.parse(manifestText) as PopulateRecipeStoreSnapshot; + return { + datasetId, + recipes: parsed.recipes ?? [], + runRecords: parsed.runRecords ?? 
[], + }; + } catch (error) { + if (isNodeError(error) && error.code === "ENOENT") { + return { datasetId, recipes: [], runRecords: [] }; + } + throw error; + } + } + + async saveRecipe(recipe: PopulateRecipe): Promise { + const snapshot = await this.loadSnapshot(recipe.datasetId); + const existingIndex = snapshot.recipes.findIndex( + (storedRecipe) => storedRecipe.recipeId === recipe.recipeId + ); + if (existingIndex >= 0) { + snapshot.recipes[existingIndex] = recipe; + } else { + snapshot.recipes.push(recipe); + } + snapshot.recipes.sort((left, right) => left.version - right.version); + await this.writeSnapshot(snapshot); + } + + async saveRunResult( + datasetId: string, + runResult: PopulateRecipeRunResult + ): Promise { + const snapshot = await this.loadSnapshot(datasetId); + snapshot.runRecords.push(runRecordFromRunResult(runResult)); + await this.writeSnapshot(snapshot); + } + + async getActiveRecipe(datasetId: string): Promise { + const snapshot = await this.loadSnapshot(datasetId); + return snapshot.recipes + .filter((recipe) => recipe.status === "active") + .sort((left, right) => right.version - left.version)[0]; + } + + private async writeSnapshot(snapshot: PopulateRecipeStoreSnapshot): Promise { + await mkdir(this.datasetDirectory(snapshot.datasetId), { recursive: true }); + await writeFile( + this.manifestPath(snapshot.datasetId), + `${JSON.stringify(snapshot, null, 2)}\n`, + "utf8" + ); + } + + private datasetDirectory(datasetId: string): string { + return join(this.rootDirectory, safePathSegment(datasetId)); + } + + private manifestPath(datasetId: string): string { + return join(this.datasetDirectory(datasetId), "manifest.json"); + } +} + +export function createPopulateRecipe(input: { + recipeId: string; + datasetId: string; + version: number; + sourceDescription: string; + requestedColumns: string[]; + runtimeInstructions?: string; + status?: PopulateRecipeStatus; + createdAt?: string; + createdBy?: PopulateRecipe["createdBy"]; +}): PopulateRecipe { + 
/**
 * Returns a copy of an authored recipe with the fields the service controls
 * overwritten: dataset id, version, `status: "candidate"`, and the description
 * and column names taken from the current dataset context. All other recipe
 * fields (including `recipeId`) are carried over unchanged.
 */
function normalizeCandidateRecipe(input: {
  recipe: PopulateRecipe;
  datasetId: string;
  context: DatasetContext;
  version: number;
}): PopulateRecipe {
  const { recipe, datasetId, context, version } = input;
  return {
    ...recipe,
    datasetId,
    version,
    status: "candidate",
    sourceDescription: context.description,
    requestedColumns: requestedColumnNames(context),
  };
}

/** Formats a recipe id as `<sanitized dataset id>-recipe-v<version>`. */
function populateRecipeId(datasetId: string, version: number): string {
  const datasetSegment = safePathSegment(datasetId);
  return `${datasetSegment}-recipe-v${version}`;
}

/** Lists the column names of the dataset context, in declaration order. */
function requestedColumnNames(context: DatasetContext): string[] {
  return context.columns.map(({ name }) => name);
}

/**
 * Produces the newline-separated instruction text used for a dataset's first
 * recipe: four fixed sourcing rules followed by the requested column list.
 */
function initialRuntimeInstructions(context: DatasetContext): string {
  const columnList = requestedColumnNames(context).join(", ");
  const instructionLines = [
    "Use search_web before fetch_page unless an official source URL is already obvious.",
    "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.",
    "Every inserted row must include source_url and evidence_quote cells when those columns exist.",
    "Every inserted row must include at least one source URL and one evidence quote.",
    `Requested columns: ${columnList}.`,
  ];
  return instructionLines.join("\n");
}
/**
 * Shortens `value` to at most `maxLength` characters.
 *
 * Strings that already fit are returned unchanged. Longer strings are cut and
 * end with the 12-character marker " [truncated]" so the total length equals
 * `maxLength`. When `maxLength` is smaller than the marker itself there is no
 * room for it, so the value is hard-cut to `maxLength` characters (an empty
 * string for zero or negative limits).
 *
 * @param value text to shorten
 * @param maxLength maximum length of the returned string
 * @returns `value`, or a shortened form no longer than `maxLength`
 */
function truncateInstruction(value: string, maxLength: number): string {
  const marker = " [truncated]";
  if (value.length <= maxLength) {
    return value;
  }
  if (maxLength < marker.length) {
    // A negative end index would make slice() count from the end of the
    // string and return far more than maxLength characters.
    return value.slice(0, Math.max(0, maxLength));
  }
  return `${value.slice(0, maxLength - marker.length)}${marker}`;
}

/**
 * Appends a recipe's runtime instructions to the dataset description.
 *
 * @param context dataset context whose description is extended
 * @param recipe recipe supplying `runtimeInstructions`
 * @returns the same `context` object when the instructions are blank;
 *   otherwise a shallow copy whose description is the original description,
 *   an empty line, the heading "Durable recipe instructions:" and the trimmed
 *   instructions, joined with newlines
 */
function contextWithRecipeInstructions(
  context: DatasetContext,
  recipe: PopulateRecipe
): DatasetContext {
  const instructions = recipe.runtimeInstructions.trim();
  if (!instructions) {
    return context;
  }
  return {
    ...context,
    description: [
      context.description,
      "",
      "Durable recipe instructions:",
      instructions,
    ].join("\n"),
  };
}
1 : 0) + ); + const criticalIssues = criticalIssuesForRows({ + rows: input.result.rows, + requestedColumns, + validationIssues: input.result.validationIssues, + missingExpectedEntities: entityCoverage.missingExpectedEntities, + }); + const scoreComponents = [ + requestedCellCompletenessRatio, + sourceUrlCoverageRatio, + evidenceCoverageRatio, + ]; + if (expectedEntities.length > 0) { + scoreComponents.push(entityCoverage.expectedEntityCoverageRatio); + } + const score = rowCount === 0 + ? 0 + : averageRatio(scoreComponents); + + return { + isValid: criticalIssues.length === 0, + score, + rowCount, + requestedCellCompletenessRatio, + sourceUrlCoverageRatio, + evidenceCoverageRatio, + expectedEntityCoverageRatio: entityCoverage.expectedEntityCoverageRatio, + expectedEntities, + missingExpectedEntities: entityCoverage.missingExpectedEntities, + criticalIssues, + warnings: input.result.validationIssues, + }; +} + +function criticalIssuesForRows(input: { + rows: PopulateRuntimeRow[]; + requestedColumns: string[]; + validationIssues: string[]; + missingExpectedEntities: string[]; +}): string[] { + const issues: string[] = []; + if (input.rows.length === 0) { + issues.push("Populate runtime returned no rows."); + } + if (input.missingExpectedEntities.length > 0) { + issues.push( + `Missing expected entities: ${input.missingExpectedEntities.join(", ")}.` + ); + } + input.rows.forEach((row, index) => { + const missingColumns = input.requestedColumns.filter( + (columnName) => isMissingCellValue(row.cells[columnName]) + ); + if (missingColumns.length > 0) { + issues.push(`Row ${index + 1} missing requested columns: ${missingColumns.join(", ")}.`); + } + if (row.sourceUrls.length === 0) { + issues.push(`Row ${index + 1} has no source URL.`); + } + if (row.evidence.length === 0) { + issues.push(`Row ${index + 1} has no evidence quote.`); + } + }); + input.validationIssues + .filter((issue) => + /failed|missing|no rows|not found|invented|invalid/i.test(issue) && + 
/**
 * Fraction of the requested columns for which the row holds a value, as judged
 * by `isMissingCellValue`. Returns 1 when no columns were requested.
 */
function cellCompletenessRatio(
  row: PopulateRuntimeRow,
  requestedColumns: string[]
): number {
  if (requestedColumns.length === 0) {
    return 1;
  }
  const filledCount = requestedColumns.filter(
    (columnName) => !isMissingCellValue(row.cells[columnName])
  ).length;
  return filledCount / requestedColumns.length;
}

/**
 * Heuristically extracts the entities a dataset description asks for.
 *
 * Takes the text after the first word "from" up to the next "?" or ".",
 * splits it on commas and the word "and", strips the articles "the", "a" and
 * "an", collapses whitespace, and keeps candidates that are 2-60 characters
 * long and contain at least one uppercase ASCII letter.
 *
 * @returns the distinct entities in first-seen order, or an empty array when
 *   there is no "from" clause or fewer than two distinct entities were found
 */
function expectedEntitiesFromContext(context: DatasetContext): string[] {
  const fromSegment = context.description.match(/\bfrom\s+([^?.]+)/i)?.[1];
  if (!fromSegment) {
    return [];
  }
  const candidates = fromSegment
    .split(/,|\band\b/i)
    .map((entity) => entity.replace(/\b(the|a|an)\b/gi, " ").trim())
    .map((entity) => entity.replace(/\s+/g, " "))
    .filter((entity) =>
      entity.length >= 2 &&
      entity.length <= 60 &&
      /[A-Z]/.test(entity)
    );
  // De-duplicate before applying the "at least two" threshold so a repeated
  // name cannot be reported as a one-element expected-entity list.
  const uniqueEntities = Array.from(new Set(candidates));
  return uniqueEntities.length >= 2 ? uniqueEntities : [];
}

/**
 * Checks which expected entities are mentioned by at least one row.
 *
 * An entity counts as covered when its lower-cased name is a substring of a
 * row's identity text (see `rowIdentityText`).
 *
 * @returns the covered share of expected entities, passed through
 *   `roundScore`, and the entities no row mentions; ratio 1 and no missing
 *   entities when nothing is expected
 */
function expectedEntityCoverage(input: {
  rows: PopulateRuntimeRow[];
  expectedEntities: string[];
}): {
  expectedEntityCoverageRatio: number;
  missingExpectedEntities: string[];
} {
  if (input.expectedEntities.length === 0) {
    return {
      expectedEntityCoverageRatio: 1,
      missingExpectedEntities: [],
    };
  }
  // Identity text depends only on the row, so build it once per row instead
  // of once per (entity, row) pair.
  const identityTexts = input.rows.map((row) => rowIdentityText(row));
  const missingExpectedEntities = input.expectedEntities.filter((entity) => {
    const needle = entity.toLowerCase();
    return !identityTexts.some((text) => text.includes(needle));
  });
  return {
    expectedEntityCoverageRatio: roundScore(
      (input.expectedEntities.length - missingExpectedEntities.length) /
        input.expectedEntities.length
    ),
    missingExpectedEntities,
  };
}

/**
 * Lower-cased, space-joined text of a row's naming cells: `entity_name`,
 * `company_name`, `provider_name`, `product_name` and `name`. Cells that are
 * not strings, or that are empty after trimming, are skipped.
 */
function rowIdentityText(row: PopulateRuntimeRow): string {
  return [
    row.cells.entity_name,
    row.cells.company_name,
    row.cells.provider_name,
    row.cells.product_name,
    row.cells.name,
  ]
    .filter((value) => typeof value === "string" && value.trim())
    .join(" ")
    .toLowerCase();
}
/**
 * Reports whether a cell holds no usable value: `undefined`, `null`, the empty
 * string, or a string made only of whitespace. Whitespace-only strings are
 * treated as missing so that a blank cell cannot satisfy a requested column;
 * `rowIdentityText` applies the same rule. Any other value, including `0` and
 * `false`, counts as present.
 */
function isMissingCellValue(value: unknown): boolean {
  if (value === undefined || value === null) {
    return true;
  }
  return typeof value === "string" && value.trim() === "";
}

/**
 * Arithmetic mean of `values`, rounded with `roundScore`.
 *
 * @returns 0 for an empty list
 */
function averageRatio(values: number[]): number {
  if (values.length === 0) {
    return 0;
  }
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return roundScore(total / values.length);
}

/** Rounds to three decimal places. */
function roundScore(value: number): number {
  return Math.round(value * 1_000) / 1_000;
}
/**
 * Creates a runtime result with no rows, zeroed usage and metrics counters,
 * and empty debug captures (`selectedRowSource` is "none").
 *
 * @param validationIssues issues to attach; the array is stored as given, not copied
 */
export function emptyPopulateRuntimeResult(validationIssues: string[]): PopulateRuntimeResult {
  const emptyResult: PopulateRuntimeResult = {
    rows: [],
    validationIssues,
    usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
    metrics: {
      searchCalls: 0,
      fetchCalls: 0,
      browserCalls: 0,
      agentRuns: 0,
      agentSteps: 0,
    },
    debug: {
      capturedRows: [],
      capturedSources: [],
      selectedRowSource: "none",
      notes: [],
    },
  };
  return emptyResult;
}

/** A run is healthy when its status is "succeeded" and its production validation passed. */
function isHealthyRun(runResult: PopulateRecipeRunResult): boolean {
  if (runResult.runStatus !== "succeeded") {
    return false;
  }
  return runResult.productionValidation.isValid;
}

/**
 * Decides whether a repaired candidate replaces the active recipe.
 *
 * The baseline is the active recipe's `lastValidationScore`, falling back to
 * the score of the active run that was just executed when no score is
 * recorded. The candidate is promoted when its validation passed and its score
 * is at least the baseline.
 */
function shouldPromoteCandidate(input: {
  activeRecipe: PopulateRecipe;
  activeRun: PopulateRecipeRunResult;
  candidateRun: PopulateRecipeRunResult;
}): boolean {
  const { activeRecipe, activeRun, candidateRun } = input;
  const baselineScore =
    activeRecipe.lastValidationScore ?? activeRun.productionValidation.score;
  const { isValid, score } = candidateRun.productionValidation;
  return isValid && score >= baselineScore;
}
/**
 * Returns a copy of `recipe` marked active, recording the run's completion
 * time as `lastSuccessfulRunAt` and its validation score as
 * `lastValidationScore`. The input recipe is not modified.
 */
function successfulRecipe(
  recipe: PopulateRecipe,
  runResult: PopulateRecipeRunResult
): PopulateRecipe {
  return {
    ...recipe,
    status: "active",
    lastSuccessfulRunAt: runResult.completedAt,
    lastValidationScore: runResult.productionValidation.score,
  };
}

/**
 * Reduces a run result to the record kept in a store snapshot: recipe id and
 * version, run status, completion time and production validation. Other run
 * fields are not copied.
 */
function runRecordFromRunResult(
  runResult: PopulateRecipeRunResult
): StoredPopulateRecipeRunRecord {
  return {
    recipeId: runResult.recipeId,
    recipeVersion: runResult.recipeVersion,
    runStatus: runResult.runStatus,
    completedAt: runResult.completedAt,
    productionValidation: runResult.productionValidation,
  };
}

/**
 * Converts an arbitrary identifier into a single, non-traversing path segment.
 *
 * Every character outside `[a-zA-Z0-9._-]` becomes "_". A result that is empty
 * or consists only of dots ("", ".", "..", ...) is replaced by underscores of
 * the same length (at least one), because joining such a segment onto a
 * directory would resolve to that directory or its parent instead of a child.
 *
 * @param value identifier to sanitize, e.g. a dataset id
 * @returns a non-empty segment containing only `[a-zA-Z0-9._-]`
 */
function safePathSegment(value: string): string {
  const sanitized = value.replace(/[^a-zA-Z0-9._-]/g, "_");
  if (sanitized === "" || /^\.+$/.test(sanitized)) {
    return "_".repeat(Math.max(1, sanitized.length));
  }
  return sanitized;
}

/**
 * Type guard for Node.js system errors. It only checks that `error` is an
 * `Error` instance carrying a `code` property; the value of `code` is not
 * inspected.
 */
function isNodeError(error: unknown): error is NodeJS.ErrnoException {
  return error instanceof Error && "code" in error;
}
datasetContextSchema } from "./pipeline/populate.js"; +import { populateRuntimePrerequisiteError } from "./pipeline/populate-runtime-prerequisites.js"; +import { + runSelfHealingPopulate, + type PopulateDatasetRowWriter, +} from "./pipeline/populate-self-healing-runner.js"; +import { + createPopulateRecipeRuntime, + type CreatePopulateRecipeRuntimeInput, +} from "./pipeline/populate-runtime-selection.js"; + +export interface BigSetServerEnv { + CLIENT_ORIGIN: string; + CONVEX_URL: string; + CONVEX_ADMIN_KEY?: string; + OPENROUTER_API_KEY?: string; + TINYFISH_API_KEY?: string; + POPULATE_RECIPE_STORE_DIR: string; +} + +export interface BigSetPopulateDataset { + ownerId: string; +} + +export interface CreateBigSetServerInput { + env: BigSetServerEnv; + authPlugin?: FastifyPluginAsync; + authPreHandler: ( + request: FastifyRequest, + reply: FastifyReply + ) => Promise | void; + getDatasetById: (datasetId: string) => Promise; + populateRowWriter: PopulateDatasetRowWriter; + runtimeEnv?: NodeJS.ProcessEnv; + inferSchemaFn?: typeof inferSchema; + runSelfHealing?: typeof runSelfHealingPopulate; + createRuntime?: ( + input: CreatePopulateRecipeRuntimeInput + ) => Promise; +} + +type CreatePopulateRecipeRuntimeResult = Awaited< + ReturnType +>; + +export async function createBigSetServer( + input: CreateBigSetServerInput +): Promise { + const fastify = Fastify({ logger: true }); + const inferSchemaForRequest = input.inferSchemaFn ?? inferSchema; + const runSelfHealing = input.runSelfHealing ?? runSelfHealingPopulate; + const createRuntime = input.createRuntime ?? 
createPopulateRecipeRuntime; + + await fastify.register(fastifyCors, { + origin: input.env.CLIENT_ORIGIN, + methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allowedHeaders: ["Content-Type", "Authorization", "Cookie"], + credentials: true, + maxAge: 86400, + }); + + if (input.authPlugin) { + await fastify.register(input.authPlugin); + } + + fastify.get("/health", async () => ({ status: "ok" })); + + await fastify.register(async (instance) => { + instance.addHook("preHandler", input.authPreHandler); + + instance.post("/infer-schema", async (req, reply) => { + const body = req.body as { prompt?: string }; + if (!body?.prompt || typeof body.prompt !== "string" || !body.prompt.trim()) { + return reply.code(400).send({ error: "prompt is required" }); + } + + try { + const schema = await inferSchemaForRequest(body.prompt.trim()); + return schema; + } catch (err) { + req.log.error(err, "Schema inference failed"); + return reply.code(502).send({ error: "Schema inference failed. Please try again." 
}); + } + }); + + instance.post("/populate", async (req, reply) => { + const parsed = datasetContextSchema.safeParse(req.body); + if (!parsed.success) { + return reply.code(400).send({ + error: "Invalid request", + details: parsed.error.flatten().fieldErrors, + }); + } + + try { + const dataset = await input.getDatasetById(parsed.data.datasetId); + if (!dataset) { + return reply.code(404).send({ error: "Dataset not found" }); + } + const authenticatedUserId = req.auth?.userId; + if (!authenticatedUserId) { + return reply.code(401).send({ error: "Unauthenticated" }); + } + if (dataset.ownerId !== authenticatedUserId) { + return reply.code(403).send({ error: "Not authorized to populate this dataset" }); + } + const prerequisiteError = populateRuntimePrerequisiteError({ + convexUrl: input.env.CONVEX_URL, + convexAdminKey: input.env.CONVEX_ADMIN_KEY, + openRouterApiKey: input.env.OPENROUTER_API_KEY, + tinyFishApiKey: input.env.TINYFISH_API_KEY, + }); + if (prerequisiteError) { + return reply.code(500).send({ + error: prerequisiteError, + }); + } + + const runtime = await createRuntime({ + env: input.runtimeEnv ?? process.env, + }); + const result = await runSelfHealing({ + context: parsed.data, + recipeStoreDirectory: input.env.POPULATE_RECIPE_STORE_DIR, + rowWriter: input.populateRowWriter, + shouldCommitRows: true, + runtime, + }); + + req.log.info({ + action: result.action, + datasetId: result.datasetId, + committedRows: result.committedRows?.insertedRowCount ?? 0, + validationIssues: result.validationIssues.slice(0, 5), + }, "Self-healing populate completed"); + + if (!result.success) { + return reply.code(422).send({ + error: "Self-healing populate failed validation.", + result: responseSafePopulateResult(result), + }); + } + + return { + success: true, + result: responseSafePopulateResult(result), + }; + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + if (msg.includes("validator") || msg.includes("Invalid")) { + return reply.code(400).send({ error: "Invalid datasetId" }); + } + req.log.error(err, "Populate failed"); + return reply.code(502).send({ error: "Failed to populate dataset. Please try again." }); + } + }); + }); + + return fastify; +} + +function responseSafePopulateResult( + result: Awaited> +) { + const diagnosticRun = result.selectedRun ?? result.diagnosticRun; + return { + action: result.action, + datasetId: result.datasetId, + success: result.success, + committedRows: result.committedRows, + rejectionReasons: result.rejectionReasons, + validationIssues: result.validationIssues, + productionValidation: diagnosticRun?.productionValidation, + metrics: diagnosticRun?.metrics, + rowCount: diagnosticRun?.rows.length ?? 0, + }; +} diff --git a/backend/test/collection-agent-runner.test.ts b/backend/test/collection-agent-runner.test.ts new file mode 100644 index 0000000..1b88c6e --- /dev/null +++ b/backend/test/collection-agent-runner.test.ts @@ -0,0 +1,282 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { runCollectionPopulatePipeline } from "../src/pipeline/collection-agent-runner.js"; + +test("collection agent runner maps vendored pipeline output into populate runtime result", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + delete process.env.AGENT_POLL_TIMEOUT_MS; + delete process.env.COLLECTION_AGENT_ENABLE_AGENT; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedCalls: [{ agentEnabled: false }], + }); + try { + const result = await runCollectionPopulatePipeline(collectionPipelineInput()); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells.entity_name, "OpenAI"); + 
assert.equal(result.rows[0]?.cells.evidence_quote, "technical operator"); + assert.deepEqual(result.rows[0]?.sourceUrls, ["https://openai.com/news"]); + assert.equal(result.rows[0]?.evidence[0]?.columnName, "entity_name"); + assert.equal(result.rows[0]?.needsReview, true); + assert.deepEqual(result.validationIssues, []); + assert.deepEqual(result.usage, { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }); + assert.equal(result.metrics.searchCalls, 2); + assert.equal(result.metrics.fetchCalls, 3); + assert.equal(result.metrics.browserCalls, 3); + assert.equal(result.metrics.agentRuns, 3); + assert.equal(result.metrics.agentSteps, 3); + } finally { + restoreEnv(previousEnv); + } +}); + +test("collection agent runner requires explicit Agent opt-in and caps poll timeout per warm process call", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + delete process.env.AGENT_POLL_TIMEOUT_MS; + delete process.env.COLLECTION_AGENT_ENABLE_AGENT; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedModuleLoadPollTimeoutMs: null, + expectedCalls: [ + { agentEnabled: false }, + { agentEnabled: true, pollTimeoutMs: 12345 }, + { agentEnabled: true, pollTimeoutMs: 23456 }, + ], + }); + + try { + assert.equal( + (await runCollectionPopulatePipeline(collectionPipelineInput())).rows.length, + 1 + ); + + process.env.COLLECTION_AGENT_ENABLE_AGENT = "true"; + process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS = "12345"; + assert.equal( + (await runCollectionPopulatePipeline(collectionPipelineInput())).rows.length, + 1 + ); + + process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS = "23456"; + assert.equal( + (await runCollectionPopulatePipeline(collectionPipelineInput())).rows.length, + 1 + ); + } finally { + restoreEnv(previousEnv); + } +}); + 
+test("collection agent runner surfaces Agent-required capability diagnostics from source outcomes", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + delete process.env.AGENT_POLL_TIMEOUT_MS; + delete process.env.COLLECTION_AGENT_ENABLE_AGENT; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedCalls: [{ agentEnabled: false }], + sources: { + outcomes: [ + { + outcome: "agent_deferred", + triage_status: "requires_navigation", + }, + { + outcome: "no_records", + triage_status: "requires_form_submission", + }, + { + outcome: "success", + triage_status: "requires_detail_page_followup", + }, + ], + }, + }); + + try { + const result = await runCollectionPopulatePipeline(collectionPipelineInput()); + const diagnostic = result.validationIssues.join("\n"); + + assert.equal(result.rows.length, 1); + assert.match(diagnostic, /Capability diagnostic: TinyFish Agent disabled/); + assert.match(diagnostic, /2 page\(s\)/); + assert.match(diagnostic, /requires_navigation=1/); + assert.match(diagnostic, /requires_form_submission=1/); + assert.doesNotMatch( + diagnostic, + /failed|missing|no rows|not found|invented|invalid/i + ); + } finally { + restoreEnv(previousEnv); + } +}); + +function collectionPipelineInput() { + return { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest AI blog posts.", + columns: [ + { name: "entity_name", type: "text" as const }, + { name: "source_url", type: "url" as const }, + { name: "evidence_quote", type: "text" as const }, + ], + requiredColumns: ["entity_name", "source_url", "evidence_quote"], + prompt: [ + "Dataset: AI posts", + "Task: Find latest AI blog posts.", + "", + "Durable recipe instructions:", + "Prefer official source pages.", + ].join("\n"), + 
recipeInstructions: "Prefer official source pages.", + targetRows: 3, + promptId: "latest-ai-blog-posts", + promptQuality: "easy", + persona: "technical operator", + expectedStress: "Latest dated source pages.", + }; +} + +function fakeCollectionPipelineModuleUrl(input: { + expectedModuleLoadPollTimeoutMs?: string | null; + expectedCalls: Array<{ + agentEnabled: boolean; + pollTimeoutMs?: number; + }>; + sources?: unknown; +}): string { + const source = ` + const moduleLoadPollTimeoutMs = process.env.AGENT_POLL_TIMEOUT_MS ?? null; + const expectedModuleLoadPollTimeoutMs = ${JSON.stringify(input.expectedModuleLoadPollTimeoutMs ?? null)}; + const expectedCalls = ${JSON.stringify(input.expectedCalls)}; + let callIndex = 0; + + export async function runPipeline(options) { + if (moduleLoadPollTimeoutMs !== expectedModuleLoadPollTimeoutMs) { + throw new Error("unexpected module-load poll timeout"); + } + const expected = expectedCalls[callIndex++]; + if (!expected) { + throw new Error("unexpected extra pipeline call"); + } + if (options.enableTinyfishAgent !== expected.agentEnabled) { + throw new Error("unexpected TinyFish Agent setting"); + } + if ((options.agentPollTimeoutMs ?? null) !== (expected.pollTimeoutMs ?? 
null)) { + throw new Error("bounded agent poll timeout missing"); + } + if (!options.prompt.includes("Durable recipe instructions")) { + throw new Error("recipe instructions missing from prompt"); + } + if (!options.memoryDir || !options.memoryDir.includes("memory")) { + throw new Error("isolated memory dir missing"); + } + if (options.benchmark?.promptId !== "latest-ai-blog-posts") { + throw new Error("prompt id missing from benchmark context"); + } + if (options.benchmark?.persona !== "technical operator") { + throw new Error("persona missing from benchmark context"); + } + if (options.benchmark?.requiredColumns?.join(",") !== "entity_name,source_url,evidence_quote") { + throw new Error("required columns missing from benchmark context"); + } + return { + report: { + errors: [], + dataset_spec: { + columns: [{ name: "entity_name" }], + dedupe_keys: ["entity_name"], + }, + stats: { + search_queries_executed: 2, + pages_fetched: 3, + triage: { + agent_dispatched: 1, + agent_succeeded: 1, + agent_failed: 0, + }, + }, + initial: { + triage: { + agent_dispatched: 1, + agent_succeeded: 1, + agent_failed: 0, + }, + }, + repair: { + stats: { + triage: { + agent_dispatched: 2, + agent_succeeded: 1, + agent_failed: 1, + }, + }, + }, + quality: { + records: [{ record_id: "pk:openai", needs_review: true }], + }, + sources: ${JSON.stringify(input.sources ?? 
{ outcomes: [] })}, + llm_usage: { + prompt_tokens: 1, + completion_tokens: 1, + total_tokens: 2, + }, + }, + records: [{ + row: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + evidence_quote: options.benchmark.persona, + }, + source_urls: ["https://openai.com/news"], + evidence: [{ + field: "entity_name", + url: "https://openai.com/news", + quote: options.benchmark.expectedStress, + }], + }], + llmUsage: { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }, + }; + } + `; + return `data:text/javascript,${encodeURIComponent(source)}`; +} + +function snapshotEnv(names: string[]): Map<string, string | undefined> { + return new Map(names.map((name) => [name, process.env[name]])); +} + +function restoreEnv(snapshot: Map<string, string | undefined>): void { + for (const [name, value] of snapshot) { + if (value === undefined) { + delete process.env[name]; + } else { + process.env[name] = value; + } + } +} diff --git a/backend/test/collection-extract-finalize.test.ts b/backend/test/collection-extract-finalize.test.ts new file mode 100644 index 0000000..0e6e733 --- /dev/null +++ b/backend/test/collection-extract-finalize.test.ts @@ -0,0 +1,130 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { finalizeExtractedRecord } from "../BigSet_Data_Collection_Agent/src/agents/extract.js"; +import type { DatasetSpec } from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +const docsSpec: DatasetSpec = { + intent_summary: "Official docs pages.", + target_row_count: 1, + row_grain: "one row per docs page", + columns: [ + { + name: "entity_name", + type: "string", + description: "Vendor name.", + required: true, + }, + { + name: "docs_url", + type: "string", + description: "Official docs URL.", + required: true, + }, + { + name: "summary", + type: "string", + description: "What the page covers.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["Cloudflare MCP docs"], + extraction_hints: "Prefer official docs pages.", +}; + 
+test("collection extraction adds URL cell evidence when model omits evidence", () => { + const record = finalizeExtractedRecord( + { + row: { + entity_name: "Cloudflare", + docs_url: "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + summary: "Remote MCP server docs.", + }, + evidence: [], + extraction_confidence: 0.8, + }, + "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + docsSpec, + ); + + assert.deepEqual(record.evidence, [ + { + field: "docs_url", + url: "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + quote: "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + }, + ]); + assert.deepEqual(record.source_urls, [ + "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + ]); +}); + +test("collection extraction treats official website cells as source URLs", () => { + const spec: DatasetSpec = { + intent_summary: "Official company websites.", + target_row_count: 1, + row_grain: "one row per company", + columns: [ + { + name: "entity_name", + type: "string", + description: "Company name.", + required: true, + }, + { + name: "official_website", + type: "string", + description: "Official website URL.", + required: true, + }, + { + name: "description", + type: "string", + description: "Company description.", + required: true, + }, + { + name: "source_url", + type: "string", + description: "Where the row facts were found.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["Vietnam fintech official websites"], + extraction_hints: "Prefer official company websites.", + }; + + const record = finalizeExtractedRecord( + { + row: { + entity_name: "MoMo", + official_website: "https://momo.vn", + description: "Vietnamese fintech wallet.", + source_url: "https://www.startupblink.com/top-startups/vietnam", + }, + evidence: [ + { + field: "description", + quote: "MoMo is a FinTech startup.", + }, + ], + extraction_confidence: 0.8, + }, + 
"https://www.startupblink.com/top-startups/vietnam", + spec, + ); + + assert.deepEqual(record.source_urls, [ + "https://www.startupblink.com/top-startups/vietnam", + "https://momo.vn", + ]); + assert.ok( + record.evidence.some((item) => + item.field === "official_website" && + item.url === "https://momo.vn" && + item.quote === "https://momo.vn" + ), + ); +}); diff --git a/backend/test/collection-record-merge.test.ts b/backend/test/collection-record-merge.test.ts new file mode 100644 index 0000000..93d205f --- /dev/null +++ b/backend/test/collection-record-merge.test.ts @@ -0,0 +1,583 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + mergePair, + mergeRecords, +} from "../BigSet_Data_Collection_Agent/src/merge/records.js"; +import type { + DatasetSpec, + ExtractedRecord, +} from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +const docsSpec: DatasetSpec = { + intent_summary: "Official MCP docs pages.", + target_row_count: 3, + row_grain: "one row per vendor", + columns: [ + { + name: "entity_name", + type: "string", + description: "Vendor name.", + required: true, + }, + { + name: "docs_title", + type: "string", + description: "Docs page title.", + required: true, + }, + { + name: "docs_url", + type: "string", + description: "Official docs page URL.", + required: true, + }, + { + name: "summary", + type: "string", + description: "What the page covers.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["MCP docs"], + extraction_hints: "Prefer official docs pages.", +}; + +const earningsSpec: DatasetSpec = { + intent_summary: "Latest earnings releases.", + target_row_count: 3, + row_grain: "one row per company", + columns: [ + { + name: "entity_name", + type: "string", + description: "Company name.", + required: true, + }, + { + name: "release_date", + type: "date", + description: "Release date.", + required: true, + }, + { + name: "fiscal_quarter", + type: "string", + description: 
"Fiscal quarter.", + required: true, + }, + { + name: "source_url", + type: "string", + description: "Official earnings release source URL.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["latest earnings releases"], + extraction_hints: "Prefer official dated earnings release pages.", +}; + +test("collection record merge does not attach evidence from conflicting duplicate rows", () => { + const officialRecord = record({ + row: { + entity_name: "Cloudflare", + docs_title: "Connect to an MCP server", + docs_url: "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + summary: "Official docs for connecting an MCP client.", + }, + evidence: [ + evidence( + "summary", + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + "Connect to an MCP server." + ), + ], + sourceUrls: [ + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + ], + }); + const blogRecord = record({ + row: { + entity_name: "Cloudflare", + docs_title: "Code Mode: the better way to use MCP", + docs_url: "https://blog.cloudflare.com/code-mode/", + summary: "Blog post about code mode.", + }, + evidence: [ + evidence( + "docs_title", + "https://blog.cloudflare.com/code-mode/", + "Code Mode: the better way to use MCP" + ), + evidence( + "docs_url", + "https://blog.cloudflare.com/code-mode/", + "https://blog.cloudflare.com/code-mode/" + ), + ], + sourceUrls: ["https://blog.cloudflare.com/code-mode/"], + }); + + const merged = mergePair(officialRecord, blogRecord, docsSpec); + + assert.equal( + merged.row.docs_url, + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/" + ); + assert.deepEqual( + merged.evidence.map((item) => item.url), + ["https://developers.cloudflare.com/agents/guides/connect-mcp-client/"] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + ]); +}); + +test("collection record merge keeps incoming evidence when it fills 
a missing field", () => { + const partialRecord = record({ + row: { + entity_name: "OpenAI", + docs_title: "MCP and Connectors", + docs_url: null, + summary: "OpenAI MCP docs.", + }, + evidence: [ + evidence( + "summary", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + "remote MCP servers and connectors" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ], + }); + const urlRecord = record({ + row: { + entity_name: "OpenAI", + docs_title: "MCP and Connectors", + docs_url: "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + summary: null, + }, + evidence: [ + evidence( + "docs_url", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ], + }); + + const merged = mergePair(partialRecord, urlRecord, docsSpec); + + assert.equal( + merged.row.docs_url, + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp" + ); + assert.deepEqual( + merged.evidence.map((item) => item.field), + ["summary", "docs_url"] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ]); +}); + +test("collection record merge keeps same-value supplemental evidence", () => { + const merged = mergeRecords(docsSpec, [ + record({ + row: { + entity_name: "Anthropic", + docs_title: "Model Context Protocol connector", + docs_url: "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + summary: "Connector docs.", + }, + evidence: [ + evidence( + "summary", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "MCP connector" + ), + ], + sourceUrls: [ + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + ], + }), + record({ + row: { + entity_name: "Anthropic", + docs_title: "Model Context Protocol 
connector", + docs_url: "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + summary: "Connector docs.", + }, + evidence: [ + evidence( + "docs_title", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "Model Context Protocol connector" + ), + ], + sourceUrls: [ + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + ], + }), + ]).records; + + assert.equal(merged.length, 1); + assert.deepEqual( + merged[0]?.evidence.map((item) => item.field), + ["summary", "docs_title"] + ); +}); + +test("collection record merge replaces weak docs URLs with stronger docs surfaces", () => { + const merged = mergePair( + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Code Mode: the better way to use MCP", + docs_url: "https://blog.cloudflare.com/code-mode/", + summary: "Blog post about MCP code mode.", + }, + evidence: [ + evidence( + "docs_url", + "https://blog.cloudflare.com/code-mode/", + "https://blog.cloudflare.com/code-mode/" + ), + ], + sourceUrls: ["https://blog.cloudflare.com/code-mode/"], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Model Context Protocol", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + summary: "Official docs for Cloudflare MCP servers.", + }, + evidence: [ + evidence( + "docs_title", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "Model Context Protocol" + ), + evidence( + "docs_url", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/" + ), + evidence( + "summary", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "MCP servers" + ), + ], + sourceUrls: [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + }), + docsSpec, + ); + + assert.equal( + merged.row.docs_url, + "https://developers.cloudflare.com/agents/model-context-protocol/" + ); + 
assert.equal(merged.row.docs_title, "Model Context Protocol"); + assert.equal(merged.row.summary, "Official docs for Cloudflare MCP servers."); + assert.deepEqual( + merged.evidence.map((item) => item.field), + ["docs_title", "docs_url", "summary"] + ); + assert.deepEqual( + merged.evidence.map((item) => item.url), + [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/", + ] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ]); +}); + +test("collection record merge drops docs URL evidence from unrelated source pages", () => { + const merged = mergePair( + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Docs for agents", + docs_url: null, + summary: null, + }, + evidence: [], + sourceUrls: [], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Model Context Protocol", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + summary: "Official docs for Cloudflare MCP servers.", + }, + evidence: [ + evidence( + "docs_url", + "https://developers.openai.com/api/docs", + "https://developers.cloudflare.com/agents/model-context-protocol/" + ), + evidence( + "summary", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "MCP servers" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs", + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + }), + docsSpec, + ); + + assert.equal( + merged.row.docs_url, + "https://developers.cloudflare.com/agents/model-context-protocol/" + ); + assert.deepEqual( + merged.evidence.map((item) => item.field), + ["summary"] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ]); +}); + +test("collection record merge folds corporate suffix variants 
and prefers stronger source pages", () => { + const merged = mergeRecords(earningsSpec, [ + record({ + row: { + entity_name: "Nvidia", + release_date: "2026-02-25", + fiscal_quarter: "Q4 Fiscal 2026", + source_url: "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + }, + evidence: [ + evidence( + "release_date", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + "February 25, 2026", + ), + evidence( + "fiscal_quarter", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + "fourth quarter fiscal 2026", + ), + evidence( + "source_url", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + ), + ], + sourceUrls: [ + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + ], + }), + record({ + row: { + entity_name: "NVIDIA Corporation", + release_date: "2026-05-20", + fiscal_quarter: "FY27 Q1", + source_url: "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + }, + evidence: [ + evidence( + "release_date", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + "May 20, 2026", + ), + evidence( + "fiscal_quarter", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + "first quarter fiscal 2027", + ), + evidence( + "source_url", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ), + ], + sourceUrls: [ + 
"https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ], + }), + ]).records; + + assert.equal(merged.length, 1); + assert.equal(merged[0]?.row.entity_name, "Nvidia"); + assert.equal(merged[0]?.row.fiscal_quarter, "FY27 Q1"); + assert.equal( + merged[0]?.row.source_url, + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ); + assert.deepEqual(merged[0]?.source_urls, [ + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ]); +}); + +test("collection record merge fixture reaches benchmark-equivalent domain coverage", () => { + const merged = mergeRecords(docsSpec, [ + record({ + row: { + entity_name: "OpenAI", + docs_title: "MCP and Connectors", + docs_url: "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + summary: "OpenAI MCP docs.", + }, + evidence: [ + evidence( + "summary", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + "remote MCP servers and connectors" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ], + }), + record({ + row: { + entity_name: "Anthropic", + docs_title: "Introduction to Model Context Protocol", + docs_url: "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + summary: "Anthropic MCP course.", + }, + evidence: [ + evidence( + "summary", + "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + "course provides comprehensive coverage" + ), + ], + sourceUrls: [ + "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + ], + }), + record({ + row: { + entity_name: "Anthropic", + docs_title: "MCP connector", + docs_url: "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + summary: "Anthropic MCP connector docs.", + }, + evidence: [ + evidence( + "docs_url", + 
"https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector" + ), + ], + sourceUrls: [ + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + ], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Code Mode", + docs_url: "https://blog.cloudflare.com/code-mode/", + summary: "Cloudflare MCP blog post.", + }, + evidence: [ + evidence( + "summary", + "https://blog.cloudflare.com/code-mode/", + "Cloudflare Agents SDK" + ), + ], + sourceUrls: ["https://blog.cloudflare.com/code-mode/"], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Model Context Protocol", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + summary: "Cloudflare MCP docs.", + }, + evidence: [ + evidence( + "docs_url", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/" + ), + ], + sourceUrls: [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + }), + ]).records; + + assert.equal(merged.length, 3); + assert.equal( + merged.find((item) => item.row.entity_name === "Anthropic")?.row.docs_url, + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector" + ); + assert.equal( + merged.find((item) => item.row.entity_name === "Cloudflare")?.row.docs_url, + "https://developers.cloudflare.com/agents/model-context-protocol/" + ); + assert.equal( + domainCoverage(merged, { + OpenAI: ["developers.openai.com", "platform.openai.com", "openai.com"], + Anthropic: ["docs.anthropic.com"], + Cloudflare: ["developers.cloudflare.com"], + }), + 1, + ); +}); + +function evidence(field: string, url: string, quote: string) { + return { field, url, quote }; +} + +function record(input: { + row: ExtractedRecord["row"]; + evidence: ExtractedRecord["evidence"]; + sourceUrls: string[]; +}): ExtractedRecord { + return { + row: input.row, + evidence: 
input.evidence, + source_urls: input.sourceUrls, + extraction_confidence: 0.9, + }; +} + +function domainCoverage( + records: ExtractedRecord[], + allowedDomainsByEntity: Record<string, string[]>, +): number { + const matched = records.filter((record) => { + const entity = String(record.row.entity_name ?? ""); + const allowedDomains = allowedDomainsByEntity[entity] ?? []; + return record.source_urls.some((url) => + allowedDomains.some((domain) => hostname(url).endsWith(domain)), + ); + }); + return matched.length / records.length; +} + +function hostname(url: string): string { + try { + return new URL(url).hostname.replace(/^www\./, ""); + } catch { + return ""; + } +} diff --git a/backend/test/collection-source-policy.test.ts b/backend/test/collection-source-policy.test.ts new file mode 100644 index 0000000..48b6ac2 --- /dev/null +++ b/backend/test/collection-source-policy.test.ts @@ -0,0 +1,312 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + applyPromptSourcePolicyToSpec, + applyPromptSourcePolicyToTriageResult, + derivePromptSourcePolicy, + promptSourceSearchQueries, + recordMatchesPromptSourcePolicy, + sourceCandidatePolicyBoost, + urlMatchesPromptSourcePolicy, +} from "../BigSet_Data_Collection_Agent/src/agents/source-policy.js"; +import type { + DatasetSpec, + ExtractedRecord, + SourceCandidate, + SourceTriageResult, +} from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +test("prompt source policy derives official queries from the user's prompt", () => { + const policy = derivePromptSourcePolicy( + "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and the plan names or starting prices shown on the page.", + ); + + assert.equal(policy.requiresOfficialSource, true); + assert.deepEqual( + policy.entities.map((entity) => entity.name), + ["Stripe", "Paddle", "Chargebee"], + ); + assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 3), [ + "Stripe official pricing page", + "Stripe billing 
pricing", + "Paddle official pricing page", + ]); +}); + +test("prompt source policy ignores generic durable recipe source wording", () => { + const policy = derivePromptSourcePolicy( + [ + "Dataset: benchmark_latest-ai-blog-posts", + "Task: Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind? I need title, publish date, and URL.", + "", + "Durable recipe instructions:", + "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.", + ].join("\n"), + ); + + const queries = promptSourceSearchQueries(policy); + + assert.deepEqual(queries, [ + "OpenAI official blog latest post", + "Anthropic official blog latest post", + "Google DeepMind official blog latest post", + ]); +}); + +test("prompt source policy adds official-source guidance without benchmark answer keys", () => { + const spec: DatasetSpec = { + intent_summary: "Collect pricing pages.", + target_row_count: 3, + row_grain: "one row per company", + columns: [ + { + name: "entity_name", + type: "string", + description: "Company.", + required: true, + }, + { + name: "pricing_page_url", + type: "string", + description: "Official pricing URL.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["SaaS pricing pages"], + extraction_hints: "Extract plan names.", + }; + + const updated = applyPromptSourcePolicyToSpec( + spec, + "For Stripe and Paddle, collect the official pricing page URL.", + ); + + assert.equal(updated.search_queries[0], "Stripe official pricing page"); + assert.equal(updated.search_queries[1], "Stripe billing pricing"); + assert.equal(updated.search_queries[2], "Paddle official pricing page"); + assert.match(updated.extraction_hints, /Prompt source policy/); + assert.match(updated.extraction_hints, /Stripe, Paddle/); +}); + +test("prompt source policy prefers entity-owned domains over third-party proof", () => { + const policy = derivePromptSourcePolicy( + "Find the latest investor relations 
earnings release page for Apple, Microsoft, and Nvidia.", + ); + + assert.equal( + urlMatchesPromptSourcePolicy("https://investor.apple.com/newsroom/", policy), + true, + ); + assert.equal( + urlMatchesPromptSourcePolicy("https://finance.yahoo.com/quote/AAPL", policy), + false, + ); + assert.equal( + urlMatchesPromptSourcePolicy("https://cloud.google.com/blog/topics/threat-intelligence", { + ...derivePromptSourcePolicy( + "Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind?", + ), + }), + false, + ); + assert.equal( + urlMatchesPromptSourcePolicy( + "https://openai.github.io/openai-agents-python/mcp/", + derivePromptSourcePolicy( + "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.", + ), + ), + false, + ); +}); + +test("prompt source policy downgrades third-party extraction triage", () => { + const policy = derivePromptSourcePolicy( + "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and plan names.", + ); + const triage: SourceTriageResult = { + url: "https://www.trustradius.com/products/paddle/pricing", + final_url: "https://www.trustradius.com/products/paddle/pricing", + title: "Paddle Pricing", + status: "extract_now", + confidence: 0.9, + source_data_confidence: 0.8, + expected_yield: "complete", + reasoning: "Page lists pricing information.", + }; + + const updated = applyPromptSourcePolicyToTriageResult(triage, policy); + + assert.equal(updated.status, "low_value"); + assert.equal(updated.expected_yield, "none"); + assert.match(updated.reasoning, /official\/canonical sources/); +}); + +test("prompt source policy boosts official candidates", () => { + const policy = derivePromptSourcePolicy( + [ + "Dataset: benchmark_mcp-docs-pages", + "Task: I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare. 
Give me title, URL, and what each page covers.", + "", + "Durable recipe instructions:", + "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.", + ].join("\n"), + ); + assert.deepEqual( + policy.entities.map((entity) => entity.name), + ["Anthropic", "OpenAI", "Cloudflare"], + ); + assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 4), [ + "Anthropic MCP connector docs site:platform.claude.com", + "OpenAI MCP connector docs site:developers.openai.com", + "Cloudflare MCP connector docs site:developers.cloudflare.com", + "Anthropic MCP connector docs", + ]); + const official: SourceCandidate = { + url: "https://developers.cloudflare.com/agents/model-context-protocol/", + title: "MCP servers", + snippet: "Official Cloudflare docs for MCP server setup.", + query: "Cloudflare official docs MCP server setup", + }; + const thirdParty: SourceCandidate = { + url: "https://example.com/cloudflare-mcp-guide", + title: "Cloudflare MCP guide", + snippet: "A blog guide to Cloudflare MCP.", + query: "Cloudflare official docs MCP server setup", + }; + + assert.ok( + sourceCandidatePolicyBoost(official, policy) > + sourceCandidatePolicyBoost(thirdParty, policy), + ); +}); + +test("prompt source policy prefers docs surfaces over blogs, courses, and directories", () => { + const policy = derivePromptSourcePolicy( + "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.", + ); + const docs: SourceCandidate = { + url: "https://platform.claude.com/docs/en/agents-and-tools/mcp-connector", + title: "Model Context Protocol connector", + snippet: "Official Anthropic documentation for MCP connector setup.", + query: "Anthropic MCP connector docs", + }; + const course: SourceCandidate = { + url: "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + title: "Introduction to Model Context Protocol", + snippet: "Anthropic course for learning MCP.", + query: "Anthropic MCP connector docs", + 
}; + const blog: SourceCandidate = { + url: "https://blog.cloudflare.com/code-mode/", + title: "Code Mode: the better way to use MCP", + snippet: "Cloudflare blog post about MCP.", + query: "Cloudflare MCP connector docs", + }; + const cloudflareDocs: SourceCandidate = { + url: "https://developers.cloudflare.com/agents/model-context-protocol/", + title: "Model Context Protocol", + snippet: "Official Cloudflare docs for MCP servers.", + query: "Cloudflare MCP connector docs", + }; + + assert.ok( + sourceCandidatePolicyBoost(docs, policy) > + sourceCandidatePolicyBoost(course, policy), + ); + assert.equal( + urlMatchesPromptSourcePolicy( + "https://platform.claude.com/docs/en/agents-and-tools/mcp-connector", + policy, + ), + true, + ); + assert.ok( + sourceCandidatePolicyBoost(cloudflareDocs, policy) > + sourceCandidatePolicyBoost(blog, policy), + ); +}); + +test("prompt source policy rejects records sourced from another entity's docs", () => { + const policy = derivePromptSourcePolicy( + "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.", + ); + const spec: DatasetSpec = { + intent_summary: "Official MCP docs pages.", + target_row_count: 3, + row_grain: "one row per vendor", + columns: [ + { + name: "entity_name", + type: "string", + description: "Vendor name.", + required: true, + }, + { + name: "docs_url", + type: "string", + description: "Official docs page URL.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: [], + extraction_hints: "", + }; + + assert.equal( + recordMatchesPromptSourcePolicy( + record("Anthropic", "https://modelcontextprotocol.io/docs/develop/build-server"), + spec, + policy, + ), + false, + ); + assert.equal( + recordMatchesPromptSourcePolicy( + record( + "Anthropic", + "https://platform.claude.com/docs/en/agents-and-tools/remote-mcp-servers", + ), + spec, + policy, + ), + true, + ); + assert.equal( + recordMatchesPromptSourcePolicy( + record("OpenAI", 
"https://developers.openai.com/blog"), + spec, + policy, + ), + false, + ); + assert.equal( + recordMatchesPromptSourcePolicy( + record("OpenAI", "https://developers.openai.com/api/docs/guides/tools-connectors-mcp"), + spec, + policy, + ), + true, + ); +}); + +function record(entityName: string, docsUrl: string): ExtractedRecord { + return { + row: { + entity_name: entityName, + docs_url: docsUrl, + }, + evidence: [ + { + field: "docs_url", + url: docsUrl, + quote: docsUrl, + }, + ], + source_urls: [docsUrl], + extraction_confidence: 0.8, + }; +} diff --git a/backend/test/populate-collection-runtime.test.ts b/backend/test/populate-collection-runtime.test.ts new file mode 100644 index 0000000..f195bc2 --- /dev/null +++ b/backend/test/populate-collection-runtime.test.ts @@ -0,0 +1,218 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + CollectionPopulateRecipeRuntime, + collectionPipelineInputFromRecipe, + type CollectionPopulatePipelineInput, +} from "../src/pipeline/populate-collection-runtime.js"; +import { + createPopulateRecipe, + type PopulateRecipe, +} from "../src/pipeline/populate-self-healing.js"; +import type { DatasetContext } from "../src/pipeline/populate.js"; + +const context: DatasetContext = { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [ + { + name: "entity_name", + type: "text", + description: "Company name.", + }, + { + name: "latest_post_title", + type: "text", + description: "Post title.", + }, + { + name: "source_url", + type: "url", + description: "Source URL.", + }, + { + name: "evidence_quote", + type: "text", + description: "Evidence quote.", + }, + ], +}; + +test("collection runtime threads recipe instructions into the collection prompt", async () => { + let capturedInput: CollectionPopulatePipelineInput | undefined; + const runtime = new CollectionPopulateRecipeRuntime({ + targetRows: 3, + benchmarkMetadata: { + 
promptId: "latest-ai-blog-posts", + promptQuality: "easy", + persona: "technical operator", + expectedStress: "Latest dated source pages; date precision matters.", + }, + runPipeline: async (input) => { + capturedInput = input; + return { + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + needsReview: false, + }], + validationIssues: [], + usage: { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }, + metrics: { + searchCalls: 1, + fetchCalls: 1, + browserCalls: 0, + agentRuns: 1, + agentSteps: 0, + }, + }; + }, + }); + const recipe = collectionRecipe({ + runtimeInstructions: + "Prefer official news pages already known to work. Do not use aggregator pages.", + }); + + const run = await runtime.runRecipe({ recipe, context }); + + assert.ok(capturedInput); + assert.equal(capturedInput.datasetId, context.datasetId); + assert.equal(capturedInput.datasetName, context.datasetName); + assert.equal(capturedInput.targetRows, 3); + assert.equal(capturedInput.promptId, "latest-ai-blog-posts"); + assert.equal(capturedInput.promptQuality, "easy"); + assert.equal(capturedInput.persona, "technical operator"); + assert.equal( + capturedInput.expectedStress, + "Latest dated source pages; date precision matters." 
+ ); + assert.deepEqual(capturedInput.requiredColumns, [ + "entity_name", + "latest_post_title", + "source_url", + "evidence_quote", + ]); + assert.match(capturedInput.prompt, /Find latest blog posts from OpenAI/); + assert.match(capturedInput.prompt, /Durable recipe instructions/); + assert.match(capturedInput.prompt, /Do not use aggregator pages/); + assert.equal( + capturedInput.recipeInstructions, + "Prefer official news pages already known to work. Do not use aggregator pages." + ); + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.equal(run.productionValidation.score, 1); + assert.equal(run.rows[0]?.cells.entity_name, "OpenAI"); +}); + +test("collection runtime treats capability diagnostics as non-fatal warnings for healthy rows", async () => { + const runtime = new CollectionPopulateRecipeRuntime({ + targetRows: 3, + runPipeline: async () => ({ + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + needsReview: false, + }], + validationIssues: [ + "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 2 page(s) (requires_navigation=1, requires_form_submission=1). 
Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation.", + ], + usage: { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }, + metrics: { + searchCalls: 1, + fetchCalls: 1, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + }), + }); + + const run = await runtime.runRecipe({ + recipe: collectionRecipe(), + context, + }); + + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.deepEqual(run.productionValidation.criticalIssues, []); + assert.match( + run.productionValidation.warnings.join("\n"), + /Capability diagnostic: TinyFish Agent disabled/ + ); +}); + +test("collection pipeline input builder trims empty recipe instructions", () => { + const input = collectionPipelineInputFromRecipe({ + recipe: collectionRecipe({ runtimeInstructions: " " }), + context, + targetRows: 5, + }); + + assert.equal(input.recipeInstructions, ""); + assert.doesNotMatch(input.prompt, /Durable recipe instructions/); +}); + +test("collection pipeline input builder carries benchmark metadata", () => { + const input = collectionPipelineInputFromRecipe({ + recipe: collectionRecipe(), + context, + targetRows: 5, + benchmarkMetadata: { + promptId: "saas-pricing-pages", + promptQuality: "medium", + persona: "startup founder", + expectedStress: "Official pricing evidence.", + }, + }); + + assert.equal(input.promptId, "saas-pricing-pages"); + assert.equal(input.promptQuality, "medium"); + assert.equal(input.persona, "startup founder"); + assert.equal(input.expectedStress, "Official pricing evidence."); +}); + +function collectionRecipe(input: { + runtimeInstructions?: string; +} = {}): PopulateRecipe { + return createPopulateRecipe({ + recipeId: "collection-v1", + datasetId: context.datasetId, + version: 1, + status: "active", + runtimeInstructions: input.runtimeInstructions ?? 
"", + sourceDescription: context.description, + requestedColumns: context.columns.map((column) => column.name), + createdBy: "system", + }); +} diff --git a/backend/test/populate-convex-writer.test.ts b/backend/test/populate-convex-writer.test.ts new file mode 100644 index 0000000..d347b9f --- /dev/null +++ b/backend/test/populate-convex-writer.test.ts @@ -0,0 +1,63 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +test("Convex populate row writer uses one atomic replace mutation", async () => { + process.env.CONVEX_URL = process.env.CONVEX_URL ?? "https://example.convex.cloud"; + process.env.CONVEX_SELF_HOSTED_ADMIN_KEY = + process.env.CONVEX_SELF_HOSTED_ADMIN_KEY ?? "test-admin-key"; + const { ConvexPopulateDatasetRowWriter } = await import( + "../src/pipeline/populate-convex-writer.js" + ); + const calls: Array<{ functionReference: unknown; args: unknown }> = []; + const replaceByDataset = Symbol("replaceByDataset"); + const writer = new ConvexPopulateDatasetRowWriter({ + internalApi: { + datasetRows: { + replaceByDataset, + }, + }, + convexClient: { + async mutation(functionReference, args) { + calls.push({ functionReference, args }); + return { + clearedRowCount: 2, + insertedRowCount: 1, + }; + }, + }, + }); + + const result = await writer.replaceRows({ + datasetId: "dataset-ai-posts", + rows: [{ + cells: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "entity_name", + sourceUrl: "https://openai.com/news", + quote: "OpenAI", + }], + needsReview: true, + }], + }); + + assert.deepEqual(result, { + clearedRowCount: 2, + insertedRowCount: 1, + }); + assert.equal(calls.length, 1); + assert.equal(calls[0]?.functionReference, replaceByDataset); + assert.deepEqual(calls[0]?.args, { + datasetId: "dataset-ai-posts", + rows: [{ + data: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + }, + sources: 
["https://openai.com/news"], + }], + }); +}); diff --git a/backend/test/populate-dataset-context-loader.test.ts b/backend/test/populate-dataset-context-loader.test.ts new file mode 100644 index 0000000..1cf4113 --- /dev/null +++ b/backend/test/populate-dataset-context-loader.test.ts @@ -0,0 +1,67 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { ConvexPopulateDatasetContextLoader } from "../src/pipeline/populate-dataset-context-loader.js"; + +test("Convex dataset context loader maps system dataset to populate context", async () => { + const getForSystemPopulate = Symbol("getForSystemPopulate"); + const calls: Array<{ functionReference: unknown; args: unknown }> = []; + const loader = new ConvexPopulateDatasetContextLoader({ + internalApi: { + datasets: { + getForSystemPopulate, + }, + }, + convexClient: { + async query(functionReference, args) { + calls.push({ functionReference, args }); + return { + name: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [{ + name: "entity_name", + type: "text", + description: "Company name.", + }], + }; + }, + }, + }); + + const context = await loader.loadContext("dataset-ai-posts"); + + assert.deepEqual(calls, [{ + functionReference: getForSystemPopulate, + args: { id: "dataset-ai-posts" }, + }]); + assert.deepEqual(context, { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [{ + name: "entity_name", + type: "text", + description: "Company name.", + }], + }); +}); + +test("Convex dataset context loader rejects missing dataset", async () => { + const loader = new ConvexPopulateDatasetContextLoader({ + internalApi: { + datasets: { + getForSystemPopulate: Symbol("getForSystemPopulate"), + }, + }, + convexClient: { + async query() { + return null; + }, + }, + }); + + await assert.rejects( + loader.loadContext("missing-dataset"), + /Dataset missing-dataset not found/ + ); +}); diff --git 
a/backend/test/populate-runtime-prerequisites.test.ts b/backend/test/populate-runtime-prerequisites.test.ts new file mode 100644 index 0000000..eb55222 --- /dev/null +++ b/backend/test/populate-runtime-prerequisites.test.ts @@ -0,0 +1,51 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + missingPopulateRuntimePrerequisites, + populateRuntimePrerequisiteError, +} from "../src/pipeline/populate-runtime-prerequisites.js"; + +test("populate runtime prerequisite check reports every missing key", () => { + assert.deepEqual(missingPopulateRuntimePrerequisites({}), [ + "CONVEX_URL", + "CONVEX_SELF_HOSTED_ADMIN_KEY", + "OPENROUTER_API_KEY", + "TINYFISH_API_KEY", + ]); +}); + +test("populate runtime prerequisite check skips Convex admin key for dry runs", () => { + assert.deepEqual( + missingPopulateRuntimePrerequisites({ + openRouterApiKey: "openrouter", + tinyFishApiKey: "tinyfish", + shouldCommitRows: false, + }), + [] + ); +}); + +test("populate runtime prerequisite check passes when all keys are configured", () => { + const input = { + convexUrl: "http://convex:3210", + convexAdminKey: "convex", + openRouterApiKey: "openrouter", + tinyFishApiKey: "tinyfish", + }; + + assert.deepEqual(missingPopulateRuntimePrerequisites(input), []); + assert.equal(populateRuntimePrerequisiteError(input), undefined); +}); + +test("populate runtime prerequisite check requires Convex keys for dataset-id dry runs", () => { + assert.deepEqual( + missingPopulateRuntimePrerequisites({ + openRouterApiKey: "openrouter", + tinyFishApiKey: "tinyfish", + shouldCommitRows: false, + shouldLoadDatasetContext: true, + }), + ["CONVEX_URL", "CONVEX_SELF_HOSTED_ADMIN_KEY"] + ); +}); diff --git a/backend/test/populate-runtime-selection.test.ts b/backend/test/populate-runtime-selection.test.ts new file mode 100644 index 0000000..b1a9993 --- /dev/null +++ b/backend/test/populate-runtime-selection.test.ts @@ -0,0 +1,128 @@ +import assert from "node:assert/strict"; 
+import { test } from "node:test"; + +import { + createPopulateRecipeRuntime, + selectedPopulateRuntimeName, +} from "../src/pipeline/populate-runtime-selection.js"; +import { CollectionPopulateRecipeRuntime } from "../src/pipeline/populate-collection-runtime.js"; +import { + createPopulateRecipe, + MastraPopulateRecipeRuntime, +} from "../src/pipeline/populate-self-healing.js"; +import type { DatasetContext } from "../src/pipeline/populate.js"; + +test("populate runtime selection defaults to Mastra", async () => { + assert.equal(selectedPopulateRuntimeName({}), "mastra"); + assert.ok( + await createPopulateRecipeRuntime({ env: {} }) instanceof + MastraPopulateRecipeRuntime + ); +}); + +test("populate runtime selection supports collection when a runner is provided", async () => { + assert.equal( + selectedPopulateRuntimeName({ POPULATE_AGENT_RUNTIME: "collection" }), + "collection" + ); + const runtime = await createPopulateRecipeRuntime({ + env: { POPULATE_AGENT_RUNTIME: "collection" }, + collectionRunner: async () => ({ + rows: [], + validationIssues: ["not used"], + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + }), + }); + + assert.ok(runtime instanceof CollectionPopulateRecipeRuntime); +}); + +test("populate runtime selection rejects collection without a runner", async () => { + await assert.rejects( + () => createPopulateRecipeRuntime({ + env: { POPULATE_AGENT_RUNTIME: "collection" }, + }), + /requires a collection pipeline runner/ + ); +}); + +test("populate runtime selection loads collection runner from env module", async () => { + const runtime = await createPopulateRecipeRuntime({ + env: { + POPULATE_AGENT_RUNTIME: "collection", + POPULATE_COLLECTION_RUNNER_MODULE: runnerModuleUrl(), + BIGSET_BENCHMARK_PROMPT_ID: "latest-ai-blog-posts", + BIGSET_BENCHMARK_PROMPT_QUALITY: "easy", + BIGSET_BENCHMARK_PERSONA: "technical operator", + 
BIGSET_BENCHMARK_EXPECTED_STRESS: "Latest dated source pages.", + }, + }); + const context: DatasetContext = { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [ + { name: "entity_name", type: "text" }, + { name: "source_url", type: "url" }, + { name: "evidence_quote", type: "text" }, + ], + }; + const run = await runtime.runRecipe({ + context, + recipe: createPopulateRecipe({ + recipeId: "collection-v1", + datasetId: context.datasetId, + version: 1, + status: "active", + runtimeInstructions: "Prefer official sources.", + sourceDescription: context.description, + requestedColumns: context.columns.map((column) => column.name), + createdBy: "system", + }), + }); + + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.rows[0]?.cells.entity_name, "latest-ai-blog-posts"); + assert.equal(run.rows[0]?.cells.evidence_quote, "technical operator"); +}); + +function runnerModuleUrl(): string { + const source = ` + export async function runCollectionPopulatePipeline(input) { + const quote = input.expectedStress || "Loaded runner module."; + return { + rows: [{ + cells: { + entity_name: input.promptId, + source_url: "https://example.com/source", + evidence_quote: input.persona, + }, + sourceUrls: ["https://example.com/source"], + evidence: [ + { columnName: "entity_name", sourceUrl: "https://example.com/source", quote }, + { columnName: "source_url", sourceUrl: "https://example.com/source", quote }, + { columnName: "evidence_quote", sourceUrl: "https://example.com/source", quote }, + ], + needsReview: false, + }], + validationIssues: [], + usage: { promptTokens: 1, completionTokens: 1, totalTokens: 2 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 1, + agentSteps: 0, + }, + }; + } + `; + return `data:text/javascript,${encodeURIComponent(source)}`; +} diff --git a/backend/test/populate-runtime.test.ts b/backend/test/populate-runtime.test.ts new file mode 100644 
index 0000000..5172f8e --- /dev/null +++ b/backend/test/populate-runtime.test.ts @@ -0,0 +1,455 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { runPopulateRuntime } from "../src/pipeline/populate-runtime.js"; + +interface ToolLike { + execute(input: TInput): Promise; +} + +const context = { + datasetId: "benchmark-dataset", + datasetName: "benchmark_dataset", + description: "Find latest blog posts from OpenAI.", + columns: [ + { + name: "entity_name", + type: "text" as const, + description: "Company name.", + }, + { + name: "latest_post_title", + type: "text" as const, + description: "Latest post title.", + }, + { + name: "source_url", + type: "url" as const, + description: "Source URL.", + }, + { + name: "evidence_quote", + type: "text" as const, + description: "Evidence quote.", + }, + ], +}; + +test("populate runtime captures rows through injected tools without Convex writes", async () => { + const result = await runPopulateRuntime({ + context, + webTools: { + search: async () => [ + { + title: "OpenAI news", + snippet: "Release notes", + url: "https://openai.com/news", + }, + ], + fetch: async () => ({ + title: "OpenAI news", + text: "Release notes", + }), + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + const fetchPage = tools.fetch_page as ToolLike< + { url: string }, + { text?: string } + >; + const insertRow = tools.insert_row as ToolLike< + { datasetId: string; data: Record }, + { success: boolean } + >; + + const search = await searchWeb.execute({ query: "OpenAI latest blog" }); + assert.equal(search.results?.length, 1); + const page = await fetchPage.execute({ url: "https://openai.com/news" }); + assert.match(page.text ?? 
"", /Release notes/); + const inserted = await insertRow.execute({ + datasetId: "benchmark-dataset", + data: { + entity_name: "OpenAI", + latest_post_title: "Release notes", + source_url: "https://openai.com/news", + evidence_quote: "Release notes", + }, + }); + assert.equal(inserted.success, true); + }, + }); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells.entity_name, "OpenAI"); + assert.deepEqual(result.rows[0]?.sourceUrls, ["https://openai.com/news"]); + assert.equal(result.rows[0]?.evidence[0]?.quote, "Release notes"); + assert.equal(result.metrics.searchCalls, 1); + assert.equal(result.metrics.fetchCalls, 1); + assert.equal(result.metrics.agentRuns, 1); + assert.deepEqual(result.validationIssues, []); +}); + +test("populate runtime accepts structured fallback rows backed by captured sources", async () => { + const result = await runPopulateRuntime({ + context, + webTools: { + search: async () => [ + { + title: "OpenAI news", + snippet: "Release notes from OpenAI", + url: "https://openai.com/news", + }, + ], + fetch: async () => ({ + title: "OpenAI news", + text: "Release notes from OpenAI", + }), + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + const fetchPage = tools.fetch_page as ToolLike< + { url: string }, + { text?: string } + >; + + await searchWeb.execute({ query: "OpenAI latest blog" }); + await fetchPage.execute({ url: "https://openai.com/news" }); + + return { + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + }], + }; + }, + }); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells.entity_name, 
"OpenAI"); + assert.equal(result.rows[0]?.needsReview, true); + assert.deepEqual(result.rows[0]?.sourceUrls, ["https://openai.com/news"]); + assert.deepEqual(result.validationIssues, []); +}); + +test("populate runtime rejects structured fallback rows without source-backed evidence", async () => { + const result = await runPopulateRuntime({ + context, + webTools: { + search: async () => [ + { + title: "OpenAI news", + snippet: "Release notes from OpenAI", + url: "https://openai.com/news", + }, + ], + fetch: async () => ({ + title: "OpenAI news", + text: "Release notes from OpenAI", + }), + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + + await searchWeb.execute({ query: "OpenAI latest blog" }); + + return { + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Invented post", + source_url: "https://openai.com/news", + evidence_quote: "Invented quote", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Invented quote", + }], + }], + }; + }, + }); + + assert.equal(result.rows.length, 0); + assert.match(result.validationIssues.join("\n"), /evidence quote not found/); + assert.match(result.validationIssues.join("\n"), /returned no rows/); +}); + +test("populate runtime prefers insert_row captures over contradictory structured rows", async () => { + const result = await runPopulateRuntime({ + context, + webTools: { + search: async () => [ + { + title: "OpenAI news", + snippet: "Release notes from OpenAI", + url: "https://openai.com/news", + }, + ], + fetch: async () => ({ + title: "OpenAI news", + text: "Release notes from OpenAI", + }), + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + const insertRow = tools.insert_row as ToolLike< + { datasetId: string; data: Record 
}, + { success: boolean } + >; + + await searchWeb.execute({ query: "OpenAI latest blog" }); + await insertRow.execute({ + datasetId: "benchmark-dataset", + data: { + entity_name: "OpenAI", + latest_post_title: "Release notes", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + }); + + return { + rows: [{ + cells: { + entity_name: "Different", + latest_post_title: "Release notes", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + }], + }; + }, + }); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells.entity_name, "OpenAI"); + assert.match(result.validationIssues.join("\n"), /Structured populate rows differed/); +}); + +test("populate runtime uses structured recovery when insert_row rows lack evidence", async () => { + const result = await runPopulateRuntime({ + context, + webTools: { + search: async () => [ + { + title: "OpenAI news", + snippet: "Release notes from OpenAI", + url: "https://openai.com/news", + }, + ], + fetch: async () => ({ + title: "OpenAI news", + text: "Release notes from OpenAI", + }), + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + const fetchPage = tools.fetch_page as ToolLike< + { url: string }, + { text?: string } + >; + const insertRow = tools.insert_row as ToolLike< + { datasetId: string; data: Record }, + { success: boolean } + >; + + await searchWeb.execute({ query: "OpenAI latest blog" }); + await fetchPage.execute({ url: "https://openai.com/news" }); + await insertRow.execute({ + datasetId: "benchmark-dataset", + data: { + entity_name: "OpenAI", + latest_post_title: "Release notes", + source_url: "https://openai.com/news", + evidence_quote: "", + }, + 
}); + + return { + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + }], + }; + }, + }); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.evidence[0]?.quote, "Release notes from OpenAI"); + assert.match( + result.debug?.notes.join("\n") ?? "", + /Structured row recovery replaced insert_row rows/ + ); + assert.deepEqual(result.validationIssues, []); +}); + +test("populate runtime enforces per-run row cap before inserting", async () => { + const result = await runPopulateRuntime({ + context, + maxRows: 1, + webTools: { + search: async () => [], + fetch: async () => ({}), + }, + agentRunner: async ({ tools }) => { + const insertRow = tools.insert_row as ToolLike< + { datasetId: string; data: Record }, + { success: boolean; error?: string } + >; + + const first = await insertRow.execute({ + datasetId: "benchmark-dataset", + data: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes", + }, + }); + const second = await insertRow.execute({ + datasetId: "benchmark-dataset", + data: { + entity_name: "Anthropic", + source_url: "https://anthropic.com/news", + evidence_quote: "News", + }, + }); + + assert.equal(first.success, true); + assert.equal(second.success, false); + assert.match(second.error ?? 
"", /Row cap/); + }, + }); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells.entity_name, "OpenAI"); +}); + +test("populate runtime asks for insurance quote inputs without running the agent", async () => { + let wasAgentRunnerCalled = false; + const result = await runPopulateRuntime({ + context: { + ...context, + description: + "find me the best car insurance prices in California so I can pick the best bang for my buck", + }, + webTools: { + search: async () => { + throw new Error("search should not run"); + }, + fetch: async () => { + throw new Error("fetch should not run"); + }, + }, + agentRunner: async () => { + wasAgentRunnerCalled = true; + }, + }); + + assert.equal(wasAgentRunnerCalled, false); + assert.deepEqual(result.rows, []); + assert.equal(result.metrics.agentRuns, 0); + assert.equal(result.metrics.searchCalls, 0); + assert.equal(result.metrics.fetchCalls, 0); + assert.match(result.validationIssues.join(" "), /driver/); + assert.match(result.validationIssues.join(" "), /vehicle/); + assert.match(result.validationIssues.join(" "), /zip/); + assert.match(result.validationIssues.join(" "), /coverage/); + assert.match(result.validationIssues.join(" "), /deductible/); +}); + +test("populate runtime asks for AI company scope without running the agent", async () => { + let wasAgentRunnerCalled = false; + const result = await runPopulateRuntime({ + context: { + ...context, + description: "get me the latest stuff from the big AI companies", + }, + webTools: { + search: async () => { + throw new Error("search should not run"); + }, + fetch: async () => { + throw new Error("fetch should not run"); + }, + }, + agentRunner: async () => { + wasAgentRunnerCalled = true; + }, + }); + + assert.equal(wasAgentRunnerCalled, false); + assert.deepEqual(result.rows, []); + assert.equal(result.metrics.agentRuns, 0); + assert.equal(result.metrics.searchCalls, 0); + assert.equal(result.metrics.fetchCalls, 0); + 
assert.match(result.validationIssues.join(" "), /which companies/); + assert.match(result.validationIssues.join(" "), /source type/); + assert.match(result.validationIssues.join(" "), /news/); + assert.match(result.validationIssues.join(" "), /blog/); + assert.match(result.validationIssues.join(" "), /release/); + assert.match(result.validationIssues.join(" "), /columns/); +}); + +test("populate runtime does not preflight explicit latest blog post requests", async () => { + let wasAgentRunnerCalled = false; + const result = await runPopulateRuntime({ + context: { + ...context, + description: + "Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind?", + }, + webTools: { + search: async () => [], + fetch: async () => ({}), + }, + agentRunner: async () => { + wasAgentRunnerCalled = true; + }, + }); + + assert.equal(wasAgentRunnerCalled, true); + assert.equal(result.metrics.agentRuns, 1); +}); diff --git a/backend/test/populate-self-healing-command.test.ts b/backend/test/populate-self-healing-command.test.ts new file mode 100644 index 0000000..1baf0f1 --- /dev/null +++ b/backend/test/populate-self-healing-command.test.ts @@ -0,0 +1,505 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import type { DatasetContext } from "../src/pipeline/populate.js"; +import type { PopulateRecipeRuntime } from "../src/pipeline/populate-self-healing.js"; +import type { RunSelfHealingPopulateResult } from "../src/pipeline/populate-self-healing-runner.js"; +import { + parsePopulateSelfHealingCliArgs, + runPopulateSelfHealingCli, +} from "../src/pipeline/populate-self-healing-command.js"; + +const context: DatasetContext = { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [{ + name: "entity_name", + type: "text", + description: "Company name.", + }], +}; + +test("self-healing CLI parses context and dry-run mode", () => { + 
assert.deepEqual(parsePopulateSelfHealingCliArgs([ + "--context", + "context.json", + "--max-rows", + "3", + ]), { + contextPath: "context.json", + shouldReadStdin: false, + shouldCommitRows: false, + maxRows: 3, + }); +}); + +test("self-healing CLI parses dataset-id mode", () => { + assert.deepEqual(parsePopulateSelfHealingCliArgs([ + "--dataset-id", + "dataset-ai-posts", + "--commit", + ]), { + datasetId: "dataset-ai-posts", + shouldReadStdin: false, + shouldCommitRows: true, + }); +}); + +test("self-healing CLI rejects dataset-id mixed with context input", () => { + assert.throws( + () => parsePopulateSelfHealingCliArgs([ + "--dataset-id", + "dataset-ai-posts", + "--context", + "context.json", + ]), + /Choose exactly one context source/ + ); + assert.throws( + () => parsePopulateSelfHealingCliArgs([ + "--context", + "context.json", + "--dataset-id", + "dataset-ai-posts", + ]), + /Choose exactly one context source/ + ); + assert.throws( + () => parsePopulateSelfHealingCliArgs([ + "--dataset-id", + "dataset-ai-posts", + "--stdin", + ]), + /Choose exactly one context source/ + ); + assert.throws( + () => parsePopulateSelfHealingCliArgs([ + "--stdin", + "--dataset-id", + "dataset-ai-posts", + ]), + /Choose exactly one context source/ + ); +}); + +test("self-healing CLI rejects context and stdin mixed in any order", () => { + assert.throws( + () => parsePopulateSelfHealingCliArgs([ + "--context", + "context.json", + "--stdin", + ]), + /Choose exactly one context source/ + ); + assert.throws( + () => parsePopulateSelfHealingCliArgs([ + "--stdin", + "--context", + "context.json", + ]), + /Choose exactly one context source/ + ); +}); + +test("self-healing CLI dry run does not require Convex admin key or create writer", async () => { + const stdout: string[] = []; + let runCalls = 0; + let writerCalls = 0; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--context", "context.json"], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: 
"tinyfish", + }, + readFileText: async () => JSON.stringify(context), + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + createRowWriter: async () => { + writerCalls += 1; + throw new Error("writer should not be created"); + }, + runSelfHealing: async (input) => { + runCalls += 1; + assert.equal(input.shouldCommitRows, false); + assert.equal(input.rowWriter, undefined); + assert.equal(input.recipeStoreDirectory, undefined); + assert.ok(input.store); + return successfulResult(input.context.datasetId); + }, + }); + + assert.equal(exitCode, 0); + assert.equal(runCalls, 1); + assert.equal(writerCalls, 0); + assert.equal(stdout.length, 1); + const output = JSON.parse(stdout[0]!); + assert.equal(output.success, true); + assert.equal(output.dryRun, true); + assert.equal(output.rowCount, 1); +}); + +test("self-healing CLI passes selected runtime into the runner", async () => { + const stdout: string[] = []; + const selectedRuntime = fakeRuntime(); + let createRuntimeCalls = 0; + let didUseSelectedRuntime = false; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--context", "context.json"], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + POPULATE_AGENT_RUNTIME: "collection", + }, + readFileText: async () => JSON.stringify(context), + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + createRuntime: async (input) => { + createRuntimeCalls += 1; + assert.equal(input.env.POPULATE_AGENT_RUNTIME, "collection"); + return selectedRuntime; + }, + runSelfHealing: async (input) => { + didUseSelectedRuntime = input.runtime === selectedRuntime; + return successfulResult(input.context.datasetId); + }, + }); + + assert.equal(exitCode, 0); + assert.equal(createRuntimeCalls, 1); + assert.equal(didUseSelectedRuntime, true); + assert.equal(JSON.parse(stdout[0]!).success, true); +}); + +test("self-healing CLI dataset-id dry run loads context before running", async () => { + const stdout: string[] 
= []; + let loadedDatasetId = ""; + let didReadFile = false; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--dataset-id", "dataset-ai-posts"], + env: { + CONVEX_URL: "http://convex:3210", + CONVEX_SELF_HOSTED_ADMIN_KEY: "convex-admin", + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + readFileText: async () => { + didReadFile = true; + return JSON.stringify(context); + }, + loadDatasetContextById: async (datasetId) => { + loadedDatasetId = datasetId; + return context; + }, + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + runSelfHealing: async (input) => { + assert.equal(input.context.datasetId, context.datasetId); + assert.equal(input.shouldCommitRows, false); + assert.ok(input.store); + assert.equal(input.rowWriter, undefined); + return successfulResult(input.context.datasetId); + }, + }); + + assert.equal(exitCode, 0); + assert.equal(loadedDatasetId, "dataset-ai-posts"); + assert.equal(didReadFile, false); + assert.equal(JSON.parse(stdout[0]!).success, true); +}); + +test("self-healing CLI dataset-id commit loads context and creates writer", async () => { + const stdout: string[] = []; + let writerCalls = 0; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--dataset-id", "dataset-ai-posts", "--commit"], + env: { + CONVEX_URL: "http://convex:3210", + CONVEX_SELF_HOSTED_ADMIN_KEY: "convex-admin", + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + POPULATE_RECIPE_STORE_DIR: ".bigset/populate-recipes", + }, + loadDatasetContextById: async (datasetId) => ({ + ...context, + datasetId, + }), + createRowWriter: async () => { + writerCalls += 1; + return { + async replaceRows() { + return { insertedRowCount: 1 }; + }, + }; + }, + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + runSelfHealing: async (input) => { + assert.equal(input.context.datasetId, "dataset-ai-posts"); + assert.equal(input.shouldCommitRows, true); + 
assert.equal(input.store, undefined); + assert.equal(input.recipeStoreDirectory, ".bigset/populate-recipes"); + assert.ok(input.rowWriter); + return successfulResult(input.context.datasetId); + }, + }); + + assert.equal(exitCode, 0); + assert.equal(writerCalls, 1); + assert.equal(JSON.parse(stdout[0]!).success, true); +}); + +test("self-healing CLI dataset-id mode preflights Convex keys before loading context", async () => { + const stdout: string[] = []; + let loadCalls = 0; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--dataset-id", "dataset-ai-posts"], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + loadDatasetContextById: async () => { + loadCalls += 1; + return context; + }, + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + }); + + assert.equal(exitCode, 1); + assert.equal(loadCalls, 0); + assert.match(stdout[0]!, /CONVEX_URL/); + assert.match(stdout[0]!, /CONVEX_SELF_HOSTED_ADMIN_KEY/); +}); + +test("self-healing CLI dataset-id loader failures skip runtime and writer", async () => { + const stdout: string[] = []; + let runCalls = 0; + let writerCalls = 0; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--dataset-id", "not-a-convex-id", "--commit"], + env: { + CONVEX_URL: "http://convex:3210", + CONVEX_SELF_HOSTED_ADMIN_KEY: "convex-admin", + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + loadDatasetContextById: async () => { + throw new Error("Invalid dataset id: not-a-convex-id."); + }, + createRowWriter: async () => { + writerCalls += 1; + return { + async replaceRows() { + return { insertedRowCount: 0 }; + }, + }; + }, + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + runSelfHealing: async () => { + runCalls += 1; + throw new Error("runtime should not run"); + }, + }); + + assert.equal(exitCode, 1); + assert.equal(runCalls, 0); + assert.equal(writerCalls, 0); + assert.match(stdout[0]!, /Invalid dataset id/); 
+}); + +test("self-healing CLI rejects durable recipe store on dry run", async () => { + const stdout: string[] = []; + const stderr: string[] = []; + let didReadContext = false; + const exitCode = await runPopulateSelfHealingCli({ + argv: [ + "--stdin", + "--recipe-store-dir", + ".bigset/test-recipes", + ], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + readStdinText: async () => { + didReadContext = true; + return JSON.stringify(context); + }, + writeStdout: (text) => stdout.push(text), + writeStderr: (text) => stderr.push(text), + runSelfHealing: async () => { + throw new Error("runtime should not run"); + }, + }); + + assert.equal(exitCode, 1); + assert.equal(didReadContext, false); + assert.equal(stdout.length, 1); + assert.match(stdout[0]!, /--recipe-store-dir requires --commit/); + assert.match(stderr.join("\n"), /--recipe-store-dir requires --commit/); +}); + +test("self-healing CLI commit mode preflights missing Convex key before runtime", async () => { + const stdout: string[] = []; + let runCalls = 0; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--context", "context.json", "--commit"], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + readFileText: async () => JSON.stringify(context), + writeStdout: (text) => stdout.push(text), + writeStderr: () => undefined, + runSelfHealing: async () => { + runCalls += 1; + throw new Error("runtime should not run"); + }, + }); + + assert.equal(exitCode, 1); + assert.equal(runCalls, 0); + assert.equal(stdout.length, 1); + assert.match(stdout[0]!, /CONVEX_SELF_HOSTED_ADMIN_KEY/); +}); + +test("self-healing CLI exits 2 when tick rejects candidate", async () => { + const stdout: string[] = []; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--stdin"], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + readStdinText: async () => JSON.stringify(context), + writeStdout: (text) => 
stdout.push(text), + writeStderr: () => undefined, + runSelfHealing: async (input) => rejectedResult(input.context.datasetId), + }); + + assert.equal(exitCode, 2); + assert.equal(stdout.length, 1); + const output = JSON.parse(stdout[0]!); + assert.equal(output.success, false); + assert.equal(output.action, "candidate_rejected"); + assert.match(output.validationIssues.join("\n"), /Still no evidence/); +}); + +test("self-healing CLI reports malformed context JSON as one stdout object", async () => { + const stdout: string[] = []; + const stderr: string[] = []; + const exitCode = await runPopulateSelfHealingCli({ + argv: ["--context", "context.json"], + env: { + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + }, + readFileText: async () => "{ nope", + writeStdout: (text) => stdout.push(text), + writeStderr: (text) => stderr.push(text), + }); + + assert.equal(exitCode, 1); + assert.equal(stdout.length, 1); + assert.equal(JSON.parse(stdout[0]!).success, false); + assert.match(stderr.join("\n"), /JSON/); +}); + +function successfulResult(datasetId: string): RunSelfHealingPopulateResult { + return { + success: true, + action: "generated_initial_recipe", + datasetId, + selectedRun: { + ...baseRun(datasetId), + rows: [{ + cells: { entity_name: "OpenAI" }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "entity_name", + sourceUrl: "https://openai.com/news", + quote: "OpenAI", + }], + needsReview: true, + }], + }, + rejectionReasons: [], + validationIssues: [], + tick: { + datasetId, + action: "generated_initial_recipe", + rejectionReasons: [], + }, + }; +} + +function rejectedResult(datasetId: string): RunSelfHealingPopulateResult { + return { + success: false, + action: "candidate_rejected", + datasetId, + diagnosticRun: { + ...baseRun(datasetId), + runStatus: "failed", + validationIssues: ["Still no evidence."], + productionValidation: { + ...baseRun(datasetId).productionValidation, + isValid: false, + score: 0, + criticalIssues: 
["Still no evidence."], + }, + }, + rejectionReasons: ["Still no evidence."], + validationIssues: ["Still no evidence."], + tick: { + datasetId, + action: "candidate_rejected", + rejectionReasons: ["Still no evidence."], + }, + }; +} + +function baseRun(datasetId: string): RunSelfHealingPopulateResult["selectedRun"] { + return { + rows: [], + validationIssues: [], + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + recipeId: `${datasetId}-recipe-v1`, + recipeVersion: 1, + runStatus: "succeeded", + startedAt: "2026-05-22T00:00:00.000Z", + completedAt: "2026-05-22T00:00:01.000Z", + runtimeMs: 1_000, + productionValidation: { + isValid: true, + score: 1, + rowCount: 1, + requestedCellCompletenessRatio: 1, + sourceUrlCoverageRatio: 1, + evidenceCoverageRatio: 1, + expectedEntityCoverageRatio: 1, + expectedEntities: [], + missingExpectedEntities: [], + criticalIssues: [], + warnings: [], + }, + artifacts: [], + }; +} + +function fakeRuntime(): PopulateRecipeRuntime { + return { + async runRecipe() { + throw new Error("fake runtime should not execute in CLI unit tests"); + }, + }; +} diff --git a/backend/test/populate-self-healing-runner.test.ts b/backend/test/populate-self-healing-runner.test.ts new file mode 100644 index 0000000..b63c4c0 --- /dev/null +++ b/backend/test/populate-self-healing-runner.test.ts @@ -0,0 +1,365 @@ +import assert from "node:assert/strict"; +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "node:test"; + +import type { DatasetContext } from "../src/pipeline/populate.js"; +import { + createPopulateRecipe, + FileSystemPopulateRecipeStore, + InMemoryPopulateRecipeStore, + type PopulateRecipe, + type PopulateRecipeAuthor, + type PopulateRecipeRunResult, + type PopulateRecipeRuntime, + type SelfHealingPopulateTickResult, +} from 
"../src/pipeline/populate-self-healing.js"; +import { + diagnosticRunForTick, + runSelfHealingPopulate, + validationIssuesForSelfHealingTick, + type PopulateDatasetRowWriter, +} from "../src/pipeline/populate-self-healing-runner.js"; + +const context: DatasetContext = { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [ + { + name: "entity_name", + type: "text", + description: "Company name.", + }, + { + name: "latest_post_title", + type: "text", + description: "Post title.", + }, + { + name: "source_url", + type: "url", + description: "Source URL.", + }, + { + name: "evidence_quote", + type: "text", + description: "Evidence quote.", + }, + ], +}; + +test("self-healing runner commits rows only after a successful tick", async () => { + const store = new InMemoryPopulateRecipeStore(); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + const writer = new FakePopulateDatasetRowWriter(); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "generated-v1": validRun(generatedRecipe), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + rowWriter: writer, + shouldCommitRows: true, + }); + + assert.equal(result.success, true); + assert.equal(result.action, "generated_initial_recipe"); + assert.equal(result.committedRows?.insertedRowCount, 1); + assert.equal(writer.replaceCalls.length, 1); + assert.equal(writer.replaceCalls[0]?.datasetId, context.datasetId); + assert.equal(writer.replaceCalls[0]?.rows[0]?.cells.entity_name, "OpenAI"); +}); + +test("self-healing runner requires a row writer before runtime work when committing", async () => { + let runtimeCalls = 0; + + await assert.rejects( + runSelfHealingPopulate({ + context, + runtime: { + async runRecipe(input) { + runtimeCalls += 1; + return validRun(input.recipe); + }, + }, + author: new FakeRecipeAuthor({ + generatedRecipe: recipe({ recipeId: "generated-v1" }), 
+ }), + shouldCommitRows: true, + }), + /rowWriter is required/ + ); + + assert.equal(runtimeCalls, 0); +}); + +test("self-healing runner commits healthy active reruns", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = recipe({ recipeId: "active-v1", status: "active" }); + const writer = new FakePopulateDatasetRowWriter(); + await store.saveRecipe(activeRecipe); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "active-v1": validRun(activeRecipe), + }), + author: new FakeRecipeAuthor(), + rowWriter: writer, + shouldCommitRows: true, + }); + + assert.equal(result.success, true); + assert.equal(result.action, "active_rerun_succeeded"); + assert.equal(result.selectedRun?.recipeId, "active-v1"); + assert.equal(writer.replaceCalls.length, 1); +}); + +test("self-healing runner commits promoted repair candidate rows", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = recipe({ recipeId: "active-broken", status: "active" }); + const repairedRecipe = recipe({ recipeId: "repair-v2", version: 2 }); + const writer = new FakePopulateDatasetRowWriter(); + await store.saveRecipe(activeRecipe); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "active-broken": invalidRun(activeRecipe, "No source-backed rows."), + "repair-v2": validRun(repairedRecipe), + }), + author: new FakeRecipeAuthor({ repairedRecipe }), + rowWriter: writer, + shouldCommitRows: true, + }); + + assert.equal(result.success, true); + assert.equal(result.action, "repaired_active_recipe"); + assert.equal(result.selectedRun?.recipeId, "repair-v2"); + assert.equal(writer.replaceCalls.length, 1); +}); + +test("self-healing runner does not clear or insert rows when candidate is rejected", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = recipe({ recipeId: "active-broken", 
status: "active" }); + const rejectedRecipe = recipe({ recipeId: "repair-v2", version: 2 }); + const writer = new FakePopulateDatasetRowWriter(); + await store.saveRecipe(activeRecipe); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "active-broken": invalidRun(activeRecipe, "No source-backed rows."), + "repair-v2": invalidRun(rejectedRecipe, "Still no evidence."), + }), + author: new FakeRecipeAuthor({ repairedRecipe: rejectedRecipe }), + rowWriter: writer, + shouldCommitRows: true, + }); + + assert.equal(result.success, false); + assert.equal(result.action, "candidate_rejected"); + assert.equal(result.selectedRun, undefined); + assert.equal(result.diagnosticRun?.recipeId, "repair-v2"); + assert.equal(result.committedRows, undefined); + assert.equal(writer.replaceCalls.length, 0); + assert.match(result.validationIssues.join("\n"), /Still no evidence/); +}); + +test("filesystem store lets the runner reuse an active recipe across calls", async () => { + const rootDirectory = await mkdtemp(join(tmpdir(), "bigset-populate-runner-")); + const store = new FileSystemPopulateRecipeStore(rootDirectory); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + const writer = new FakePopulateDatasetRowWriter(); + const runtime = new FakePopulateRecipeRuntime({ + "generated-v1": validRun(generatedRecipe), + }); + const author = new FakeRecipeAuthor({ generatedRecipe }); + + const first = await runSelfHealingPopulate({ + context, + store, + runtime, + author, + rowWriter: writer, + shouldCommitRows: true, + }); + const second = await runSelfHealingPopulate({ + context, + store: new FileSystemPopulateRecipeStore(rootDirectory), + runtime, + author, + rowWriter: writer, + shouldCommitRows: true, + }); + + assert.equal(first.action, "generated_initial_recipe"); + assert.equal(second.action, "active_rerun_succeeded"); + assert.equal(author.generateCalls, 1); + assert.equal(writer.replaceCalls.length, 2); 
+}); + +test("self-healing tick diagnostics expose rejected candidate validation issues", () => { + const candidateRecipe = recipe({ recipeId: "repair-v2", version: 2 }); + const candidateRun = invalidRun(candidateRecipe, "Missing expected entities: Anthropic."); + const tick: SelfHealingPopulateTickResult = { + datasetId: context.datasetId, + action: "candidate_rejected", + candidateRecipe, + candidateRun, + rejectionReasons: ["Candidate validation score is below the active recipe baseline."], + }; + + assert.equal(diagnosticRunForTick(tick)?.recipeId, "repair-v2"); + assert.deepEqual(validationIssuesForSelfHealingTick(tick), [ + "Missing expected entities: Anthropic.", + "Candidate validation score is below the active recipe baseline.", + ]); +}); + +function recipe(input: { + recipeId: string; + version?: number; + status?: PopulateRecipe["status"]; +}): PopulateRecipe { + return createPopulateRecipe({ + recipeId: input.recipeId, + datasetId: context.datasetId, + version: input.version ?? 
1, + status: input.status, + sourceDescription: context.description, + requestedColumns: context.columns.map((column) => column.name), + createdAt: "2026-05-22T00:00:00.000Z", + }); +} + +function validRun(recipe: PopulateRecipe): PopulateRecipeRunResult { + return runResult({ + recipe, + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + needsReview: true, + }], + isValid: true, + score: 1, + }); +} + +function invalidRun(recipe: PopulateRecipe, issue: string): PopulateRecipeRunResult { + return runResult({ + recipe, + rows: [], + validationIssues: [issue], + criticalIssues: [issue], + isValid: false, + score: 0, + }); +} + +function runResult(input: { + recipe: PopulateRecipe; + rows: PopulateRecipeRunResult["rows"]; + validationIssues?: string[]; + criticalIssues?: string[]; + isValid: boolean; + score: number; +}): PopulateRecipeRunResult { + return { + rows: input.rows, + validationIssues: input.validationIssues ?? [], + usage: { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + recipeId: input.recipe.recipeId, + recipeVersion: input.recipe.version, + runStatus: input.isValid ? 
"succeeded" : "failed", + startedAt: "2026-05-22T00:00:00.000Z", + completedAt: "2026-05-22T00:00:01.000Z", + runtimeMs: 1_000, + productionValidation: { + isValid: input.isValid, + score: input.score, + rowCount: input.rows.length, + requestedCellCompletenessRatio: input.score, + sourceUrlCoverageRatio: input.score, + evidenceCoverageRatio: input.score, + expectedEntityCoverageRatio: input.score, + expectedEntities: [], + missingExpectedEntities: [], + criticalIssues: input.criticalIssues ?? [], + warnings: input.validationIssues ?? [], + }, + artifacts: [], + }; +} + +class FakePopulateRecipeRuntime implements PopulateRecipeRuntime { + constructor(private readonly runsByRecipeId: Record) {} + + async runRecipe(input: { + recipe: PopulateRecipe; + context: DatasetContext; + }): Promise { + return this.runsByRecipeId[input.recipe.recipeId] ?? + invalidRun(input.recipe, `Missing fake run for ${input.recipe.recipeId}.`); + } +} + +class FakeRecipeAuthor implements PopulateRecipeAuthor { + generateCalls = 0; + + constructor( + private readonly recipes: { + generatedRecipe?: PopulateRecipe; + repairedRecipe?: PopulateRecipe; + } = {} + ) {} + + async generateRecipe(): Promise { + this.generateCalls += 1; + return this.recipes.generatedRecipe ?? recipe({ recipeId: "generated-v1" }); + } + + async repairRecipe(): Promise { + return this.recipes.repairedRecipe ?? 
recipe({ recipeId: "repair-v2", version: 2 }); + } +} + +class FakePopulateDatasetRowWriter implements PopulateDatasetRowWriter { + readonly replaceCalls: Array[0]> = []; + + async replaceRows(input: Parameters[0]) { + this.replaceCalls.push(input); + return { + clearedRowCount: 7, + insertedRowCount: input.rows.length, + }; + } +} diff --git a/backend/test/populate-self-healing.test.ts b/backend/test/populate-self-healing.test.ts new file mode 100644 index 0000000..e1be40d --- /dev/null +++ b/backend/test/populate-self-healing.test.ts @@ -0,0 +1,556 @@ +import assert from "node:assert/strict"; +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "node:test"; + +import { + createPopulateRecipe, + FileSystemPopulateRecipeStore, + InMemoryPopulateRecipeStore, + MastraPopulateRecipeRuntime, + SelfHealingPopulateRecipeService, +} from "../src/pipeline/populate-self-healing.js"; +import type { + PopulateRecipe, + PopulateRecipeAuthor, + PopulateRecipeRunResult, + PopulateRecipeRuntime, +} from "../src/pipeline/populate-self-healing.js"; +import type { DatasetContext } from "../src/pipeline/populate.js"; + +const context: DatasetContext = { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [ + { + name: "entity_name", + type: "text", + description: "Company name.", + }, + { + name: "latest_post_title", + type: "text", + description: "Post title.", + }, + { + name: "source_url", + type: "url", + description: "Source URL.", + }, + { + name: "evidence_quote", + type: "text", + description: "Evidence quote.", + }, + ], +}; + +test("Mastra populate recipe runtime maps populate rows into a healthy recipe run", async () => { + let promptText = ""; + const runtime = new MastraPopulateRecipeRuntime({ + webTools: { + search: async () => [ + { + title: "OpenAI news", + snippet: "Release notes from OpenAI", + url: 
"https://openai.com/news", + }, + ], + fetch: async () => ({ + title: "OpenAI news", + text: "Release notes from OpenAI", + }), + }, + agentRunner: async ({ prompt, tools }) => { + promptText = prompt; + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + const fetchPage = tools.fetch_page as ToolLike< + { url: string }, + { text?: string } + >; + const insertRow = tools.insert_row as ToolLike< + { datasetId: string; data: Record }, + { success: boolean } + >; + await searchWeb.execute({ query: "OpenAI latest blog" }); + await fetchPage.execute({ url: "https://openai.com/news" }); + await insertRow.execute({ + datasetId: context.datasetId, + data: { + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + }); + }, + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ + recipeId: "recipe-v1", + runtimeInstructions: "Prefer official news pages already known to work.", + }), + context, + }); + + assert.match(promptText, /Durable recipe instructions/); + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.equal(run.productionValidation.score, 1); + assert.equal(run.recipeId, "recipe-v1"); + assert.equal(run.rows[0]?.cells.entity_name, "OpenAI"); + assert.equal(run.debug?.selectedRowSource, "insert_row"); + assert.ok(run.artifacts.some((artifact) => artifact.kind === "source-transcript")); + assert.ok(run.artifacts.some((artifact) => artifact.kind === "captured-rows")); +}); + +test("Mastra populate recipe runtime keeps supplemental fetch misses non-blocking", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows(), + validationIssues: [ + "Structured fallback fetch failed for https://example.com/noise: timeout", + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + }), + }); + + 
const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context, + }); + + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.deepEqual(run.productionValidation.criticalIssues, []); + assert.match(run.productionValidation.warnings.join("\n"), /timeout/); +}); + +test("Mastra populate recipe runtime blocks missing expected entities", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: [{ + ...validRows()[0]!, + cells: { + ...validRows()[0]!.cells, + latest_post_title: + "OpenAI roundtable mentions Anthropic and Google DeepMind", + evidence_quote: + "OpenAI discussed Anthropic and Google DeepMind in passing.", + }, + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "OpenAI discussed Anthropic and Google DeepMind in passing.", + }], + }], + validationIssues: [], + usage: emptyUsage(), + metrics: emptyMetrics(), + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context: { + ...context, + description: + "Find latest blog posts from OpenAI, Anthropic, and Google DeepMind.", + }, + }); + + assert.equal(run.runStatus, "failed"); + assert.equal(run.productionValidation.isValid, false); + assert.deepEqual(run.productionValidation.expectedEntities, [ + "OpenAI", + "Anthropic", + "Google DeepMind", + ]); + assert.deepEqual(run.productionValidation.missingExpectedEntities, [ + "Anthropic", + "Google DeepMind", + ]); + assert.match( + run.productionValidation.criticalIssues.join("\n"), + /Missing expected entities/ + ); +}); + +test("self-healing service reruns a healthy active recipe without author repair", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = recipe({ recipeId: "active-v1", status: "active" }); + await store.saveRecipe(activeRecipe); + const author = new FakeRecipeAuthor(); + const service = new 
SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "active-v1": validRun(activeRecipe), + }), + author, + }); + + const result = await service.tick({ datasetId: context.datasetId, context }); + + assert.equal(result.action, "active_rerun_succeeded"); + assert.equal(author.generateCalls, 0); + assert.equal(author.repairCalls, 0); + assert.equal(result.activeRecipe?.status, "active"); + assert.equal(result.activeRecipe?.lastValidationScore, 1); +}); + +test("self-healing service generates and activates the first valid recipe", async () => { + const store = new InMemoryPopulateRecipeStore(); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "generated-v1": validRun(generatedRecipe), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + }); + + const result = await service.tick({ datasetId: context.datasetId, context }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.action, "generated_initial_recipe"); + assert.equal(result.activeRecipe?.recipeId, "generated-v1"); + assert.equal(snapshot.recipes[0]?.status, "active"); + assert.equal(snapshot.runRecords.length, 1); +}); + +test("self-healing service normalizes author recipe metadata before storing", async () => { + const store = new InMemoryPopulateRecipeStore(); + const generatedRecipe = createPopulateRecipe({ + recipeId: "generated-v1", + datasetId: "wrong-dataset", + version: 99, + status: "active", + sourceDescription: "wrong prompt", + requestedColumns: ["wrong_column"], + }); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "generated-v1": validRun({ + ...generatedRecipe, + datasetId: context.datasetId, + version: 1, + status: "candidate", + }), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + }); + + const result = await 
service.tick({ datasetId: context.datasetId, context }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.action, "generated_initial_recipe"); + assert.equal(result.activeRecipe?.datasetId, context.datasetId); + assert.equal(result.activeRecipe?.version, 1); + assert.equal(result.activeRecipe?.status, "active"); + assert.deepEqual( + result.activeRecipe?.requestedColumns, + context.columns.map((column) => column.name) + ); + assert.equal(snapshot.recipes.length, 1); + assert.equal(snapshot.recipes[0]?.datasetId, context.datasetId); +}); + +test("self-healing service uses tick dataset id as the runtime context id", async () => { + const store = new InMemoryPopulateRecipeStore(); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + let runtimeContextDatasetId = ""; + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: { + async runRecipe(input) { + runtimeContextDatasetId = input.context.datasetId; + return validRun(input.recipe); + }, + }, + author: new FakeRecipeAuthor({ generatedRecipe }), + }); + + await service.tick({ + datasetId: context.datasetId, + context: { + ...context, + datasetId: "wrong-dataset", + }, + }); + + assert.equal(runtimeContextDatasetId, context.datasetId); +}); + +test("self-healing service repairs a failed active recipe and promotes the candidate", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = recipe({ recipeId: "active-broken", status: "active" }); + const repairedRecipe = recipe({ recipeId: "repair-v2", version: 2 }); + await store.saveRecipe(activeRecipe); + const author = new FakeRecipeAuthor({ repairedRecipe }); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "active-broken": invalidRun(activeRecipe, "No source-backed rows."), + "repair-v2": validRun(repairedRecipe), + }), + author, + }); + + const result = await service.tick({ datasetId: context.datasetId, 
context }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.action, "repaired_active_recipe"); + assert.equal(author.repairCalls, 1); + assert.equal(author.lastRepairInput?.failedRun.runStatus, "failed"); + assert.equal(snapshot.recipes.find((item) => item.recipeId === "active-broken")?.status, "retired"); + assert.equal(snapshot.recipes.find((item) => item.recipeId === "repair-v2")?.status, "active"); +}); + +test("self-healing service rejects valid repairs below active recipe baseline", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = { + ...recipe({ recipeId: "active-broken", status: "active" }), + lastValidationScore: 1, + }; + const weakerRepair = recipe({ recipeId: "repair-v2", version: 2 }); + await store.saveRecipe(activeRecipe); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "active-broken": invalidRun(activeRecipe, "Transient source outage."), + "repair-v2": validRun(weakerRepair, 0.75), + }), + author: new FakeRecipeAuthor({ repairedRecipe: weakerRepair }), + }); + + const result = await service.tick({ datasetId: context.datasetId, context }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.action, "candidate_rejected"); + assert.match(result.rejectionReasons.join("\n"), /active recipe baseline/); + assert.equal(snapshot.recipes.find((item) => item.recipeId === "active-broken")?.status, "active"); + assert.equal(snapshot.recipes.find((item) => item.recipeId === "repair-v2")?.status, "rejected"); +}); + +test("self-healing service rejects a repaired candidate that still fails validation", async () => { + const store = new InMemoryPopulateRecipeStore(); + const activeRecipe = recipe({ recipeId: "active-broken", status: "active" }); + const rejectedRecipe = recipe({ recipeId: "bad-repair", version: 2 }); + await store.saveRecipe(activeRecipe); + const service = new 
SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "active-broken": invalidRun(activeRecipe, "No source-backed rows."), + "bad-repair": invalidRun(rejectedRecipe, "Still no evidence."), + }), + author: new FakeRecipeAuthor({ repairedRecipe: rejectedRecipe }), + }); + + const result = await service.tick({ datasetId: context.datasetId, context }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.action, "candidate_rejected"); + assert.match(result.rejectionReasons.join("\n"), /Still no evidence/); + assert.equal(snapshot.recipes.find((item) => item.recipeId === "active-broken")?.status, "active"); + assert.equal(snapshot.recipes.find((item) => item.recipeId === "bad-repair")?.status, "rejected"); +}); + +test("file store reloads populate recipes and run records", async () => { + const rootDirectory = await mkdtemp(join(tmpdir(), "bigset-populate-recipes-")); + const store = new FileSystemPopulateRecipeStore(rootDirectory); + const generatedRecipe = recipe({ recipeId: "persisted-v1" }); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "persisted-v1": validRun(generatedRecipe), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + }); + + await service.tick({ datasetId: context.datasetId, context }); + + const reloadedStore = new FileSystemPopulateRecipeStore(rootDirectory); + const snapshot = await reloadedStore.loadSnapshot(context.datasetId); + + assert.equal(snapshot.recipes.length, 1); + assert.equal(snapshot.recipes[0]?.status, "active"); + assert.equal(snapshot.runRecords.length, 1); + assert.equal(snapshot.runRecords[0]?.runStatus, "succeeded"); +}); + +interface ToolLike { + execute(input: TInput): Promise; +} + +function recipe(input: { + recipeId: string; + version?: number; + status?: PopulateRecipe["status"]; + runtimeInstructions?: string; +}): PopulateRecipe { + return createPopulateRecipe({ + recipeId: 
input.recipeId, + datasetId: context.datasetId, + version: input.version ?? 1, + status: input.status, + sourceDescription: context.description, + requestedColumns: context.columns.map((column) => column.name), + runtimeInstructions: input.runtimeInstructions, + createdAt: "2026-05-22T00:00:00.000Z", + }); +} + +function validRun(recipe: PopulateRecipe, score = 1): PopulateRecipeRunResult { + return runResult({ + recipe, + rows: validRows(), + isValid: true, + score, + }); +} + +function invalidRun(recipe: PopulateRecipe, issue: string): PopulateRecipeRunResult { + return runResult({ + recipe, + rows: [], + validationIssues: [issue], + criticalIssues: [issue], + isValid: false, + score: 0, + }); +} + +function runResult(input: { + recipe: PopulateRecipe; + rows: PopulateRecipeRunResult["rows"]; + validationIssues?: string[]; + criticalIssues?: string[]; + isValid: boolean; + score: number; +}): PopulateRecipeRunResult { + return { + rows: input.rows, + validationIssues: input.validationIssues ?? [], + usage: { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + recipeId: input.recipe.recipeId, + recipeVersion: input.recipe.version, + runStatus: input.isValid ? "succeeded" : "failed", + startedAt: "2026-05-22T00:00:00.000Z", + completedAt: "2026-05-22T00:00:01.000Z", + runtimeMs: 1_000, + productionValidation: { + isValid: input.isValid, + score: input.score, + rowCount: input.rows.length, + requestedCellCompletenessRatio: input.score, + sourceUrlCoverageRatio: input.score, + evidenceCoverageRatio: input.score, + expectedEntityCoverageRatio: input.score, + expectedEntities: [], + missingExpectedEntities: [], + criticalIssues: input.criticalIssues ?? [], + warnings: input.validationIssues ?? 
[], + }, + artifacts: [], + }; +} + +function validRows(): PopulateRecipeRunResult["rows"] { + return [ + { + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [ + { + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }, + ], + needsReview: true, + }, + ]; +} + +function emptyUsage(): PopulateRecipeRunResult["usage"] { + return { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + }; +} + +function emptyMetrics(): PopulateRecipeRunResult["metrics"] { + return { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }; +} + +class FakePopulateRecipeRuntime implements PopulateRecipeRuntime { + constructor(private readonly runsByRecipeId: Record) {} + + async runRecipe(input: { + recipe: PopulateRecipe; + context: DatasetContext; + }): Promise { + const run = this.runsByRecipeId[input.recipe.recipeId]; + if (!run) { + return invalidRun(input.recipe, `Missing fake run for ${input.recipe.recipeId}.`); + } + return run; + } +} + +class FakeRecipeAuthor implements PopulateRecipeAuthor { + generateCalls = 0; + repairCalls = 0; + lastRepairInput?: Parameters[0]; + + constructor( + private readonly recipes: { + generatedRecipe?: PopulateRecipe; + repairedRecipe?: PopulateRecipe; + } = {} + ) {} + + async generateRecipe(): Promise { + this.generateCalls += 1; + return this.recipes.generatedRecipe ?? recipe({ recipeId: "generated-v1" }); + } + + async repairRecipe( + input: Parameters[0] + ): Promise { + this.repairCalls += 1; + this.lastRepairInput = input; + return this.recipes.repairedRecipe ?? 
recipe({ recipeId: "repair-v2", version: 2 }); + } +} diff --git a/backend/test/populate-server.test.ts b/backend/test/populate-server.test.ts new file mode 100644 index 0000000..99e63f2 --- /dev/null +++ b/backend/test/populate-server.test.ts @@ -0,0 +1,138 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { createBigSetServer } from "../src/server.js"; +import type { DatasetContext } from "../src/pipeline/populate.js"; +import type { PopulateRecipeRuntime } from "../src/pipeline/populate-self-healing.js"; +import type { RunSelfHealingPopulateResult } from "../src/pipeline/populate-self-healing-runner.js"; + +const context: DatasetContext = { + datasetId: "dataset-ai-posts", + datasetName: "AI posts", + description: "Find latest blog posts from OpenAI.", + columns: [{ + name: "entity_name", + type: "text", + description: "Company name.", + }], +}; + +test("POST /populate passes selected runtime into self-healing runner", async () => { + const selectedRuntime = fakeRuntime(); + let createRuntimeCalls = 0; + let didUseSelectedRuntime = false; + const app = await createBigSetServer({ + env: { + CLIENT_ORIGIN: "http://localhost:3500", + CONVEX_URL: "http://convex:3210", + CONVEX_ADMIN_KEY: "convex-admin", + OPENROUTER_API_KEY: "openrouter", + TINYFISH_API_KEY: "tinyfish", + POPULATE_RECIPE_STORE_DIR: ".bigset/populate-recipes", + }, + runtimeEnv: { + POPULATE_AGENT_RUNTIME: "collection", + }, + authPreHandler: async (request) => { + request.auth = { userId: "user-1" }; + }, + getDatasetById: async (datasetId) => { + assert.equal(datasetId, context.datasetId); + return { ownerId: "user-1" }; + }, + populateRowWriter: { + async replaceRows() { + return { insertedRowCount: 1 }; + }, + }, + createRuntime: async (input) => { + createRuntimeCalls += 1; + assert.equal(input.env.POPULATE_AGENT_RUNTIME, "collection"); + return selectedRuntime; + }, + runSelfHealing: async (input) => { + didUseSelectedRuntime = input.runtime === 
selectedRuntime; + assert.equal(input.shouldCommitRows, true); + assert.equal(input.recipeStoreDirectory, ".bigset/populate-recipes"); + assert.ok(input.rowWriter); + return successfulResult(input.context.datasetId); + }, + }); + + const response = await app.inject({ + method: "POST", + url: "/populate", + payload: context, + }); + + await app.close(); + + assert.equal(response.statusCode, 200); + assert.equal(createRuntimeCalls, 1); + assert.equal(didUseSelectedRuntime, true); + assert.equal(response.json().success, true); +}); + +function successfulResult(datasetId: string): RunSelfHealingPopulateResult { + return { + success: true, + action: "generated_initial_recipe", + datasetId, + selectedRun: { + rows: [{ + cells: { entity_name: "OpenAI" }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "entity_name", + sourceUrl: "https://openai.com/news", + quote: "OpenAI", + }], + needsReview: true, + }], + validationIssues: [], + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + recipeId: `${datasetId}-recipe-v1`, + recipeVersion: 1, + runStatus: "succeeded", + startedAt: "2026-05-22T00:00:00.000Z", + completedAt: "2026-05-22T00:00:01.000Z", + runtimeMs: 1_000, + productionValidation: { + isValid: true, + score: 1, + rowCount: 1, + requestedCellCompletenessRatio: 1, + sourceUrlCoverageRatio: 1, + evidenceCoverageRatio: 1, + expectedEntityCoverageRatio: 1, + expectedEntities: [], + missingExpectedEntities: [], + criticalIssues: [], + warnings: [], + }, + artifacts: [], + }, + rejectionReasons: [], + validationIssues: [], + tick: { + datasetId, + action: "generated_initial_recipe", + rejectionReasons: [], + }, + }; +} + +function fakeRuntime(): PopulateRecipeRuntime { + return { + async runRecipe() { + throw new Error("fake runtime should not execute in route unit tests"); + }, + }; +} diff --git 
a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md new file mode 100644 index 0000000..a4e0cc7 --- /dev/null +++ b/benchmarks/dataset-agent/README.md @@ -0,0 +1,168 @@ +# Dataset Agent Benchmark + +Shared harness for scoring one dataset agent command against the same prompt pack. + +The runner is intentionally standalone. Each system is a command that reads the +benchmark env vars, runs one prompt, and prints one JSON object to stdout. + +## Run Mastra Populate + +The Mastra adapter calls the self-healing populate service around +`runPopulateRuntime`. It avoids the HTTP/auth route, uses an isolated in-memory +recipe store per prompt run, and never clears or inserts Convex rows. + +```bash +node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ + --system mastra='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs' +``` + +Real Mastra benchmark runs require `OPENROUTER_API_KEY` and `TINYFISH_API_KEY` +set in the process environment at run time. If either is missing, the adapter returns a blocked +benchmark result instead of touching app data. + +## Run Collection Inside Self-Healing + +The collection adapter uses the same benchmark runner, but wraps +`CollectionPopulateRecipeRuntime` inside `SelfHealingPopulateRecipeService`. +That means collection results are scored after the same recipe generation, +repair, validation, and promotion path as the app runtime. 
+ +```bash +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ +BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ +node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' +``` + +Real collection benchmark runs require `OPENROUTER_API_KEY`, +`TINYFISH_API_KEY`, `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE`, and +`COLLECTION_AGENT_PIPELINE_MODULE` exported in the shell. The benchmark runner +module must export `runCollectionPopulatePipeline(input)` or a default runner +that accepts `CollectionPopulatePipelineInput` and returns a +`PopulateRuntimeResult`. The pipeline module must export `runPipeline(options)`. +The BigSet runner keeps TinyFish Agent/browser calls off by default so the +benchmark stays cheap and bounded. Set `COLLECTION_AGENT_ENABLE_AGENT=true` to +opt in; Agent polling is capped by `AGENT_POLL_TIMEOUT_MS`, or by +`COLLECTION_AGENT_POLL_TIMEOUT_MS` when the generic timeout is unset. + +When Agent is off and triage finds work that needs browser, form, or detail-page follow-up, the +collection runner emits a non-fatal capability diagnostic. Healthy rows can +still pass self-healing validation with this diagnostic as a warning. Benchmark +failures show the same diagnostic as the failure message so the result says +"turn Agent on for this prompt" instead of pretending the run hit an auth, +credits, or generic zero-row failure. 
+ +Use this canary when checking whether Agent/browser follow-up fixes the current +source-evidence misses: + +```bash +COLLECTION_AGENT_ENABLE_AGENT=true \ +COLLECTION_AGENT_POLL_TIMEOUT_MS=480000 \ +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ +BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ +node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids mcp-docs-pages \ + --timeout-ms 900000 \ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' +``` + +Latest `mcp-docs-pages` Agent-enabled canary evidence: + +- artifact: `benchmark-results/collection-agent-canary-mcp-20260523-001` +- status: failed, not blocked +- rows/evidence: 3 rows, 12 evidence quotes, 10 source URLs +- cost: about `$0.053552` +- signal: Agent runs complete and claim support reaches `1.0`, but domain + accuracy stays `0.667`; next fix is source/domain coherence, not more Agent + plumbing. + +App and CLI collection-runtime runs use the same runner shape, but load it from +`POPULATE_COLLECTION_RUNNER_MODULE` when `POPULATE_AGENT_RUNTIME=collection`. + +## Verify Self-Healing Stack + +Use this before asking someone else to migrate a new collection agent into the +app path: + +```bash +make verify-self-healing +``` + +That command runs backend tests, backend build, adapter syntax checks, and +Mastra + collection no-key benchmark smokes that must produce clean `blocked` +results without spending OpenRouter or TinyFish credits. + +Live checks are explicit: + +```bash +bash scripts/verify-self-healing-stack.sh --real-benchmark +bash scripts/verify-self-healing-stack.sh --convex-push --dataset-id DATASET_ID +bash scripts/verify-self-healing-stack.sh --convex-push --dataset-id DATASET_ID --commit +``` + +The live benchmark and dataset smoke expect required env vars to already be +exported in the shell. 
They print only missing key names and never print secret +values. The `--convex-push` mode still uses the existing `make convex-push` +target, which requires `frontend/.env.local`. + +## Benchmark Env + +For each prompt the runner sets: + +- `BIGSET_BENCHMARK_PROMPT` +- `BIGSET_BENCHMARK_PROMPT_ID` +- `BIGSET_BENCHMARK_PROMPT_QUALITY` +- `BIGSET_BENCHMARK_PERSONA` +- `BIGSET_BENCHMARK_EXPECTED_STRESS` +- `BIGSET_BENCHMARK_REQUIRED_COLUMNS` +- `BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS` + +`BIGSET_BENCHMARK_REQUIRED_COLUMNS` is the requested table shape. +`BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS` is the hard row identity minimum. +Rows still need at least one source URL and evidence quote. Collection benchmark +runners receive prompt id, quality, persona, expected stress, and required +columns through `CollectionPopulatePipelineInput` so they can build the same +benchmark/spec context that the direct collection lane expects. + +## Agent Output Contract + +The command must print JSON: + +```json +{ + "rows": [ + { + "cells": { + "entity_name": "Example", + "source_url": "https://example.com" + }, + "sourceUrls": ["https://example.com"], + "evidence": [ + { + "columnName": "entity_name", + "sourceUrl": "https://example.com", + "quote": "Example source quote" + } + ], + "needsReview": false + } + ], + "validationIssues": [], + "usage": { + "promptTokens": 0, + "completionTokens": 0, + "totalTokens": 0 + }, + "metrics": { + "searchCalls": 0, + "fetchCalls": 0, + "browserCalls": 0, + "agentRuns": 1, + "agentSteps": 0 + } +} +``` + +Logs must go to stderr. 
diff --git a/benchmarks/dataset-agent/adapters/.gitignore b/benchmarks/dataset-agent/adapters/.gitignore new file mode 100644 index 0000000..0935c2f --- /dev/null +++ b/benchmarks/dataset-agent/adapters/.gitignore @@ -0,0 +1 @@ +local-*.mjs diff --git a/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs new file mode 100644 index 0000000..c9480ba --- /dev/null +++ b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs @@ -0,0 +1,171 @@ +#!/usr/bin/env node +import { pathToFileURL } from "node:url"; +import { resolve } from "node:path"; + +const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT"); +const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "benchmark-prompt"; +const promptQuality = process.env.BIGSET_BENCHMARK_PROMPT_QUALITY ?? "unknown"; +const persona = process.env.BIGSET_BENCHMARK_PERSONA; +const expectedStress = process.env.BIGSET_BENCHMARK_EXPECTED_STRESS; +const requiredColumns = columnList( + requiredEnv("BIGSET_BENCHMARK_REQUIRED_COLUMNS") +); +const minimumRequiredColumns = columnList( + process.env.BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS ?? "" +); + +const missingRuntimeKeys = ["OPENROUTER_API_KEY", "TINYFISH_API_KEY"].filter( + (name) => !process.env[name] +); +if (missingRuntimeKeys.length > 0) { + console.log(JSON.stringify({ + rows: [], + validationIssues: [ + `Missing ${missingRuntimeKeys.join(", ")} for collection self-healing benchmark.`, + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + })); + process.exit(0); +} + +const collectionRunner = await loadCollectionRunner(); +if (!collectionRunner) { + console.log(JSON.stringify({ + rows: [], + validationIssues: [ + "Collection self-healing benchmark runner is not configured. 
Set BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE to a module exporting runCollectionPopulatePipeline(input).", + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + })); + process.exit(0); +} + +const { + diagnosticRunForTick, + validationIssuesForSelfHealingTick, +} = await import( + "../../../backend/src/pipeline/populate-self-healing-runner.ts" +); +const { + DefaultPopulateRecipeAuthor, + InMemoryPopulateRecipeStore, + SelfHealingPopulateRecipeService, +} = await import( + "../../../backend/src/pipeline/populate-self-healing.ts" +); +const { + CollectionPopulateRecipeRuntime, +} = await import( + "../../../backend/src/pipeline/populate-collection-runtime.ts" +); + +const context = { + datasetId: `benchmark-${safeIdSegment(promptId)}`, + datasetName: `benchmark_${safeIdSegment(promptId)}`, + description: prompt, + columns: requiredColumns.map((columnName) => ({ + name: columnName, + type: inferPopulateColumnType(columnName), + description: `Benchmark requested column for ${promptQuality} prompt.`, + })), +}; +const service = new SelfHealingPopulateRecipeService({ + store: new InMemoryPopulateRecipeStore(), + runtime: new CollectionPopulateRecipeRuntime({ + runPipeline: collectionRunner, + targetRows: Number(process.env.BIGSET_COLLECTION_BENCHMARK_MAX_ROWS ?? "10"), + benchmarkMetadata: { + promptId, + promptQuality, + persona, + expectedStress, + }, + }), + author: new DefaultPopulateRecipeAuthor(), +}); +const tick = await service.tick({ datasetId: context.datasetId, context }); +const result = diagnosticRunForTick(tick); + +console.log(JSON.stringify({ + rows: result?.rows ?? [], + validationIssues: [ + ...validationIssuesForSelfHealingTick(tick), + ...minimumColumnIssues(result?.rows ?? []), + ], + usage: result?.usage ?? emptyUsage(), + metrics: result?.metrics ?? 
emptyMetrics(), +})); + +async function loadCollectionRunner() { + const moduleSpecifier = process.env.BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE; + if (!moduleSpecifier) { + return undefined; + } + const moduleUrl = moduleSpecifier.startsWith(".") || moduleSpecifier.startsWith("/") + ? pathToFileURL(resolve(moduleSpecifier)).href + : moduleSpecifier; + const loaded = await import(moduleUrl); + const runner = loaded.runCollectionPopulatePipeline ?? loaded.default; + if (typeof runner !== "function") { + throw new Error( + `${moduleSpecifier} must export runCollectionPopulatePipeline(input) or a default runner.` + ); + } + return runner; +} + +function minimumColumnIssues(rows) { + const issues = []; + for (const [rowIndex, row] of rows.entries()) { + for (const columnName of minimumRequiredColumns) { + const value = row.cells?.[columnName]; + if (value === undefined || value === null || value === "") { + issues.push(`Row ${rowIndex} missing minimum required column ${columnName}.`); + } + } + } + return issues; +} + +function inferPopulateColumnType(columnName) { + if (/(url|website|link|page)$/i.test(columnName)) return "url"; + if (/(date|_at)$/i.test(columnName)) return "date"; + if (/^(is_|has_|can_)/i.test(columnName)) return "boolean"; + if (/(count|price|amount|score|number|total)/i.test(columnName)) return "number"; + return "text"; +} + +function safeIdSegment(value) { + return String(value).replace(/[^a-zA-Z0-9._-]/g, "_").slice(0, 80); +} + +function columnList(value) { + return value + .split(",") + .map((columnName) => columnName.trim()) + .filter(Boolean); +} + +function emptyUsage() { + return { promptTokens: 0, completionTokens: 0, totalTokens: 0 }; +} + +function emptyMetrics() { + return { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }; +} + +function requiredEnv(name) { + const value = process.env[name]; + if (!value) { + throw new Error(`Missing ${name}. 
Run through run-benchmark.mjs.`); + } + return value; +} diff --git a/benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs b/benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs new file mode 100644 index 0000000..24096ce --- /dev/null +++ b/benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs @@ -0,0 +1,125 @@ +#!/usr/bin/env node + +const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT"); +const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "benchmark-prompt"; +const promptQuality = process.env.BIGSET_BENCHMARK_PROMPT_QUALITY ?? "unknown"; +const requiredColumns = columnList( + requiredEnv("BIGSET_BENCHMARK_REQUIRED_COLUMNS") +); +const minimumRequiredColumns = columnList( + process.env.BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS ?? "" +); + +const missingRuntimeKeys = ["OPENROUTER_API_KEY", "TINYFISH_API_KEY"].filter( + (name) => !process.env[name] +); +if (missingRuntimeKeys.length > 0) { + console.log(JSON.stringify({ + rows: [], + validationIssues: [ + `Missing ${missingRuntimeKeys.join(", ")} for Mastra populate benchmark.`, + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + })); + process.exit(0); +} + +const { + diagnosticRunForTick, + validationIssuesForSelfHealingTick, +} = await import( + "../../../backend/src/pipeline/populate-self-healing-runner.ts" +); +const { + DefaultPopulateRecipeAuthor, + InMemoryPopulateRecipeStore, + MastraPopulateRecipeRuntime, + SelfHealingPopulateRecipeService, +} = await import( + "../../../backend/src/pipeline/populate-self-healing.ts" +); + +const context = { + datasetId: `benchmark-${safeIdSegment(promptId)}`, + datasetName: `benchmark_${safeIdSegment(promptId)}`, + description: prompt, + columns: requiredColumns.map((columnName) => ({ + name: columnName, + type: inferPopulateColumnType(columnName), + description: `Benchmark requested column for ${promptQuality} prompt.`, + })), +}; +const service = new SelfHealingPopulateRecipeService({ + store: new InMemoryPopulateRecipeStore(), + 
runtime: new MastraPopulateRecipeRuntime({ + maxRows: Number(process.env.BIGSET_MASTRA_BENCHMARK_MAX_ROWS ?? "10"), + }), + author: new DefaultPopulateRecipeAuthor(), +}); +const tick = await service.tick({ datasetId: context.datasetId, context }); +const result = diagnosticRunForTick(tick); + +console.log(JSON.stringify({ + rows: result?.rows ?? [], + validationIssues: [ + ...validationIssuesForSelfHealingTick(tick), + ...minimumColumnIssues(result?.rows ?? []), + ], + usage: result?.usage ?? emptyUsage(), + metrics: result?.metrics ?? emptyMetrics(), +})); + +function minimumColumnIssues(rows) { + const issues = []; + for (const [rowIndex, row] of rows.entries()) { + for (const columnName of minimumRequiredColumns) { + const value = row.cells?.[columnName]; + if (value === undefined || value === null || value === "") { + issues.push(`Row ${rowIndex} missing minimum required column ${columnName}.`); + } + } + } + return issues; +} + +function inferPopulateColumnType(columnName) { + if (/(url|website|link|page)$/i.test(columnName)) return "url"; + if (/(date|_at)$/i.test(columnName)) return "date"; + if (/^(is_|has_|can_)/i.test(columnName)) return "boolean"; + if (/(count|price|amount|score|number|total)/i.test(columnName)) return "number"; + return "text"; +} + +function safeIdSegment(value) { + return String(value).replace(/[^a-zA-Z0-9._-]/g, "_").slice(0, 80); +} + +function columnList(value) { + return value + .split(",") + .map((columnName) => columnName.trim()) + .filter(Boolean); +} + +function emptyUsage() { + return { promptTokens: 0, completionTokens: 0, totalTokens: 0 }; +} + +function emptyMetrics() { + return { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }; +} + +function requiredEnv(name) { + const value = process.env[name]; + if (!value) { + throw new Error(`Missing ${name}. 
Run through run-benchmark.mjs.`); + } + return value; +} diff --git a/benchmarks/dataset-agent/adapters/smoke-adapter.mjs b/benchmarks/dataset-agent/adapters/smoke-adapter.mjs new file mode 100644 index 0000000..aca5027 --- /dev/null +++ b/benchmarks/dataset-agent/adapters/smoke-adapter.mjs @@ -0,0 +1,66 @@ +#!/usr/bin/env node + +const prompt = process.env.BIGSET_BENCHMARK_PROMPT ?? ""; +const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "unknown"; +const requiredColumns = (process.env.BIGSET_BENCHMARK_REQUIRED_COLUMNS ?? "") + .split(",") + .map((columnName) => columnName.trim()) + .filter(Boolean); + +const cells = Object.fromEntries( + requiredColumns.map((columnName) => [ + columnName, + valueForColumn({ columnName, prompt, promptId }), + ]) +); + +const sourceUrl = `https://example.com/bigset-benchmark/${encodeURIComponent(promptId)}`; +cells.source_url = cells.source_url ?? sourceUrl; + +console.log( + JSON.stringify({ + rows: [ + { + cells, + sourceUrls: [sourceUrl], + evidence: [ + { + columnName: requiredColumns[0] ?? 
"entity_name", + sourceUrl, + quote: `Smoke benchmark evidence for ${promptId}`, + }, + ], + needsReview: false, + }, + ], + validationIssues: [], + usage: { + promptTokens: Math.max(1, Math.round(prompt.length / 4)), + completionTokens: 120, + totalTokens: Math.max(1, Math.round(prompt.length / 4)) + 120, + }, + metrics: { + searchCalls: 1, + fetchCalls: 1, + browserCalls: 0, + agentRuns: 1, + agentSteps: 3, + }, + }) +); + +function valueForColumn({ columnName, prompt, promptId }) { + if (columnName.endsWith("_url") || columnName === "source_url") { + return `https://example.com/${encodeURIComponent(promptId)}`; + } + if (columnName.includes("date") || columnName.endsWith("_at")) { + return "2026-05-19"; + } + if (columnName.includes("price") || columnName.includes("count")) { + return 1; + } + if (columnName.startsWith("is_") || columnName.startsWith("has_")) { + return true; + } + return prompt.slice(0, 80) || promptId; +} diff --git a/benchmarks/dataset-agent/adapters/template-adapter.mjs b/benchmarks/dataset-agent/adapters/template-adapter.mjs new file mode 100644 index 0000000..4764c61 --- /dev/null +++ b/benchmarks/dataset-agent/adapters/template-adapter.mjs @@ -0,0 +1,169 @@ +#!/usr/bin/env node +import { spawn } from "node:child_process"; + +const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT"); +const promptId = requiredEnv("BIGSET_BENCHMARK_PROMPT_ID"); +const requiredColumns = requiredEnv("BIGSET_BENCHMARK_REQUIRED_COLUMNS") + .split(",") + .map((columnName) => columnName.trim()) + .filter(Boolean); +const minimumRequiredColumns = (process.env.BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS ?? "") + .split(",") + .map((columnName) => columnName.trim()) + .filter(Boolean); + +const agentResult = await runCurrentAgent({ + prompt, + promptId, + requiredColumns, + minimumRequiredColumns, +}); + +console.log(JSON.stringify(toBenchmarkPayload(agentResult))); + +async function runCurrentAgent(input) { + // Replace this function with the current agent call. 
+ // + // Option A: direct JS import + // const { runDatasetAgent } = await import("../../path/to/agent.js"); + // return runDatasetAgent({ prompt: input.prompt }); + // + // Option B: existing CLI + // return runJsonCommand("npm", ["run", "agent:run", "--", input.prompt]); + // + // Option C: local HTTP server + // const response = await fetch("http://localhost:3001/dataset-agent", { + // method: "POST", + // headers: { "Content-Type": "application/json" }, + // body: JSON.stringify({ prompt: input.prompt }), + // }); + // if (!response.ok) throw new Error(`Agent HTTP ${response.status}`); + // return response.json(); + // + // Keep this throw until the real call is wired. + throw new Error( + `Wire current agent in ${import.meta.url} for prompt ${input.promptId}.` + ); +} + +function toBenchmarkPayload(agentResult) { + const rows = normalizeRows(agentResult.rows ?? agentResult.data ?? []); + return { + rows, + validationIssues: + agentResult.validationIssues ?? agentResult.issues ?? agentResult.errors ?? [], + usage: { + promptTokens: + agentResult.usage?.promptTokens ?? + agentResult.usage?.inputTokens ?? + agentResult.inputTokens ?? + 0, + completionTokens: + agentResult.usage?.completionTokens ?? + agentResult.usage?.outputTokens ?? + agentResult.outputTokens ?? + 0, + totalTokens: + agentResult.usage?.totalTokens ?? + agentResult.totalTokens ?? + 0, + }, + metrics: { + searchCalls: + agentResult.metrics?.searchCalls ?? agentResult.searchCallCount ?? 0, + fetchCalls: + agentResult.metrics?.fetchCalls ?? agentResult.fetchCallCount ?? 0, + browserCalls: + agentResult.metrics?.browserCalls ?? agentResult.browserCallCount ?? 0, + agentRuns: + agentResult.metrics?.agentRuns ?? agentResult.agentRunCount ?? 1, + agentSteps: + agentResult.metrics?.agentSteps ?? agentResult.agentStepCount ?? 0, + }, + }; +} + +function normalizeRows(rows) { + return rows.map((row) => { + const cells = row.cells ?? row.data ?? 
row; + const sourceUrls = normalizeSourceUrls(row, cells); + return { + cells, + sourceUrls, + evidence: normalizeEvidence(row, sourceUrls), + needsReview: row.needsReview ?? row.needs_review ?? false, + }; + }); +} + +function normalizeSourceUrls(row, cells) { + return [ + ...arrayOfStrings(row.sourceUrls), + ...arrayOfStrings(row.sources), + ...arrayOfStrings(row.source_urls), + ...singleString(row.sourceUrl), + ...singleString(row.source_url), + ...singleString(cells.source_url), + ...singleString(cells.sourceUrl), + ].filter((value, index, array) => value && array.indexOf(value) === index); +} + +function normalizeEvidence(row, sourceUrls) { + if (Array.isArray(row.evidence)) { + return row.evidence; + } + if (Array.isArray(row.evidenceQuotes)) { + return row.evidenceQuotes.map((quote) => ({ + columnName: "entity_name", + sourceUrl: sourceUrls[0] ?? "", + quote, + })); + } + return []; +} + +async function runJsonCommand(command, args) { + const execution = await runCommand(command, args); + if (execution.exitCode !== 0) { + throw new Error(`${command} exited ${execution.exitCode}: ${execution.stderr}`); + } + return JSON.parse(execution.stdout); +} + +function runCommand(command, args) { + return new Promise((resolve) => { + const child = spawn(command, args, { + stdio: ["ignore", "pipe", "pipe"], + env: process.env, + }); + let stdout = ""; + let stderr = ""; + child.stdout.on("data", (chunk) => { + stdout += chunk.toString(); + }); + child.stderr.on("data", (chunk) => { + stderr += chunk.toString(); + }); + child.on("close", (exitCode) => { + resolve({ stdout, stderr, exitCode: exitCode ?? 1 }); + }); + }); +} + +function requiredEnv(name) { + const value = process.env[name]; + if (!value) { + throw new Error(`Missing ${name}. Run through run-benchmark.mjs.`); + } + return value; +} + +function arrayOfStrings(value) { + return Array.isArray(value) + ? 
value.filter((item) => typeof item === "string") + : []; +} + +function singleString(value) { + return typeof value === "string" ? [value] : []; +} diff --git a/benchmarks/dataset-agent/prompts.json b/benchmarks/dataset-agent/prompts.json new file mode 100644 index 0000000..65eb0d8 --- /dev/null +++ b/benchmarks/dataset-agent/prompts.json @@ -0,0 +1,130 @@ +[ + { + "id": "latest-ai-blog-posts", + "quality": "good", + "persona": "technical operator", + "prompt": "Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind? I need title, publish date, and URL.", + "requiredColumns": ["entity_name", "latest_post_title", "latest_post_date", "source_url"], + "expectedStress": "Clear entities and fields; tests current web facts with low ambiguity." + }, + { + "id": "saas-pricing-pages", + "quality": "good", + "persona": "startup founder", + "prompt": "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and the plan names or starting prices shown on the page.", + "requiredColumns": ["entity_name", "pricing_page_url", "plan_or_price", "source_url"], + "expectedStress": "Official pricing evidence; should not require a browser agent unless pricing is hidden." + }, + { + "id": "earnings-release-pages", + "quality": "good", + "persona": "finance analyst", + "prompt": "Find the latest investor relations earnings release page for Apple, Microsoft, and Nvidia. Include release date, fiscal quarter, and source URL.", + "requiredColumns": ["entity_name", "release_date", "fiscal_quarter", "source_url"], + "expectedStress": "Latest dated source pages; date precision matters." + }, + { + "id": "mcp-docs-pages", + "quality": "good", + "persona": "developer", + "prompt": "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare. 
Give me title, URL, and what each page covers.", + "requiredColumns": ["entity_name", "docs_title", "docs_url", "summary"], + "expectedStress": "Official docs discovery; should avoid random blog posts." + }, + { + "id": "menlo-park-coca-cola", + "quality": "average", + "persona": "local researcher", + "prompt": "restaurants in Menlo Park that serve Coca-Cola", + "requiredColumns": ["entity_name", "address", "serves_requested_item", "source_url"], + "expectedStress": "Short but understandable; menu evidence may require deeper page checks." + }, + { + "id": "hcmc-bakery-products", + "quality": "average", + "persona": "food blogger", + "prompt": "bakeries in Ho Chi Minh City with pastry product pages, product name, product URL, and bakery name", + "requiredColumns": ["bakery_name", "product_name", "product_url", "source_url"], + "expectedStress": "Product-page proof and local business search." + }, + { + "id": "ny-ai-startup-careers", + "quality": "average", + "persona": "job seeker", + "prompt": "AI startups in New York that have careers pages. I want company name, website, and whether they look like they are hiring.", + "requiredColumns": ["entity_name", "company_website", "careers_page_url", "is_hiring"], + "expectedStress": "Careers-page verification with partial data accepted." + }, + { + "id": "vietnam-fintech-sites", + "quality": "average", + "persona": "market researcher", + "prompt": "Vietnamese fintech startups with official websites, short description, and source URL", + "requiredColumns": ["entity_name", "official_website", "description", "source_url"], + "expectedStress": "Company discovery with official-source preference." 
+ }, + { + "id": "district-one-coffee-sites", + "quality": "average", + "persona": "tourist", + "prompt": "coffee shops in District 1 Ho Chi Minh City that have their own website or online menu", + "requiredColumns": ["entity_name", "website_or_menu_url", "address", "source_url"], + "expectedStress": "Local search plus website/menu disambiguation." + }, + { + "id": "amazon-starbucks-products", + "quality": "average", + "persona": "ecommerce operator", + "prompt": "I saw there is a Starbucks shop on Amazon. Can you scrape the Starbucks products with name, price, image, and whether each item is in stock?", + "requiredColumns": ["product_name", "price", "image_url", "in_stock"], + "expectedStress": "Ecommerce listing freshness; likely needs browser-style verification." + }, + { + "id": "california-insurance-prices", + "quality": "bad", + "persona": "consumer", + "prompt": "find me the best car insurance prices in California so I can pick the best bang for my buck", + "requiredColumns": ["provider_name", "quote_page_url", "missing_inputs", "source_url"], + "expectedStress": "Missing driver, vehicle, ZIP, coverage, deductible; should ask clarifying questions." + }, + { + "id": "la-coke-menu-lol", + "quality": "bad", + "persona": "casual user", + "prompt": "i need places in LA with coke on the menu lol", + "requiredColumns": ["entity_name", "menu_url", "serves_requested_item", "source_url"], + "expectedStress": "Ambiguous location and entity type; should still infer restaurants but require menu evidence." + }, + { + "id": "sf-ml-hiring-rn", + "quality": "bad", + "persona": "job seeker", + "prompt": "who's hiring ML engineers around sf rn", + "requiredColumns": ["entity_name", "careers_page_url", "open_role_title", "source_url"], + "expectedStress": "Casual wording and broad geography; should find careers pages without over-claiming." 
+ }, + { + "id": "latest-ai-company-stuff", + "quality": "bad", + "persona": "busy founder", + "prompt": "get me the latest stuff from the big AI companies", + "requiredColumns": ["entity_name", "latest_item_title", "latest_item_url", "source_url"], + "expectedStress": "Underspecified entities, source type, and columns; should expose weak plan/questions." + }, + { + "id": "pastry-things-menlo", + "quality": "bad", + "persona": "casual food search", + "prompt": "good pastry things near Menlo Park with websites", + "requiredColumns": ["entity_name", "product_or_business_name", "website_url", "source_url"], + "expectedStress": "Vague quality word and entity boundary; should return product/business evidence only." + }, + { + "id": "perplexity-like-companies", + "quality": "bad", + "persona": "founder", + "prompt": "make a table of companies like Perplexity but with useful info", + "requiredColumns": ["entity_name", "official_website", "why_similar", "source_url"], + "expectedStress": "Vague comparator and columns; should avoid inventing what useful info means." 
+  }
+]
diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs
new file mode 100755
index 0000000..3c3ed9e
--- /dev/null
+++ b/benchmarks/dataset-agent/run-benchmark.mjs
@@ -0,0 +1,1796 @@
+#!/usr/bin/env node
+import { spawn } from "node:child_process";
+import { mkdir, readFile, writeFile } from "node:fs/promises";
+import { dirname, join, resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const scriptDir = dirname(fileURLToPath(import.meta.url));
+const defaultPromptsPath = join(scriptDir, "prompts.json");
+const defaultMinimumFactualAccuracy = 0.75;
+
+/**
+ * CLI entry point.
+ *
+ * Parses the arguments, loads and filters the prompt definitions, then either
+ * rescores an existing run directory (`--rescore-dir`) or runs every configured
+ * system against every selected prompt, one at a time. Both modes write a JSON
+ * summary and a Markdown report and print the summary to stdout.
+ *
+ * The default run directory is `benchmark-results/<ISO timestamp>` under the
+ * current working directory, with `:` and `.` in the timestamp replaced by `-`.
+ * When no system is configured the exit code is set to 1.
+ */
+async function main() {
+  const config = parseArgs(process.argv.slice(2));
+  const allPrompts = JSON.parse(await readFile(config.promptsPath, "utf8"));
+  const prompts = selectPrompts(allPrompts, config.promptIds);
+  const runStartedAt = new Date();
+  const runDirectory = config.outDirectory ?? join(
+    process.cwd(),
+    "benchmark-results",
+    runStartedAt.toISOString().replace(/[:.]/g, "-")
+  );
+
+  if (config.rescoreDirectory) {
+    const rescoredSummary = await rescoreBenchmarkRun({
+      runDirectory: config.rescoreDirectory,
+      prompts,
+      config,
+    });
+    await writeJson(join(config.rescoreDirectory, "summary.rescored.json"), rescoredSummary);
+    await writeMarkdownReport(
+      join(config.rescoreDirectory, "benchmark-report.rescored.md"),
+      rescoredSummary,
+      prompts
+    );
+    console.log(JSON.stringify(rescoredSummary, null, 2));
+    // Return rather than process.exit(0): exit() ends the process immediately
+    // and can cut off a large summary that is still being written to stdout.
+    return;
+  }
+
+  if (config.systems.length === 0) {
+    console.error("No systems configured. Pass --system name='command with {{promptJson}}'.");
+    // Same reasoning as above: let the process drain and exit with status 1.
+    process.exitCode = 1;
+    return;
+  }
+
+  await mkdir(runDirectory, { recursive: true });
+
+  const laneResults = [];
+  for (const system of config.systems) {
+    for (const [promptIndex, promptDefinition] of prompts.entries()) {
+      const result = await runSystemPrompt({
+        system,
+        promptDefinition,
+        promptIndex,
+        promptCount: prompts.length,
+        runDirectory,
+        config,
+      });
+      laneResults.push(result);
+    }
+  }
+
+  const summary = {
+    testedAt: runStartedAt.toISOString(),
+    completedAt: new Date().toISOString(),
+    wallClockMs: Date.now() - runStartedAt.getTime(),
+    promptCount: prompts.length,
+    promptMix: promptMixSummary(prompts),
+    systems: config.systems.map(({ name }) => name),
+    costAssumptions: {
+      inputUsdPer1M: config.inputUsdPer1M,
+      outputUsdPer1M: config.outputUsdPer1M,
+      tinyFishAgentStepUsd: config.tinyFishAgentStepUsd,
+    },
+    aggregate: aggregateResults(laneResults),
+    laneResults,
+  };
+
+  await writeJson(join(runDirectory, "summary.json"), summary);
+  await writeMarkdownReport(join(runDirectory, "benchmark-report.md"), summary, prompts);
+  console.log(JSON.stringify(summary, null, 2));
+}
+
+const verifiedAt = "2026-05-20";
+const answerKeysByPromptId = {
+  "latest-ai-blog-posts": {
+    verifiedAt,
+    sourceUrls: [
+      "https://openai.com/index/advancing-content-provenance/",
+      "https://www.anthropic.com/news/anthropic-kpmg",
+      "https://deepmind.google/blog/co-scientist-a-multi-agent-ai-partner-to-accelerate-research/",
+    ],
+    scoringNotes:
+      "Latest-post titles drift. 
Score entity coverage, official domains, dated titles, and source URLs rather than one frozen title only.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "latest_post_title", "latest_post_date", "source_url"], + expectedEntities: [ + { + id: "openai", + label: "OpenAI", + aliases: ["openai"], + allowedSourceDomains: ["openai.com"], + requiredText: ["2026"], + }, + { + id: "anthropic", + label: "Anthropic", + aliases: ["anthropic"], + allowedSourceDomains: ["anthropic.com"], + requiredText: ["2026"], + }, + { + id: "google-deepmind", + label: "Google DeepMind", + aliases: ["google deepmind", "deepmind"], + allowedSourceDomains: ["deepmind.google"], + requiredText: ["2026"], + }, + ], + minimumExpectedEntityMatches: 3, + officialSourceDomains: ["openai.com", "anthropic.com", "deepmind.google"], + }, + "saas-pricing-pages": { + verifiedAt: "2026-05-22", + sourceUrls: [ + "https://stripe.com/pricing", + "https://www.paddle.com/pricing", + "https://www.chargebee.com/pricing/", + ], + scoringNotes: + "Pass requires all three vendors, official domains, and visible plan or price text. 
Paddle's current pricing page can show Checkout transaction pricing.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "pricing_page_url", "plan_or_price", "source_url"], + expectedEntities: [ + { + id: "stripe", + label: "Stripe", + aliases: ["stripe"], + allowedSourceDomains: ["stripe.com"], + requiredText: ["pricing"], + }, + { + id: "paddle", + label: "Paddle", + aliases: ["paddle"], + allowedSourceDomains: ["paddle.com"], + requiredText: ["checkout", "5%", "50"], + }, + { + id: "chargebee", + label: "Chargebee", + aliases: ["chargebee"], + allowedSourceDomains: ["chargebee.com"], + requiredText: ["starter", "performance", "enterprise"], + }, + ], + minimumExpectedEntityMatches: 3, + officialSourceDomains: ["stripe.com", "paddle.com", "chargebee.com"], + }, + "earnings-release-pages": { + verifiedAt: "2026-05-22", + sourceUrls: [ + "https://www.apple.com/newsroom/2026/04/apple-reports-second-quarter-results/", + "https://www.microsoft.com/en-us/investor/earnings/fy-2026-q3/press-release-webcast", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ], + scoringNotes: + "As of 2026-05-22, Apple latest verified release is fiscal 2026 Q2 on 2026-04-30, Microsoft is FY26 Q3 on 2026-04-29, and NVIDIA is Q1 fiscal 2027 on 2026-05-20.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "release_date", "fiscal_quarter", "source_url"], + expectedEntities: [ + { + id: "apple", + label: "Apple", + aliases: ["apple"], + allowedSourceDomains: ["apple.com"], + requiredText: ["second quarter", "q2", "2026", "april 30"], + }, + { + id: "microsoft", + label: "Microsoft", + aliases: ["microsoft"], + allowedSourceDomains: ["microsoft.com"], + requiredText: ["fy26 q3", "q3", "april 29", "2026"], + }, + { + id: "nvidia", + label: "NVIDIA", + aliases: ["nvidia"], + allowedSourceDomains: ["nvidia.com"], + requiredText: ["first quarter", "q1", "fiscal 2027", "may 20"], + }, + ], + 
minimumExpectedEntityMatches: 3, + officialSourceDomains: ["apple.com", "microsoft.com", "nvidia.com"], + }, + "mcp-docs-pages": { + verifiedAt, + sourceUrls: [ + "https://developers.openai.com/api/docs/mcp", + "https://platform.claude.com/docs/en/agents-and-tools/mcp-connector", + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + scoringNotes: + "Pass requires official docs for all three vendors. Blog posts, GitHub examples, and community roundups are not enough.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "docs_title", "docs_url", "summary"], + expectedEntities: [ + { + id: "openai", + label: "OpenAI", + aliases: ["openai"], + allowedSourceDomains: ["developers.openai.com", "platform.openai.com", "openai.com"], + requiredText: ["mcp"], + }, + { + id: "anthropic", + label: "Anthropic", + aliases: ["anthropic"], + allowedSourceDomains: ["docs.anthropic.com", "platform.claude.com"], + requiredText: ["mcp"], + }, + { + id: "cloudflare", + label: "Cloudflare", + aliases: ["cloudflare"], + allowedSourceDomains: ["developers.cloudflare.com"], + requiredText: ["mcp"], + }, + ], + minimumExpectedEntityMatches: 3, + officialSourceDomains: [ + "developers.openai.com", + "platform.openai.com", + "openai.com", + "docs.anthropic.com", + "platform.claude.com", + "developers.cloudflare.com", + ], + }, + "menlo-park-coca-cola": { + verifiedAt, + sourceUrls: [ + "https://order-menlopark.celiasrestaurants.com/", + "https://www.portablurestaurant.com/menus", + ], + scoringNotes: + "Pass requires direct menu/order evidence for Coke/Coca-Cola. 
A directory saying a restaurant exists is not proof.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "address", "serves_requested_item", "source_url"], + rowMustContainAny: ["coca-cola", "coke", "diet coke", "diet coca-cola"], + minimumScore: 0.7, + }, + "hcmc-bakery-products": { + verifiedAt, + sourceUrls: [ + "https://maisonmarou.com/product/croissant/", + "https://moncannele.com/products/box-of-9-mini", + ], + scoringNotes: + "Pass requires product-detail URLs from bakery-owned sites, not generic listicles.", + expectedBehavior: "answer", + requiredColumns: ["bakery_name", "product_name", "product_url", "source_url"], + expectedEntities: [ + { + id: "maison-marou", + label: "Maison Marou", + aliases: ["maison marou", "marou"], + allowedSourceDomains: ["maisonmarou.com"], + requiredText: ["croissant", "macaron", "opera", "pastry"], + }, + { + id: "mon-cannele", + label: "Mon Cannele", + aliases: ["mon cannele", "cannel"], + allowedSourceDomains: ["moncannele.com"], + requiredText: ["cannel"], + }, + ], + minimumExpectedEntityMatches: 1, + officialSourceDomains: ["maisonmarou.com", "moncannele.com"], + }, + "ny-ai-startup-careers": { + verifiedAt, + sourceUrls: [ + "https://www.runwayml.com/careers", + "https://www.huggingface.co/jobs", + "https://www.hebbia.ai/careers", + ], + scoringNotes: + "Pass requires company-owned websites or careers pages. 
One third-party startup directory with repeated 'View Jobs' text is not enough.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "company_website", "careers_page_url", "is_hiring"], + expectedEntities: [ + { + id: "runway", + label: "Runway", + aliases: ["runway"], + allowedSourceDomains: ["runwayml.com"], + requiredText: ["careers", "jobs"], + }, + { + id: "hugging-face", + label: "Hugging Face", + aliases: ["hugging face", "huggingface"], + allowedSourceDomains: ["huggingface.co"], + requiredText: ["jobs", "careers"], + }, + { + id: "hebbia", + label: "Hebbia", + aliases: ["hebbia"], + allowedSourceDomains: ["hebbia.ai"], + requiredText: ["careers", "jobs"], + }, + ], + minimumExpectedEntityMatches: 2, + }, + "vietnam-fintech-sites": { + verifiedAt, + sourceUrls: [ + "https://www.momo.vn/", + "https://zalopay.vn/", + "https://vnpay.vn/", + "https://www.finhay.com.vn/", + ], + scoringNotes: + "Pass requires official company/product domains for Vietnamese fintech examples.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "official_website", "description", "source_url"], + expectedEntities: [ + { + id: "momo", + label: "MoMo", + aliases: ["momo"], + allowedSourceDomains: ["momo.vn"], + }, + { + id: "zalopay", + label: "ZaloPay", + aliases: ["zalopay", "zalo pay"], + allowedSourceDomains: ["zalopay.vn"], + }, + { + id: "vnpay", + label: "VNPAY", + aliases: ["vnpay"], + allowedSourceDomains: ["vnpay.vn"], + }, + { + id: "finhay", + label: "Finhay", + aliases: ["finhay"], + allowedSourceDomains: ["finhay.com.vn"], + }, + ], + minimumExpectedEntityMatches: 3, + officialSourceDomains: ["momo.vn", "zalopay.vn", "vnpay.vn", "finhay.com.vn"], + }, + "district-one-coffee-sites": { + verifiedAt, + sourceUrls: ["https://tonkin.coffee/menu/", "https://www.cafehien.com/"], + scoringNotes: + "Pass requires a shop-owned site or online menu plus District 1 address evidence.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", 
"website_or_menu_url", "address", "source_url"], + expectedEntities: [ + { + id: "tonkin", + label: "Tonkin Coffee", + aliases: ["tonkin"], + allowedSourceDomains: ["tonkin.coffee"], + requiredText: ["district 1", "menu"], + }, + { + id: "hien", + label: "Hien Cafe", + aliases: ["hien cafe", "cafe hien"], + allowedSourceDomains: ["cafehien.com"], + requiredText: ["menu", "ho chi minh"], + }, + ], + minimumExpectedEntityMatches: 1, + }, + "amazon-starbucks-products": { + verifiedAt, + sourceUrls: ["https://www.amazon.com/stores/Starbucks/Starbucks/page/"], + scoringNotes: + "Pass requires Amazon product/listing evidence with product name, price, image URL, and stock/availability. If Amazon blocks access, an honest validation issue beats hallucinated products.", + expectedBehavior: "answer", + requiredColumns: ["product_name", "price", "image_url", "in_stock"], + officialSourceDomains: ["amazon.com"], + rowMustContainAny: ["starbucks"], + minimumScore: 0.7, + }, + "california-insurance-prices": { + verifiedAt, + sourceUrls: [ + "https://www.geico.com/auto-insurance/", + "https://www.progressive.com/auto/", + "https://www.statefarm.com/insurance/auto", + ], + scoringNotes: + "Actual prices require driver, vehicle, ZIP, coverage, and deductible. Best behavior is official quote pages plus missing-input validation, not invented premiums.", + expectedBehavior: "clarify_or_abstain", + requiredColumns: ["provider_name", "quote_page_url", "missing_inputs", "source_url"], + clarificationTerms: ["driver", "vehicle", "zip", "coverage", "deductible"], + officialSourceDomains: ["geico.com", "progressive.com", "statefarm.com"], + }, + "la-coke-menu-lol": { + verifiedAt, + sourceUrls: [], + scoringNotes: + "Pass requires direct LA menu/order evidence for Coke/Coca-Cola. 
Yelp/listicle rows are not enough.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "menu_url", "serves_requested_item", "source_url"], + rowMustContainAny: ["coca-cola", "coke", "diet coke", "soft drink"], + minimumScore: 0.9, + }, + "sf-ml-hiring-rn": { + verifiedAt, + sourceUrls: [ + "https://openai.com/careers/", + "https://www.anthropic.com/careers", + "https://www.perplexity.ai/careers", + ], + scoringNotes: + "Pass requires current company-owned careers/job pages with ML or AI role evidence near San Francisco or the Bay Area.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "careers_page_url", "open_role_title", "source_url"], + expectedEntities: [ + { + id: "openai", + label: "OpenAI", + aliases: ["openai"], + allowedSourceDomains: ["openai.com"], + requiredText: ["machine learning", "ml", "research", "engineer"], + }, + { + id: "anthropic", + label: "Anthropic", + aliases: ["anthropic"], + allowedSourceDomains: ["anthropic.com"], + requiredText: ["machine learning", "ml", "research", "engineer"], + }, + { + id: "perplexity", + label: "Perplexity", + aliases: ["perplexity"], + allowedSourceDomains: ["perplexity.ai"], + requiredText: ["machine learning", "ml", "engineer"], + }, + ], + minimumExpectedEntityMatches: 1, + }, + "latest-ai-company-stuff": { + verifiedAt, + sourceUrls: [], + scoringNotes: + "Prompt is underspecified. Best behavior is ask which companies and item types count, or return an explicitly scoped partial dataset with validation issues.", + expectedBehavior: "clarify_or_abstain", + requiredColumns: ["entity_name", "latest_item_title", "latest_item_url", "source_url"], + clarificationTerms: ["which companies", "source type", "news", "blog", "release", "columns"], + }, + "pastry-things-menlo": { + verifiedAt, + sourceUrls: ["https://mademoisellecolette.com/", "https://www.fleurdelysbakery.com/"], + scoringNotes: + "Pass requires bakery-owned websites or product/menu pages near Menlo Park. 
'Good' should not become invented ratings.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "product_or_business_name", "website_url", "source_url"], + expectedEntities: [ + { + id: "mademoiselle-colette", + label: "Mademoiselle Colette", + aliases: ["mademoiselle colette"], + allowedSourceDomains: ["mademoisellecolette.com"], + }, + { + id: "fleur-de-lys", + label: "Fleur de Lys", + aliases: ["fleur de lys"], + allowedSourceDomains: ["fleurdelysbakery.com"], + }, + ], + minimumExpectedEntityMatches: 1, + }, + "perplexity-like-companies": { + verifiedAt, + sourceUrls: ["https://www.perplexity.ai/", "https://you.com/", "https://www.glean.com/"], + scoringNotes: + "Prompt is vague but answerable as AI search/answer companies if the system explains the comparison. Pass requires official websites and a concrete similarity reason.", + expectedBehavior: "answer", + requiredColumns: ["entity_name", "official_website", "why_similar", "source_url"], + expectedEntities: [ + { + id: "you-com", + label: "You.com", + aliases: ["you.com", "youcom"], + allowedSourceDomains: ["you.com"], + requiredText: ["search", "answer", "ai"], + }, + { + id: "glean", + label: "Glean", + aliases: ["glean"], + allowedSourceDomains: ["glean.com"], + requiredText: ["search", "workplace", "ai"], + }, + { + id: "exa", + label: "Exa", + aliases: ["exa"], + allowedSourceDomains: ["exa.ai"], + requiredText: ["search", "web", "ai"], + }, + ], + minimumExpectedEntityMatches: 1, + }, +}; + +if (process.argv[1] && resolve(process.argv[1]) === fileURLToPath(import.meta.url)) { + await main(); +} + +async function runSystemPrompt(input) { + const startedAt = Date.now(); + const minimumRequiredColumns = minimumRequiredColumnsForPrompt( + input.promptDefinition + ); + const command = renderCommand(input.system.command, input.promptDefinition); + console.error( + `[${input.system.name}] ${input.promptIndex + 1}/${input.promptCount} ${input.promptDefinition.id}` + ); + + const execution = await 
runCommand({ + command, + timeoutMs: input.config.timeoutMs, + env: { + BIGSET_BENCHMARK_PROMPT: input.promptDefinition.prompt, + BIGSET_BENCHMARK_PROMPT_ID: input.promptDefinition.id, + BIGSET_BENCHMARK_PROMPT_QUALITY: input.promptDefinition.quality, + BIGSET_BENCHMARK_PERSONA: input.promptDefinition.persona, + BIGSET_BENCHMARK_EXPECTED_STRESS: input.promptDefinition.expectedStress, + BIGSET_BENCHMARK_REQUIRED_COLUMNS: input.promptDefinition.requiredColumns.join(","), + BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS: minimumRequiredColumns.join(","), + }, + }); + const parsedPayload = parseJsonPayload(execution.stdout); + const normalized = normalizePayload(parsedPayload); + const validation = evaluateRows({ + rows: normalized.rows, + promptDefinition: input.promptDefinition, + }); + const answerKeyScore = scoreBenchmarkRows({ + promptDefinition: input.promptDefinition, + rows: normalized.rows, + validationIssues: normalized.validationIssues, + validation, + minRequiredCompleteness: input.config.minRequiredCompleteness, + minFactualAccuracy: input.config.minFactualAccuracy, + }); + const usage = normalized.usage; + const estimatedModelCostUsd = estimateModelCostUsd(usage, input.config); + const estimatedTinyFishAgentCostUsd = roundUsd( + normalized.metrics.agentStepCount * input.config.tinyFishAgentStepUsd + ); + const infraBlockerReason = findInfrastructureBlockerReason({ + execution, + parsedPayload, + normalized, + }); + const status = infraBlockerReason + ? "blocked" + : execution.exitCode === 0 && parsedPayload && answerKeyScore.passed + ? 
"ok" + : "failed"; + + const promptRunDirectory = join( + input.runDirectory, + input.system.name, + `${String(input.promptIndex + 1).padStart(2, "0")}-${input.promptDefinition.id}` + ); + await mkdir(promptRunDirectory, { recursive: true }); + await writeFile(join(promptRunDirectory, "stdout.txt"), execution.stdout); + await writeFile(join(promptRunDirectory, "stderr.txt"), execution.stderr); + await writeJson(join(promptRunDirectory, "parsed-output.json"), parsedPayload ?? { + error: "No JSON object found in stdout.", + }); + + return { + system: input.system.name, + promptId: input.promptDefinition.id, + promptQuality: input.promptDefinition.quality, + promptPersona: input.promptDefinition.persona, + prompt: input.promptDefinition.prompt, + requestedColumns: input.promptDefinition.requiredColumns, + requiredColumns: input.promptDefinition.requiredColumns, + minimumRequiredColumns, + expectedStress: input.promptDefinition.expectedStress, + answerKey: answerKeyForPrompt(input.promptDefinition), + status, + failureCategory: status === "ok" ? undefined : ( + infraBlockerReason ? 
"infra" : answerKeyScore.failureCategory + ), + factualAccuracyScore: answerKeyScore.factualAccuracyScore, + entityCoverageRatio: answerKeyScore.entityCoverageRatio, + domainAccuracyRatio: answerKeyScore.domainAccuracyRatio, + evidenceSupportRatio: answerKeyScore.evidenceSupportRatio, + claimSupportRatio: answerKeyScore.claimSupportRatio, + abstentionScore: answerKeyScore.abstentionScore, + matchedExpectedEntities: answerKeyScore.matchedExpectedEntities, + missingExpectedEntities: answerKeyScore.missingExpectedEntities, + missingClaimSupportEntities: answerKeyScore.missingClaimSupportEntities, + latencyMs: Date.now() - startedAt, + exitCode: execution.exitCode, + timedOut: execution.timedOut, + rowCount: validation.rowCount, + nonEmptyCellCount: validation.nonEmptyCellCount, + totalExpectedCellCount: validation.totalExpectedCellCount, + requestedCellCompletenessRatio: validation.requestedCellCompletenessRatio, + requiredCellCompletenessRatio: validation.requiredCellCompletenessRatio, + sourceUrlCount: validation.sourceUrlCount, + evidenceQuoteCount: validation.evidenceQuoteCount, + duplicateIdentityCount: validation.duplicateIdentityCount, + missingRequestedCellCount: validation.missingRequestedCellCount, + missingRequestedCells: validation.missingRequestedCells, + missingRequiredCellCount: validation.missingRequiredCellCount, + missingRequiredCells: validation.missingRequiredCells, + needsReviewCount: validation.needsReviewCount, + validationIssueCount: normalized.validationIssues.length, + validationIssues: normalized.validationIssues, + usage, + searchCallCount: normalized.metrics.searchCallCount, + fetchCallCount: normalized.metrics.fetchCallCount, + browserCallCount: normalized.metrics.browserCallCount, + agentRunCount: normalized.metrics.agentRunCount, + agentStepCount: normalized.metrics.agentStepCount, + estimatedModelCostUsd, + estimatedTinyFishAgentCostUsd, + estimatedTotalCostUsd: roundUsd(estimatedModelCostUsd + estimatedTinyFishAgentCostUsd), + 
artifactDirectory: promptRunDirectory,
+    errorMessage: status === "ok"
+      ? undefined
+      : failureReason({
+          execution,
+          parsedPayload,
+          validation,
+          answerKeyScore,
+          infraBlockerReason,
+          minRequiredCompleteness: input.config.minRequiredCompleteness,
+          validationIssues: normalized.validationIssues,
+        }),
+  };
+}
+
+/**
+ * Returns the columns a row must fill for a prompt.
+ *
+ * @param promptDefinition Prompt entry; an explicit `minimumRequiredColumns`
+ *   array takes precedence, otherwise the set is inferred from
+ *   `requiredColumns` (treated as empty when absent).
+ * @returns De-duplicated column names.
+ */
+function minimumRequiredColumnsForPrompt(promptDefinition) {
+  if (Array.isArray(promptDefinition.minimumRequiredColumns)) {
+    return uniqueStrings(promptDefinition.minimumRequiredColumns);
+  }
+  return inferConservativeMinimumRequiredColumns(promptDefinition.requiredColumns ?? []);
+}
+
+/**
+ * Picks at most one column to act as the row identity.
+ *
+ * Candidates are tried in this order: the first `identityPriority` name that
+ * was requested, the first requested column ending in `name`, the first ending
+ * in `title`, the first `identityUrlPriority` name that was requested, and
+ * finally the first requested column that is not `source_url`, does not end in
+ * `_at`, does not contain `score`, and does not start with `is_` or `has_`.
+ *
+ * @param columns Requested column names.
+ * @returns A one-element array, or an empty array when nothing qualifies.
+ */
+function inferConservativeMinimumRequiredColumns(columns) {
+  const requestedColumns = uniqueStrings(columns);
+  const identityPriority = [
+    "entity_name",
+    "company_name",
+    "organization_name",
+    "provider_name",
+    "restaurant_name",
+    "store_name",
+    "business_name",
+    "bakery_name",
+    "product_name",
+    "person_name",
+    "profile_name",
+    "docs_title",
+    "latest_item_title",
+    "open_role_title",
+  ];
+  const identityUrlPriority = [
+    "company_domain",
+    "official_website",
+    "official_source_url",
+    "profile_url",
+    "linkedin_url",
+    "product_url",
+    "website_url",
+    "docs_url",
+    "careers_page_url",
+    "quote_page_url",
+    "menu_url",
+    "pricing_page_url",
+  ];
+
+  const prioritizedIdentityColumn = identityPriority.find((columnName) =>
+    requestedColumns.includes(columnName)
+  );
+  if (prioritizedIdentityColumn) {
+    return [prioritizedIdentityColumn];
+  }
+
+  const nameColumn = requestedColumns.find((columnName) =>
+    /(^|_)name$/.test(columnName)
+  );
+  if (nameColumn) {
+    return [nameColumn];
+  }
+
+  const titleColumn = requestedColumns.find((columnName) =>
+    /(^|_)title$/.test(columnName)
+  );
+  if (titleColumn) {
+    return [titleColumn];
+  }
+
+  const identityUrlColumn = identityUrlPriority.find((columnName) =>
+    requestedColumns.includes(columnName)
+  );
+  if (identityUrlColumn) {
+    return [identityUrlColumn];
+  }
+
+  const fallbackIdentityColumn = requestedColumns.find(
+    (columnName) =>
+      columnName !== "source_url" &&
+      !columnName.endsWith("_at") &&
+      !columnName.includes("score") &&
+      !columnName.startsWith("is_") &&
+      !columnName.startsWith("has_")
+  );
+
+  return fallbackIdentityColumn ? [fallbackIdentityColumn] : [];
+}
+
+/**
+ * Keeps the non-empty strings of `values`, dropping duplicates while
+ * preserving first-seen order.
+ */
+function uniqueStrings(values) {
+  return [...new Set(values.filter((value) => typeof value === "string" && value.length > 0))];
+}
+
+/**
+ * Parses the benchmark CLI arguments into a config object.
+ *
+ * @param args Arguments after the script path.
+ * @returns Config seeded with the defaults below and overridden by flags.
+ * @throws Error for an unknown flag, for a string-valued flag that is missing
+ *   its value, and for whatever `parsePromptIds` / `parseSystem` reject.
+ */
+function parseArgs(args) {
+  const config = {
+    promptsPath: defaultPromptsPath,
+    promptIds: null,
+    systems: [],
+    timeoutMs: 10 * 60 * 1000,
+    inputUsdPer1M: 0.05,
+    outputUsdPer1M: 0.5,
+    tinyFishAgentStepUsd: 0.015,
+    minRequiredCompleteness: 0.75,
+    minFactualAccuracy: defaultMinimumFactualAccuracy,
+  };
+  // A string-valued flag given as the last argument used to pass `undefined`
+  // on, which crashed with a TypeError or silently dropped the option.
+  const requireValue = (flag, flagValue) => {
+    if (typeof flagValue !== "string" || flagValue.length === 0) {
+      throw new Error(`${flag} requires a value`);
+    }
+    return flagValue;
+  };
+
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index];
+    const value = args[index + 1];
+    if (arg === "--prompts") {
+      config.promptsPath = requireValue(arg, value);
+      index += 1;
+    } else if (arg === "--prompt-ids") {
+      config.promptIds = parsePromptIds(requireValue(arg, value));
+      index += 1;
+    } else if (arg === "--out") {
+      config.outDirectory = requireValue(arg, value);
+      index += 1;
+    } else if (arg === "--rescore-dir") {
+      config.rescoreDirectory = requireValue(arg, value);
+      index += 1;
+    } else if (arg === "--system") {
+      const parsed = parseSystem(requireValue(arg, value));
+      config.systems.push(parsed);
+      index += 1;
+    } else if (arg === "--timeout-ms") {
+      // NOTE(review): the numeric helpers are defined outside this view and
+      // receive the current setting as second argument, presumably as the
+      // fallback for invalid input; left unchanged.
+      config.timeoutMs = positiveNumber(value, config.timeoutMs);
+      index += 1;
+    } else if (arg === "--input-usd-per-1m") {
+      config.inputUsdPer1M = nonNegativeNumber(value, config.inputUsdPer1M);
+      index += 1;
+    } else if (arg === "--output-usd-per-1m") {
+      config.outputUsdPer1M = nonNegativeNumber(value, config.outputUsdPer1M);
+      index += 1;
+    } else if (arg === "--tinyfish-agent-step-usd") {
+      config.tinyFishAgentStepUsd = nonNegativeNumber(value, config.tinyFishAgentStepUsd);
+      index += 1;
+    } else if (arg === "--min-required-completeness") {
+      config.minRequiredCompleteness = nonNegativeNumber(value, config.minRequiredCompleteness);
+      index += 1;
+    } else if (arg === "--min-factual-accuracy") {
+      config.minFactualAccuracy = nonNegativeNumber(value, config.minFactualAccuracy);
+      index += 1;
+    } else if (arg === "--help" || arg === "-h") {
+      printHelpAndExit();
+    } else {
+      throw new Error(`Unknown argument: ${arg}`);
+    }
+  }
+
+  return config;
+}
+
+/**
+ * Splits a comma-separated `--prompt-ids` value into trimmed, non-empty ids.
+ *
+ * @param value Raw flag value; anything that is not a string counts as empty.
+ * @returns The ids in the order given.
+ * @throws Error when no id remains.
+ */
+function parsePromptIds(value) {
+  const promptIds = (typeof value === "string" ? value : "")
+    .split(",")
+    .map((promptId) => promptId.trim())
+    .filter(Boolean);
+
+  if (promptIds.length === 0) {
+    throw new Error("--prompt-ids requires at least one prompt id");
+  }
+
+  return promptIds;
+}
+
+function selectPrompts(prompts, promptIds) {
+  if (!promptIds) {
+    return prompts;
+  }
+
+  const promptsById = new Map(prompts.map((promptDefinition) => [
+    promptDefinition.id,
+    promptDefinition,
+  ]));
+  const selectedPrompts = [];
+  const missingPromptIds = [];
+
+  for (const promptId of promptIds) {
+    const promptDefinition = promptsById.get(promptId);
+    if (promptDefinition) {
+      selectedPrompts.push(promptDefinition);
+    } else {
+      missingPromptIds.push(promptId);
+    }
+  }
+
+  if (missingPromptIds.length > 0) {
+    const availablePromptIds = prompts.map((promptDefinition) => promptDefinition.id).join(", ");
+    throw new Error(
+      `Unknown prompt id(s): ${missingPromptIds.join(", ")}. 
Available ids: ${availablePromptIds}`
+    );
+  }
+
+  return selectedPrompts;
+}
+
+/**
+ * Parses a `--system` value of the form `name=command`.
+ *
+ * @param value Raw flag value; split at the first `=`.
+ * @returns `{ name, command }`, both trimmed.
+ * @throws Error when the value is not a string, has no `=`, or leaves the
+ *   name or the command empty.
+ */
+function parseSystem(value) {
+  const separatorIndex = typeof value === "string" ? value.indexOf("=") : -1;
+  if (separatorIndex <= 0) {
+    throw new Error("--system must look like name=command");
+  }
+
+  const name = value.slice(0, separatorIndex).trim();
+  const command = value.slice(separatorIndex + 1).trim();
+  if (!name || !command) {
+    throw new Error("--system must look like name=command");
+  }
+
+  return { name, command };
+}
+
+/**
+ * Fills the `{{...}}` placeholders of a system command template with
+ * shell-escaped values taken from the prompt definition.
+ *
+ * @param command Command template supplied through `--system`.
+ * @param promptDefinition Prompt entry providing `prompt`, `id` and
+ *   `requiredColumns`.
+ * @returns The command with every placeholder occurrence replaced.
+ */
+function renderCommand(command, promptDefinition) {
+  const minimumRequiredColumns = minimumRequiredColumnsForPrompt(promptDefinition);
+  const substitutions = [
+    ["{{prompt}}", shellEscape(promptDefinition.prompt)],
+    ["{{promptJson}}", shellEscape(JSON.stringify(promptDefinition.prompt))],
+    ["{{promptId}}", shellEscape(promptDefinition.id)],
+    ["{{requiredColumnsJson}}", shellEscape(JSON.stringify(promptDefinition.requiredColumns))],
+    ["{{minimumRequiredColumnsJson}}", shellEscape(JSON.stringify(minimumRequiredColumns))],
+  ];
+  return substitutions.reduce(
+    // A replacer function inserts its result verbatim. A replacement *string*
+    // would have "$$", "$&", "$`" and "$'" expanded, corrupting any prompt
+    // that contains a dollar sign next to one of those characters.
+    (rendered, [placeholder, replacement]) => rendered.replaceAll(placeholder, () => replacement),
+    command
+  );
+}
+
+/**
+ * Runs a shell command with a timeout and buffers its output.
+ *
+ * @param command Shell command line (spawned with `shell: true`).
+ * @param timeoutMs Milliseconds before SIGTERM is sent to the child.
+ * @param env Extra environment variables layered over `process.env`.
+ * @returns Promise resolving to `{ stdout, stderr, exitCode, timedOut }`;
+ *   `exitCode` is 1 when the process reports no exit code or never started.
+ */
+function runCommand({ command, timeoutMs, env }) {
+  return new Promise((resolve) => {
+    const child = spawn(command, {
+      shell: true,
+      env: { ...process.env, ...env },
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+    let stdout = "";
+    let stderr = "";
+    let timedOut = false;
+    // NOTE(review): SIGTERM goes to the spawned shell; whether processes it
+    // started also stop (and release the pipes so "close" fires) should be
+    // confirmed for long-running systems.
+    const timeout = setTimeout(() => {
+      timedOut = true;
+      child.kill("SIGTERM");
+    }, timeoutMs);
+
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk.toString();
+    });
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk.toString();
+    });
+    // Without an "error" listener a failed spawn is raised as an unhandled
+    // EventEmitter error and aborts the whole benchmark run.
+    child.on("error", (error) => {
+      stderr += `${error.message}\n`;
+      // pid is undefined only when the process never started; other errors
+      // (such as a failed kill) still end with a "close" event.
+      if (child.pid === undefined) {
+        clearTimeout(timeout);
+        resolve({ stdout, stderr, exitCode: 1, timedOut });
+      }
+    });
+    child.on("close", (exitCode) => {
+      clearTimeout(timeout);
+      resolve({ stdout, stderr, exitCode: exitCode ?? 1, timedOut });
+    });
+  });
+}
+
+/**
+ * Parses the JSON payload a system printed to stdout.
+ *
+ * The whole trimmed output is tried first; if that fails, the last
+ * brace-balanced object in the output is tried instead.
+ *
+ * @param stdout Captured standard output.
+ * @returns The parsed value, or `null` when the output is empty or no JSON
+ *   could be parsed.
+ */
+function parseJsonPayload(stdout) {
+  const trimmed = stdout.trim();
+  if (!trimmed) {
+    return null;
+  }
+
+  try {
+    return JSON.parse(trimmed);
+  } catch {
+    const lastObject = extractLastJsonObject(trimmed);
+    if (!lastObject) {
+      return null;
+    }
+    try {
+      return JSON.parse(lastObject);
+    } catch {
+      return null;
+    }
+  }
+}
+
+/**
+ * Returns the text of the last brace-balanced `{...}` span in `value`.
+ *
+ * The scan runs backwards from the final `}` and counts braces only; braces
+ * inside JSON string values are counted too, so the result is a candidate
+ * that the caller still has to parse.
+ *
+ * @param value Text that may contain log lines around a JSON object.
+ * @returns The candidate substring, or `null` when no balanced span exists.
+ */
+function extractLastJsonObject(value) {
+  let depth = 0;
+  let endIndex = -1;
+  for (let index = value.length - 1; index >= 0; index -= 1) {
+    const char = value[index];
+    if (char === "}") {
+      if (endIndex === -1) {
+        endIndex = index;
+      }
+      depth += 1;
+    } else if (char === "{" && endIndex !== -1) {
+      // An opening brace met before any "}" belongs to trailing noise after
+      // the object; counting it would push depth negative and hide the match.
+      depth -= 1;
+      if (depth === 0) {
+        return value.slice(index, endIndex + 1);
+      }
+    }
+  }
+  return null;
+}
+
+/**
+ * Maps the differently named fields a system may emit onto one shape.
+ *
+ * Rows are taken from the first present field among `rows`, `data`,
+ * `records`, `result` and `datasetRows`; issues from `validationIssues`,
+ * `issues` or `errors`; metrics from `metrics` or `benchmarkMetrics`; usage
+ * from `usage`, then `metrics.usage`, then the metrics object itself.
+ *
+ * @param payload Parsed system output; may be `null`.
+ * @returns `{ rows, validationIssues, usage, metrics }` with call and step
+ *   counters passed through `numberValue`.
+ */
+function normalizePayload(payload) {
+  const rows = arrayValue(
+    payload?.rows ??
+      payload?.data ??
+      payload?.records ??
+      payload?.result ??
+      payload?.datasetRows
+  );
+  const validationIssues = stringArrayValue(
+    payload?.validationIssues ?? payload?.issues ?? payload?.errors
+  );
+  const metrics = payload?.metrics ?? payload?.benchmarkMetrics ?? {};
+  const usage = normalizeUsage(payload?.usage ?? metrics.usage ?? metrics);
+
+  return {
+    rows,
+    validationIssues,
+    usage,
+    metrics: {
+      searchCallCount: numberValue(metrics.searchCallCount ?? metrics.searchCalls),
+      fetchCallCount: numberValue(metrics.fetchCallCount ?? metrics.fetchCalls),
+      browserCallCount: numberValue(metrics.browserCallCount ?? metrics.browserCalls),
+      agentRunCount: numberValue(metrics.agentRunCount ?? metrics.agentRuns),
+      agentStepCount: numberValue(metrics.agentStepCount ?? metrics.agentSteps),
+    },
+  };
+}
+
+function normalizeUsage(value) {
+  return {
+    promptTokens: numberValue(value?.promptTokens ?? value?.inputTokens ?? value?.prompt_tokens),
+    completionTokens: numberValue(
+      value?.completionTokens ?? value?.outputTokens ?? value?.completion_tokens
+    ),
+    totalTokens: numberValue(value?.totalTokens ?? 
value?.total_tokens), + }; +} + +function evaluateRows({ rows, promptDefinition }) { + const missingRequiredCells = []; + const sourceUrls = new Set(); + const identityKeys = new Set(); + let duplicateIdentityCount = 0; + let nonEmptyCellCount = 0; + let evidenceQuoteCount = 0; + let needsReviewCount = 0; + + for (const [rowIndex, row] of rows.entries()) { + const cells = rowCells(row); + const identity = identityKey(cells, row); + if (identity) { + if (identityKeys.has(identity)) { + duplicateIdentityCount += 1; + } + identityKeys.add(identity); + } + + for (const requiredColumn of promptDefinition.requiredColumns) { + const value = cells[requiredColumn] ?? row?.[requiredColumn]; + if (isPresent(value)) { + nonEmptyCellCount += 1; + } else { + missingRequiredCells.push({ rowIndex, column: requiredColumn }); + } + } + + for (const url of rowSourceUrls(row, cells)) { + sourceUrls.add(url); + } + evidenceQuoteCount += rowEvidenceQuoteCount(row); + if (row?.needsReview === true || row?.needs_review === true) { + needsReviewCount += 1; + } + } + + const totalExpectedCellCount = rows.length * promptDefinition.requiredColumns.length; + const requiredCellCompletenessRatio = totalExpectedCellCount === 0 + ? 
0 + : roundRatio(nonEmptyCellCount / totalExpectedCellCount); + + return { + rowCount: rows.length, + nonEmptyCellCount, + totalExpectedCellCount, + requestedCellCompletenessRatio: requiredCellCompletenessRatio, + requiredCellCompletenessRatio, + sourceUrlCount: sourceUrls.size, + evidenceQuoteCount, + duplicateIdentityCount, + missingRequestedCellCount: missingRequiredCells.length, + missingRequestedCells: missingRequiredCells, + missingRequiredCellCount: missingRequiredCells.length, + missingRequiredCells, + needsReviewCount, + }; +} + +async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { + const previousSummary = JSON.parse(await readFile(join(runDirectory, "summary.json"), "utf8")); + const promptsById = new Map(prompts.map((promptDefinition) => [ + promptDefinition.id, + promptDefinition, + ])); + const rescoredLaneResults = []; + + for (const laneResult of previousSummary.laneResults ?? []) { + if (config.promptIds && !config.promptIds.includes(laneResult.promptId)) { + continue; + } + + const promptDefinition = promptsById.get(laneResult.promptId); + if (!promptDefinition) { + rescoredLaneResults.push(laneResult); + continue; + } + + const artifactDirectory = await resolveRescoreArtifactDirectory({ + runDirectory, + laneResult, + }); + const parsedPayload = await readJsonOrNull(join(artifactDirectory, "parsed-output.json")); + const stdout = await readTextOrEmpty(join(artifactDirectory, "stdout.txt")); + const stderr = await readTextOrEmpty(join(artifactDirectory, "stderr.txt")); + const usablePayload = parsedPayload?.error ? 
null : parsedPayload; + const normalized = normalizePayload(usablePayload); + const validation = evaluateRows({ rows: normalized.rows, promptDefinition }); + const answerKeyScore = scoreBenchmarkRows({ + promptDefinition, + rows: normalized.rows, + validationIssues: normalized.validationIssues, + validation, + minRequiredCompleteness: config.minRequiredCompleteness, + minFactualAccuracy: config.minFactualAccuracy, + }); + const execution = { + stdout, + stderr, + exitCode: laneResult.exitCode ?? 0, + timedOut: Boolean(laneResult.timedOut), + }; + const infraBlockerReason = findInfrastructureBlockerReason({ + execution, + parsedPayload: usablePayload, + normalized, + }); + const status = infraBlockerReason + ? "blocked" + : execution.exitCode === 0 && usablePayload && answerKeyScore.passed + ? "ok" + : "failed"; + + rescoredLaneResults.push({ + ...laneResult, + requestedColumns: promptDefinition.requiredColumns, + requiredColumns: promptDefinition.requiredColumns, + minimumRequiredColumns: minimumRequiredColumnsForPrompt(promptDefinition), + expectedStress: promptDefinition.expectedStress, + answerKey: answerKeyForPrompt(promptDefinition), + status, + failureCategory: status === "ok" ? undefined : ( + infraBlockerReason ? 
"infra" : answerKeyScore.failureCategory + ), + factualAccuracyScore: answerKeyScore.factualAccuracyScore, + entityCoverageRatio: answerKeyScore.entityCoverageRatio, + domainAccuracyRatio: answerKeyScore.domainAccuracyRatio, + evidenceSupportRatio: answerKeyScore.evidenceSupportRatio, + claimSupportRatio: answerKeyScore.claimSupportRatio, + abstentionScore: answerKeyScore.abstentionScore, + matchedExpectedEntities: answerKeyScore.matchedExpectedEntities, + missingExpectedEntities: answerKeyScore.missingExpectedEntities, + missingClaimSupportEntities: answerKeyScore.missingClaimSupportEntities, + rowCount: validation.rowCount, + nonEmptyCellCount: validation.nonEmptyCellCount, + totalExpectedCellCount: validation.totalExpectedCellCount, + requestedCellCompletenessRatio: validation.requestedCellCompletenessRatio, + requiredCellCompletenessRatio: validation.requiredCellCompletenessRatio, + sourceUrlCount: validation.sourceUrlCount, + evidenceQuoteCount: validation.evidenceQuoteCount, + duplicateIdentityCount: validation.duplicateIdentityCount, + missingRequestedCellCount: validation.missingRequestedCellCount, + missingRequestedCells: validation.missingRequestedCells, + missingRequiredCellCount: validation.missingRequiredCellCount, + missingRequiredCells: validation.missingRequiredCells, + needsReviewCount: validation.needsReviewCount, + validationIssueCount: normalized.validationIssues.length, + validationIssues: normalized.validationIssues, + errorMessage: status === "ok" + ? 
undefined + : failureReason({ + execution, + parsedPayload: usablePayload, + validation, + answerKeyScore, + infraBlockerReason, + minRequiredCompleteness: config.minRequiredCompleteness, + validationIssues: normalized.validationIssues, + }), + }); + } + + return { + ...previousSummary, + rescoredAt: new Date().toISOString(), + aggregate: aggregateResults(rescoredLaneResults), + laneResults: rescoredLaneResults, + }; +} + +async function resolveRescoreArtifactDirectory({ runDirectory, laneResult }) { + const declaredArtifactDirectory = laneResult.artifactDirectory; + const candidates = []; + + if (declaredArtifactDirectory) { + candidates.push(declaredArtifactDirectory); + + const normalizedArtifactDirectory = declaredArtifactDirectory.replaceAll("\\", "/"); + const runDirectoryName = runDirectory.split(/[\\/]/).filter(Boolean).at(-1); + const runDirectoryMarker = runDirectoryName ? `${runDirectoryName}/` : null; + const markerIndex = runDirectoryMarker + ? normalizedArtifactDirectory.indexOf(runDirectoryMarker) + : -1; + + if (markerIndex >= 0) { + const artifactPathWithinRun = normalizedArtifactDirectory.slice( + markerIndex + runDirectoryMarker.length + ); + candidates.push(join(runDirectory, ...artifactPathWithinRun.split("/"))); + } + + candidates.push( + join( + runDirectory, + laneResult.system, + normalizedArtifactDirectory.split("/").filter(Boolean).at(-1) ?? 
laneResult.promptId + ) + ); + } + + candidates.push(join(runDirectory, laneResult.system, laneResult.promptId)); + + for (const candidate of uniqueStrings(candidates)) { + const parsedPayload = await readJsonOrNull(join(candidate, "parsed-output.json")); + if (parsedPayload) return candidate; + } + + return candidates[0]; +} + +export function scoreBenchmarkRows(input) { + const answerKey = answerKeyForPrompt(input.promptDefinition); + const rowTexts = input.rows.map(rowSearchText); + const validationIssueText = input.validationIssues.join(" ").toLowerCase(); + const allText = [...rowTexts, validationIssueText].join(" "); + const expectedEntities = answerKey.expectedEntities ?? []; + const matchedExpectedEntities = []; + const missingExpectedEntities = []; + const missingClaimSupportEntities = []; + let expectedEntityDomainMatches = 0; + let expectedEntityClaimMatches = 0; + + for (const expectedEntity of expectedEntities) { + const aliases = expectedEntity.aliases ?? [expectedEntity.label, expectedEntity.id]; + const aliasMatched = aliases.some((alias) => allText.includes(String(alias).toLowerCase())); + if (!aliasMatched) { + missingExpectedEntities.push(expectedEntity.label ?? expectedEntity.id); + continue; + } + + matchedExpectedEntities.push(expectedEntity.label ?? expectedEntity.id); + const entityRows = input.rows.filter((row) => { + const rowText = rowSearchText(row); + return aliases.some((alias) => rowText.includes(String(alias).toLowerCase())); + }); + const rowsToCheck = entityRows.length > 0 ? entityRows : input.rows; + if (rowsToCheck.some((row) => rowHasAllowedDomain(row, expectedEntity.allowedSourceDomains))) { + expectedEntityDomainMatches += 1; + } + const hasRequiredClaimText = !expectedEntity.requiredText?.length || + rowsToCheck.some((row) => textContainsAny(rowSearchText(row), expectedEntity.requiredText)); + if (hasRequiredClaimText) { + expectedEntityClaimMatches += 1; + } else { + missingClaimSupportEntities.push(expectedEntity.label ?? 
expectedEntity.id); + } + } + + const minimumEntityMatches = answerKey.minimumExpectedEntityMatches ?? expectedEntities.length; + const entityCoverageRatio = expectedEntities.length === 0 + ? 1 + : roundRatio(matchedExpectedEntities.length / Math.max(1, minimumEntityMatches)); + const domainAccuracyRatio = expectedEntities.length > 0 + ? roundRatio(expectedEntityDomainMatches / Math.max(1, matchedExpectedEntities.length)) + : domainCoverageRatio(input.rows, answerKeyDomains(answerKey)); + const evidenceSupportRatio = input.validation.rowCount === 0 + ? 0 + : roundRatio(input.validation.evidenceQuoteCount / Math.max(1, input.validation.rowCount)); + const claimSupportRatio = claimSupportRatioForRows({ + rows: input.rows, + answerKey, + expectedEntities, + expectedEntityClaimMatches, + matchedExpectedEntityCount: matchedExpectedEntities.length, + }); + const abstentionScore = answerKey.expectedBehavior === "clarify_or_abstain" + ? clarificationScore(allText, answerKey.clarificationTerms ?? []) + : 0; + const shapeScore = shapeScoreForRows({ + validation: input.validation, + minRequiredCompleteness: input.minRequiredCompleteness, + expectedBehavior: answerKey.expectedBehavior, + validationIssues: input.validationIssues, + }); + const factualAccuracyScore = answerKey.expectedBehavior === "clarify_or_abstain" + ? roundRatio( + shapeScore * 0.2 + + domainAccuracyRatio * 0.2 + + abstentionScore * 0.6 + ) + : roundRatio( + shapeScore * 0.25 + + Math.min(1, entityCoverageRatio) * 0.3 + + domainAccuracyRatio * 0.2 + + Math.min(1, evidenceSupportRatio) * 0.15 + + claimSupportRatio * 0.1 + ); + const minimumScore = answerKey.minimumScore ?? 
input.minFactualAccuracy; + const hasExpectedEntityCoverage = expectedEntities.length === 0 || + matchedExpectedEntities.length >= minimumEntityMatches; + const hasRequiredDomainAccuracy = !requiresDomainProof(answerKey, expectedEntities) || + domainAccuracyRatio >= 1; + const hasRequiredClaimSupport = !requiresClaimProof(answerKey, expectedEntities) || + claimSupportRatio >= 1; + const passed = answerKey.expectedBehavior === "clarify_or_abstain" + ? factualAccuracyScore >= minimumScore && abstentionScore >= 0.5 + : factualAccuracyScore >= minimumScore && + shapeScore >= 1 && + hasExpectedEntityCoverage && + hasRequiredDomainAccuracy && + hasRequiredClaimSupport; + + return { + passed, + failureCategory: failureCategoryForScore({ + answerKey, + parsedRows: input.rows, + shapeScore, + entityCoverageRatio, + domainAccuracyRatio, + evidenceSupportRatio, + claimSupportRatio, + abstentionScore, + factualAccuracyScore, + minimumScore, + }), + factualAccuracyScore, + entityCoverageRatio: roundRatio(Math.min(1, entityCoverageRatio)), + domainAccuracyRatio, + evidenceSupportRatio: roundRatio(Math.min(1, evidenceSupportRatio)), + claimSupportRatio, + abstentionScore, + matchedExpectedEntities, + missingExpectedEntities, + missingClaimSupportEntities, + minimumScore, + }; +} + +function answerKeyForPrompt(promptDefinition) { + return promptDefinition.answerKey ?? answerKeysByPromptId[promptDefinition.id] ?? { + expectedBehavior: "answer", + requiredColumns: promptDefinition.requiredColumns, + sourceUrls: [], + scoringNotes: "No prompt-specific answer key. 
Falling back to shape-only scoring.", + }; +} + +function shapeScoreForRows({ validation, minRequiredCompleteness, expectedBehavior, validationIssues }) { + if (expectedBehavior === "clarify_or_abstain" && validationIssues.length > 0) { + return 1; + } + if (validation.rowCount === 0 || validation.sourceUrlCount === 0 || validation.evidenceQuoteCount === 0) { + return 0; + } + if (validation.requiredCellCompletenessRatio < minRequiredCompleteness) { + return roundRatio(validation.requiredCellCompletenessRatio / Math.max(0.001, minRequiredCompleteness)); + } + return 1; +} + +function claimSupportRatioForRows({ + rows, + answerKey, + expectedEntities, + expectedEntityClaimMatches, + matchedExpectedEntityCount, +}) { + if (answerKey.rowMustContainAny?.length) { + const matchingRows = rows.filter((row) => + textContainsAny(rowSearchText(row), answerKey.rowMustContainAny) + ).length; + return rows.length === 0 ? 0 : roundRatio(matchingRows / rows.length); + } + if (expectedEntities.some((entity) => entity.requiredText?.length)) { + return roundRatio(expectedEntityClaimMatches / Math.max(1, matchedExpectedEntityCount)); + } + return rows.length > 0 ? 1 : 0; +} + +function domainCoverageRatio(rows, allowedDomains) { + if (!allowedDomains?.length) { + if (rows.length === 0) return 0; + const hasPlaceholderOnly = rows.every((row) => { + const cells = rowCells(row); + const hostnames = rowSourceUrls(row, cells).map(urlHostname).filter(Boolean); + return hostnames.length > 0 && hostnames.every(isPlaceholderHostname); + }); + return hasPlaceholderOnly ? 0 : 1; + } + if (rows.length === 0) return 0; + const matchingRows = rows.filter((row) => rowHasAllowedDomain(row, allowedDomains)).length; + return roundRatio(matchingRows / rows.length); +} + +function answerKeyDomains(answerKey) { + const configuredDomains = answerKey.officialSourceDomains ?? []; + const sourceDomains = (answerKey.sourceUrls ?? 
[]).map(urlHostname).filter(Boolean); + return [...new Set([...configuredDomains, ...sourceDomains])]; +} + +function requiresDomainProof(answerKey, expectedEntities) { + return answerKeyDomains(answerKey).length > 0 || + expectedEntities.some((entity) => entity.allowedSourceDomains?.length); +} + +function requiresClaimProof(answerKey, expectedEntities) { + return Boolean(answerKey.rowMustContainAny?.length) || + expectedEntities.some((entity) => entity.requiredText?.length); +} + +function isPlaceholderHostname(hostname) { + return hostname === "example.com" || + hostname.endsWith(".example.com") || + hostname === "localhost" || + hostname === "127.0.0.1"; +} + +function clarificationScore(text, terms) { + if (terms.length === 0) return text.length > 0 ? 1 : 0; + const matchedTerms = terms.filter((term) => text.includes(term.toLowerCase())).length; + return roundRatio(matchedTerms / terms.length); +} + +function failureCategoryForScore(input) { + if (input.parsedRows.length === 0 && input.answerKey.expectedBehavior !== "clarify_or_abstain") { + return "schema"; + } + if (input.shapeScore < 1) return "source_evidence"; + if (input.answerKey.expectedBehavior === "clarify_or_abstain" && input.abstentionScore < 0.5) { + return "clarification"; + } + if (input.entityCoverageRatio < 1) return "factual_accuracy"; + if (input.domainAccuracyRatio < 1) return "source_evidence"; + if (input.claimSupportRatio < 1) return "factual_accuracy"; + if (input.factualAccuracyScore < input.minimumScore) return "factual_accuracy"; + return "factual_accuracy"; +} + +export function findInfrastructureBlockerReason({ execution, parsedPayload, normalized }) { + const combinedText = [ + execution.stderr, + execution.stdout, + JSON.stringify(parsedPayload ?? {}), + ...(normalized?.validationIssues ?? 
[]), + ].join("\n").toLowerCase(); + + if (execution.timedOut) return "Command timed out."; + const blockerPatterns = [ + /authentication failed/, + /active subscription/, + /insufficient credits/, + /not enough credits/, + /(?:missing|required|invalid|not configured|not set|unset)[^.]{0,80}api[_ -]?key/, + /api[_ -]?key[^.]{0,80}(?:missing|required|invalid|not configured|not set|unset)/, + /tinyfish_api_key/, + /openrouter_api_key/, + /quota exceeded/, + /rate[_ -]?limit[_ -]?exceeded/, + /benchmark deadline/, + ]; + return blockerPatterns.some((pattern) => pattern.test(combinedText)) + ? "Infrastructure/auth/credits blocker." + : null; +} + +function aggregateResults(results) { + const groups = new Map(); + for (const result of results) { + groups.set(result.system, [...(groups.get(result.system) ?? []), result]); + } + + return Array.from(groups.entries()).map(([system, group]) => { + const passed = group.filter((result) => result.status === "ok").length; + const blocked = group.filter((result) => result.status === "blocked").length; + const failed = group.length - passed - blocked; + const eligibleGroup = group.filter((result) => result.status !== "blocked"); + const eligibleCount = eligibleGroup.length; + const totalLatencyMs = sum(group, "latencyMs"); + const totalEstimatedCostUsd = sum(group, "estimatedTotalCostUsd"); + return { + system, + total: group.length, + passed, + failed, + blocked, + passRate: roundRatio(passed / Math.max(1, group.length)), + eligiblePassRate: roundRatio(passed / Math.max(1, eligibleCount)), + wallClockMs: totalLatencyMs, + avgLatencyMs: Math.round(totalLatencyMs / Math.max(1, group.length)), + avgRequiredCellCompletenessRatio: roundRatio( + sum(eligibleGroup, "requiredCellCompletenessRatio") / Math.max(1, eligibleCount) + ), + avgRequestedCellCompletenessRatio: roundRatio( + sum(eligibleGroup, "requestedCellCompletenessRatio") / Math.max(1, eligibleCount) + ), + avgFactualAccuracyScore: roundRatio( + sum(eligibleGroup, 
"factualAccuracyScore") / Math.max(1, eligibleCount) + ), + avgEntityCoverageRatio: roundRatio( + sum(eligibleGroup, "entityCoverageRatio") / Math.max(1, eligibleCount) + ), + avgDomainAccuracyRatio: roundRatio( + sum(eligibleGroup, "domainAccuracyRatio") / Math.max(1, eligibleCount) + ), + totalRows: sum(group, "rowCount"), + totalEvidenceQuotes: sum(group, "evidenceQuoteCount"), + totalSourceUrls: sum(group, "sourceUrlCount"), + totalMissingRequestedCells: sum(group, "missingRequestedCellCount"), + totalMissingRequiredCells: sum(group, "missingRequiredCellCount"), + totalDuplicateIdentities: sum(group, "duplicateIdentityCount"), + totalPromptTokens: group.reduce((total, result) => total + result.usage.promptTokens, 0), + totalCompletionTokens: group.reduce((total, result) => total + result.usage.completionTokens, 0), + totalTokens: group.reduce((total, result) => total + result.usage.totalTokens, 0), + searchCallCount: sum(group, "searchCallCount"), + fetchCallCount: sum(group, "fetchCallCount"), + browserCallCount: sum(group, "browserCallCount"), + agentRunCount: sum(group, "agentRunCount"), + agentStepCount: sum(group, "agentStepCount"), + estimatedTotalCostUsd: roundUsd(totalEstimatedCostUsd), + }; + }); +} + +async function writeMarkdownReport(filePath, summary, prompts) { + const lines = [ + "# Dataset Agent Benchmark Report", + "", + `Tested: ${summary.testedAt}`, + `Completed: ${summary.completedAt}`, + `Wall clock: ${formatDuration(summary.wallClockMs)}`, + `Prompt mix: good ${summary.promptMix.good}, average ${summary.promptMix.average}, bad ${summary.promptMix.bad}`, + "", + "## Aggregate", + "", + "| System | Runs | Passed | Failed | Blocked | Pass Rate | Eligible Pass | Avg Accuracy | Avg Latency | Rows | Evidence | Sources | Completeness | Missing Requested | Duplicates | Tokens In | Tokens Out | Agent Steps | Est Cost |", + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | 
---: |", + ...summary.aggregate.map((row) => + `| ${escapeMarkdown(row.system)} | ${row.total} | ${row.passed} | ${row.failed} | ${row.blocked} | ${row.passRate} | ${row.eligiblePassRate} | ${row.avgFactualAccuracyScore} | ${formatDuration(row.avgLatencyMs)} | ${row.totalRows} | ${row.totalEvidenceQuotes} | ${row.totalSourceUrls} | ${row.avgRequestedCellCompletenessRatio ?? row.avgRequiredCellCompletenessRatio} | ${row.totalMissingRequestedCells ?? row.totalMissingRequiredCells} | ${row.totalDuplicateIdentities} | ${row.totalPromptTokens} | ${row.totalCompletionTokens} | ${row.agentStepCount} | ${formatUsd(row.estimatedTotalCostUsd)} |` + ), + "", + "## Prompt Pack", + "", + "| # | Quality | Persona | Prompt | Requested Columns | Minimum Required | Stress |", + "| ---: | --- | --- | --- | --- | --- | --- |", + ...prompts.map((prompt, index) => + `| ${index + 1} | ${prompt.quality} | ${escapeMarkdown(prompt.persona)} | ${escapeMarkdown(prompt.prompt)} | ${prompt.requiredColumns.join(", ")} | ${minimumRequiredColumnsForPrompt(prompt).join(", ")} | ${escapeMarkdown(prompt.expectedStress)} |` + ), + "", + "## Raw Results", + "", + "| System | Prompt | Quality | Status | Category | Accuracy | Entity Coverage | Domain Accuracy | Latency | Rows | Completeness | Evidence | Sources | Missing Requested | Duplicates | Tokens In | Tokens Out | Search | Fetch | Browser | Agent Runs | Agent Steps | Est Cost | Issue |", + "| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |", + ...summary.laneResults.map((result) => + `| ${escapeMarkdown(result.system)} | ${escapeMarkdown(result.promptId)} | ${result.promptQuality} | ${result.status} | ${escapeMarkdown(result.failureCategory ?? "")} | ${result.factualAccuracyScore ?? 0} | ${result.entityCoverageRatio ?? 0} | ${result.domainAccuracyRatio ?? 
0} | ${formatDuration(result.latencyMs)} | ${result.rowCount} | ${result.requestedCellCompletenessRatio ?? result.requiredCellCompletenessRatio} | ${result.evidenceQuoteCount} | ${result.sourceUrlCount} | ${result.missingRequestedCellCount ?? result.missingRequiredCellCount} | ${result.duplicateIdentityCount} | ${result.usage.promptTokens} | ${result.usage.completionTokens} | ${result.searchCallCount} | ${result.fetchCallCount} | ${result.browserCallCount} | ${result.agentRunCount} | ${result.agentStepCount} | ${formatUsd(result.estimatedTotalCostUsd)} | ${escapeMarkdown(result.errorMessage ?? "")} |` + ), + "", + ]; + await writeFile(filePath, `${lines.join("\n")}\n`); +} + +function promptMixSummary(prompts) { + return prompts.reduce( + (mix, prompt) => { + mix[prompt.quality] = (mix[prompt.quality] ?? 0) + 1; + return mix; + }, + { good: 0, average: 0, bad: 0 } + ); +} + +function estimateModelCostUsd(usage, config) { + return roundUsd( + (usage.promptTokens / 1_000_000) * config.inputUsdPer1M + + (usage.completionTokens / 1_000_000) * config.outputUsdPer1M + ); +} + +function rowCells(row) { + if (isRecord(row?.cells)) return row.cells; + if (isRecord(row?.data)) return row.data; + return isRecord(row) ? 
row : {}; +} + +function rowSourceUrls(row, cells) { + return uniqueStrings([ + ...stringArrayValue(row?.sourceUrls), + ...stringArrayValue(row?.sources), + ...stringArrayValue(row?.source_urls), + ...stringArrayValue(cells?.source_urls), + ...stringArrayValue(cells?.sources), + ...singleStringArray(row?.sourceUrl), + ...singleStringArray(row?.source_url), + ...singleStringArray(cells?.source_url), + ...singleStringArray(cells?.sourceUrl), + ...urlLikeCellValues(cells), + ].filter((value) => value.startsWith("http"))); +} + +function urlLikeCellValues(cells) { + if (!isRecord(cells)) return []; + return Object.entries(cells) + .filter(([key, value]) => + isUrlLikeCellName(key) && typeof value === "string" + ) + .map(([, value]) => value); +} + +function isUrlLikeCellName(name) { + const lower = String(name).toLowerCase(); + return lower === "url" || + lower.endsWith("_url") || + lower.includes("url") || + lower === "website" || + lower.endsWith("_website") || + lower === "homepage" || + lower.endsWith("_homepage"); +} + +function rowSearchText(row) { + const cells = rowCells(row); + return [ + JSON.stringify(cells), + ...rowSourceUrls(row, cells), + ...arrayValue(row?.evidence).map((evidence) => + typeof evidence === "string" ? evidence : evidence?.quote ?? 
"" + ), + ].join(" ").toLowerCase(); +} + +function rowHasAllowedDomain(row, allowedDomains) { + if (!allowedDomains?.length) return true; + const cells = rowCells(row); + return rowSourceUrls(row, cells).some((url) => + allowedDomains.some((allowedDomain) => urlHostname(url).endsWith(allowedDomain)) + ); +} + +function textContainsAny(text, terms) { + const lowerText = text.toLowerCase(); + return terms.some((term) => lowerText.includes(String(term).toLowerCase())); +} + +function urlHostname(url) { + try { + return new URL(url).hostname.replace(/^www\./, ""); + } catch { + return ""; + } +} + +function rowEvidenceQuoteCount(row) { + return arrayValue(row?.evidence).filter((evidence) => { + if (typeof evidence === "string") return evidence.trim().length > 0; + return typeof evidence?.quote === "string" && evidence.quote.trim().length > 0; + }).length; +} + +function identityKey(cells, row) { + const candidates = [ + cells.entity_name, + cells.company_name, + cells.product_name, + cells.bakery_name, + cells.provider_name, + cells.name, + row.id, + ]; + const identityParts = candidates.filter(isPresent).map((value) => + String(value).trim().toLowerCase() + ); + return identityParts[0] ?? 
null; +} + +export function failureReason({ + execution, + parsedPayload, + validation, + answerKeyScore, + infraBlockerReason, + minRequiredCompleteness, + validationIssues = [], +}) { + if (infraBlockerReason) return infraBlockerReason; + if (execution.timedOut) return "Command timed out."; + if (execution.exitCode !== 0) return `Command exited ${execution.exitCode}.`; + if (!parsedPayload) return "No parseable JSON object found in stdout."; + const capabilityDiagnostic = capabilityDiagnosticReason(validationIssues); + if (capabilityDiagnostic) return capabilityDiagnostic; + if (answerKeyScore?.failureCategory === "clarification") { + return `Clarification/abstention score ${answerKeyScore.abstentionScore} below required threshold.`; + } + if (validation.rowCount === 0) return "Parsed JSON had zero rows."; + if (validation.sourceUrlCount === 0) return "No source URLs found."; + if (validation.evidenceQuoteCount === 0) return "No evidence quotes found."; + if (validation.requiredCellCompletenessRatio < minRequiredCompleteness) { + return `Requested-cell completeness ${validation.requiredCellCompletenessRatio} below ${minRequiredCompleteness}.`; + } + if (answerKeyScore && !answerKeyScore.passed) { + if (answerKeyScore.failureCategory === "source_evidence") { + return `Source/domain evidence failed; factual accuracy ${answerKeyScore.factualAccuracyScore}, domain accuracy ${answerKeyScore.domainAccuracyRatio}.`; + } + if (answerKeyScore.entityCoverageRatio < 1) { + return `Entity coverage ${answerKeyScore.entityCoverageRatio} below required coverage; missing entities: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; + } + if (answerKeyScore.claimSupportRatio < 1) { + return `Claim support ${answerKeyScore.claimSupportRatio} below required support; missing required claim text for: ${(answerKeyScore.missingClaimSupportEntities ?? 
[]).join(", ") || "none"}.`; + } + return `Factual accuracy ${answerKeyScore.factualAccuracyScore} below ${answerKeyScore.minimumScore}; missing entities: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; + } + return "Benchmark failed."; +} + +function capabilityDiagnosticReason(validationIssues) { + return validationIssues.find((issue) => + /^capability diagnostic:/i.test(String(issue)) + ) ?? null; +} + +function arrayValue(value) { + return Array.isArray(value) ? value : []; +} + +function stringArrayValue(value) { + if (Array.isArray(value)) { + return value.filter((item) => typeof item === "string"); + } + if (typeof value === "string") { + return [value]; + } + return []; +} + +function singleStringArray(value) { + return typeof value === "string" ? [value] : []; +} + +function numberValue(value) { + return Number.isFinite(Number(value)) ? Number(value) : 0; +} + +function positiveNumber(value, fallback) { + const number = Number(value); + return Number.isFinite(number) && number > 0 ? number : fallback; +} + +function nonNegativeNumber(value, fallback) { + const number = Number(value); + return Number.isFinite(number) && number >= 0 ? 
number : fallback; +} + +function isRecord(value) { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function isPresent(value) { + if (value === null || value === undefined) return false; + if (typeof value === "string") return value.trim().length > 0; + if (Array.isArray(value)) return value.length > 0; + return true; +} + +function sum(items, key) { + return items.reduce((total, item) => total + numberValue(item[key]), 0); +} + +function shellEscape(value) { + return `'${String(value).replaceAll("'", "'\\''")}'`; +} + +function escapeMarkdown(value) { + return String(value).replaceAll("|", "\\|").replaceAll("\n", " "); +} + +function formatDuration(ms) { + if (ms < 1000) return `${ms}ms`; + const totalSeconds = Math.round(ms / 1000); + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + return minutes > 0 ? `${minutes}m ${seconds}s` : `${seconds}s`; +} + +function formatUsd(value) { + return `$${value.toFixed(value < 1 ? 
4 : 2)}`; +} + +function roundRatio(value) { + return Number(value.toFixed(3)); +} + +function roundUsd(value) { + return Number(value.toFixed(6)); +} + +async function writeJson(filePath, value) { + await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`); +} + +async function readJsonOrNull(filePath) { + try { + return JSON.parse(await readFile(filePath, "utf8")); + } catch { + return null; + } +} + +async function readTextOrEmpty(filePath) { + try { + return await readFile(filePath, "utf8"); + } catch { + return ""; + } +} + +function printHelpAndExit() { + console.log(`Usage: +node benchmarks/dataset-agent/run-benchmark.mjs \\ + --system mengzhe='npm run benchmark -- {{promptJson}}' \\ + --system edward='node ./my-agent.js --prompt {{promptJson}}' + +Run a canary subset before spending credits on all prompts: +node benchmarks/dataset-agent/run-benchmark.mjs \\ + --prompt-ids latest-ai-blog-posts,saas-pricing-pages \\ + --system edward='node ./my-agent.js --prompt {{promptJson}}' + +Rescore existing artifacts without spending credits: +node benchmarks/dataset-agent/run-benchmark.mjs --rescore-dir benchmark-results/ + +Agent command contract: +- stdout should contain a JSON object. +- Preferred shape: { "rows": [], "validationIssues": [], "usage": {}, "metrics": {} } +- usage supports promptTokens/inputTokens, completionTokens/outputTokens, totalTokens. +- metrics supports searchCalls, fetchCalls, browserCalls, agentRuns, agentSteps. 
+`); + process.exit(0); +} diff --git a/benchmarks/dataset-agent/run-benchmark.test.mjs b/benchmarks/dataset-agent/run-benchmark.test.mjs new file mode 100644 index 0000000..773557a --- /dev/null +++ b/benchmarks/dataset-agent/run-benchmark.test.mjs @@ -0,0 +1,179 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + failureReason, + findInfrastructureBlockerReason, + scoreBenchmarkRows, +} from "./run-benchmark.mjs"; + +const passingValidation = { + rowCount: 1, + sourceUrlCount: 1, + evidenceQuoteCount: 1, + requiredCellCompletenessRatio: 1, + missingRequiredCellCount: 0, +}; + +test("benchmark failure reason prefers capability diagnostic over generic zero rows", () => { + const diagnostic = "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 2 page(s) (requires_navigation=1, requires_form_submission=1). Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation."; + + const reason = failureReason({ + execution: { + timedOut: false, + exitCode: 0, + }, + parsedPayload: { + rows: [], + validationIssues: [diagnostic], + }, + validation: { + rowCount: 0, + sourceUrlCount: 0, + evidenceQuoteCount: 0, + requiredCellCompletenessRatio: 0, + }, + answerKeyScore: null, + infraBlockerReason: null, + minRequiredCompleteness: 0.75, + validationIssues: [diagnostic], + }); + + assert.equal(reason, diagnostic); +}); + +test("infrastructure blocker detection ignores ordinary API-key documentation text", () => { + const reason = findInfrastructureBlockerReason({ + execution: { + timedOut: false, + stderr: "The documentation page covers general API key setup and SDK usage.", + stdout: "", + }, + parsedPayload: { + rows: [{ + cells: { + summary: "Covers API key setup for developers.", + }, + }], + }, + normalized: { + validationIssues: [ + "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 1 page(s) (requires_navigation=1). 
Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation.", + ], + }, + }); + + assert.equal(reason, null); +}); + +test("infrastructure blocker detection still catches missing API key configuration", () => { + const reason = findInfrastructureBlockerReason({ + execution: { + timedOut: false, + stderr: "Missing OPENROUTER_API_KEY.", + stdout: "", + }, + parsedPayload: null, + normalized: { + validationIssues: [], + }, + }); + + assert.equal(reason, "Infrastructure/auth/credits blocker."); +}); + +test("domain scoring counts official website cells as source evidence", () => { + const score = scoreBenchmarkRows({ + rows: [{ + cells: { + entity_name: "MoMo", + official_website: "https://momo.vn", + source_url: "https://example-directory.test/vietnam-fintech", + }, + evidence: [{ quote: "MoMo official website is https://momo.vn" }], + }], + validation: passingValidation, + validationIssues: [], + minRequiredCompleteness: 1, + minFactualAccuracy: 0.75, + promptDefinition: { + answerKey: { + expectedBehavior: "answer", + requiredColumns: ["entity_name", "official_website", "source_url"], + expectedEntities: [{ + label: "MoMo", + aliases: ["momo"], + allowedSourceDomains: ["momo.vn"], + }], + minimumExpectedEntityMatches: 1, + }, + }, + }); + + assert.equal(score.passed, true); + assert.equal(score.domainAccuracyRatio, 1); +}); + +test("domain scoring counts product, careers, and docs URL cells", () => { + const cases = [ + { + cells: { + bakery_name: "Bakes", + product_name: "Croissant", + product_url: "https://bakes-saigon.com/products/croissant", + source_url: "https://example-directory.test/bakeries", + }, + label: "Bakes", + aliases: ["bakes"], + allowedSourceDomains: ["bakes-saigon.com"], + }, + { + cells: { + entity_name: "Runway", + careers_page_url: "https://runwayml.com/careers", + source_url: "https://example-directory.test/ai-startups", + }, + label: "Runway", + aliases: ["runway"], + allowedSourceDomains: ["runwayml.com"], + }, + { + cells: { + 
entity_name: "Cloudflare", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + source_url: "https://example-directory.test/mcp-docs", + }, + label: "Cloudflare", + aliases: ["cloudflare"], + allowedSourceDomains: ["developers.cloudflare.com"], + }, + ]; + + for (const item of cases) { + const score = scoreBenchmarkRows({ + rows: [{ + cells: item.cells, + evidence: [{ quote: JSON.stringify(item.cells) }], + }], + validation: passingValidation, + validationIssues: [], + minRequiredCompleteness: 1, + minFactualAccuracy: 0.75, + promptDefinition: { + answerKey: { + expectedBehavior: "answer", + requiredColumns: Object.keys(item.cells), + expectedEntities: [{ + label: item.label, + aliases: item.aliases, + allowedSourceDomains: item.allowedSourceDomains, + }], + minimumExpectedEntityMatches: 1, + }, + }, + }); + + assert.equal(score.passed, true, `${item.label} should pass`); + assert.equal(score.domainAccuracyRatio, 1, `${item.label} domain`); + } +}); diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 7a0eec1..05ab9c7 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -24,10 +24,12 @@ services: - "3501:3501" volumes: - ./backend/src:/app/src + - populate_recipe_data:/app/.bigset environment: CLIENT_ORIGIN: http://localhost:3500 CONVEX_URL: http://convex:3210 PORT: 3501 + POPULATE_RECIPE_STORE_DIR: /app/.bigset/populate-recipes CONVEX_SELF_HOSTED_ADMIN_KEY: ${CONVEX_SELF_HOSTED_ADMIN_KEY:-} CLERK_SECRET_KEY: ${CLERK_SECRET_KEY:-} CLERK_PUBLISHABLE_KEY: ${NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY:-} @@ -130,3 +132,4 @@ services: volumes: pgdata: convex_data: + populate_recipe_data: diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md new file mode 100644 index 0000000..2bb1847 --- /dev/null +++ b/docs/data-collection-agent-migration-plan.md @@ -0,0 +1,293 @@ +# Data Collection Agent Migration Plan + +This plan keeps the app, benchmark harness, and self-healing 
layer aligned while +the collection pipeline is migrated into BigSet. + +## Current State + +- PR #31-#37 form the current Mastra populate/self-healing stack. They are + intentionally stacked and should not be merged out of order. +- PR #37 adds `make verify-self-healing`, which is the cheap local gate before + touching live data or spending OpenRouter/TinyFish credits. +- PR #38 adds this migration plan and keeps the target boundaries explicit. +- PR #39 adds `CollectionPopulateRecipeRuntime`, an adapter boundary that can + run a collection pipeline through the same `PopulateRecipeRuntime` interface + as Mastra. +- PR #40 adds `POPULATE_AGENT_RUNTIME=collection` selection through the real + HTTP and CLI entrypoints. PR #42 extends that socket so app/CLI runs can load + a runner module from `POPULATE_COLLECTION_RUNNER_MODULE`. +- PR #41 adds a `collection-self-heal` benchmark lane that wraps the collection + runtime inside `SelfHealingPopulateRecipeService`. This is the benchmark + socket Meteor can use once the real collection runner is available. +- PR #43 ports the real vendored collection pipeline behind + `runCollectionPopulatePipeline(input)`, so the collection benchmark lane now + runs the BigSet-wrapped collection runner instead of a fake injected runner. +- PR #44 keeps TinyFish Agent/browser work opt-in and bounded by a per-run poll + timeout. This preserves cheap cron/benchmark reruns as the default path. +- PR #45 improves collection source targeting for official-source prompts + without injecting answer-key URLs at runtime. +- PR #46 surfaces no-Agent browser/form/detail follow-up as a safe capability + diagnostic instead of hiding it as generic bad data or infra failure. +- `feat/data-collection-agent-v14` is no longer the branch to build on directly. + It was the source of the collection pipeline port. New work should branch on + top of the current draft stack, not edit Meteor's branch or the dirty main + checkout. 
+ +## Target Shape + +The app should have one stable populate boundary: + +```text +POST /populate or cron CLI + -> load DatasetContext + -> self-healing populate service + -> selected PopulateRecipeRuntime + -> source-backed rows + evidence + -> validation gate + -> optional Convex atomic row replace +``` + +The collection pipeline should become one implementation of +`PopulateRecipeRuntime`. It should not own app auth, row deletion, Convex writes, +or cron scheduling. Those stay in BigSet. + +The critical contract is `runRecipe({ recipe, context })`. A collection runtime +adapter must thread `recipe.runtimeInstructions` into the collection prompt/spec, +because those instructions are how a repaired recipe changes future runtime +behavior. A runtime that ignores `recipe.runtimeInstructions` is not actually +self-healing. + +## What Self-Healing Does Now + +The current layer: + +- stores active recipes and run records in a filesystem recipe store on the + durable app/commit path +- reruns the active recipe when one exists +- generates an initial recipe when no active recipe exists +- repairs a failed active recipe through `DefaultPopulateRecipeAuthor` +- validates rows for requested-column completeness, source URL coverage, + evidence quote coverage, and expected-entity coverage when the prompt names + explicit entities +- promotes a repaired recipe only if it is valid and does not score below the + active recipe baseline +- commits rows only after a successful tick, using one Convex atomic replace +- supports a CLI path for cron/live smoke via `populate:self-heal --dataset-id` + +Dry-run and benchmark paths intentionally use in-memory stores so they do not +pollute durable recipe history. 
+ +The current layer now can: + +- run an injected collection runner through the same self-healing runtime + boundary and benchmark harness as Mastra +- run the real vendored collection pipeline through that same boundary +- preserve `recipe.runtimeInstructions`, required columns, and benchmark + metadata through the collection runner +- emit a capability diagnostic when no-Agent mode sees pages that need browser, + form, or detail-page follow-up + +The current layer does not yet: + +- generate Playwright scripts as a durable production recipe +- run a green live Convex canary in this local environment +- prove Agent-enabled collection quality on a full real benchmark +- prove the collection runtime should replace Mastra as the default app runtime + +## Migration Sequence + +1. Branch from the top of the self-healing stack. + - For new collection-runner or benchmark work, base on + `codex/collection-capability-diagnostics` unless that PR has been + superseded. + - Do not edit `main`, the dirty local checkout, or + `feat/data-collection-agent-v14` directly. + +2. Fix the collection branch as a clean build source. + - Status: done in PR #43 for the BigSet-wrapped collection runner path. + - Keep vendored code isolated until the adapter is green. + - Preserve the current backend Convex boundary: do not reintroduce imports + from `frontend/convex/_generated` into backend compile. Use the existing + `anyApi`/HTTP-client boundary instead. + - Exclude non-essential vendored artifacts from the PR scope until the + runtime adapter needs them. + - Gate: `npm --prefix backend test` and `npm --prefix backend run build`. + +3. Add a collection runtime adapter. + - Status: done in PR #39. + - Implement the existing `PopulateRecipeRuntime` interface. + - Input: BigSet `DatasetContext`. + - Transform `recipe.runtimeInstructions` into the collection pipeline + prompt/spec alongside the dataset description and columns. 
+ - Propagate `requiredColumns`, prompt id, prompt quality, persona, and + benchmark stress metadata into the collection pipeline's benchmark/spec + generation path when those fields are available. + - Output: rows, source URLs, evidence quotes, usage, metrics, and debug + captured sources. + - No direct Convex writes inside the adapter. + - Gate: a unit test proving a repaired recipe's runtime instructions reach + the downstream collection prompt/spec and can change observable runtime + behavior. + +4. Add runtime selection through the real entrypoints. + - Status: done in PR #40 for injected collection runners. + - Add a runtime factory for the self-healing runner. + - Add an env switch such as `POPULATE_AGENT_RUNTIME=collection`. + - Wire both `POST /populate` and `populate:self-heal --dataset-id` through + that same factory. + - Gate: one HTTP-route test, one CLI test, and one dry-run smoke proving both + entrypoints use the selected runtime. + +5. Add a self-healing-wrapped benchmark adapter for the collection runtime. + - Status: done in PR #41 for injected collection runners. + - Reuse `benchmarks/dataset-agent/run-benchmark.mjs`. + - Exercise `SelfHealingPopulateRecipeService` with the collection runtime + inside it, not the direct collection pipeline alone. + - Compare this lane against the existing Mastra-inside-self-healing lane. + - Return blocked results when required API keys are missing. + - Gate: no-key smoke must block with zero tokens, zero tool calls, and zero + estimated spend. + +6. Run quality gates in increasing cost order. + - `make verify-self-healing` + - 2-prompt real benchmark + - 1-prompt Agent-enabled capability canary for prompts that need browser or + detail follow-up + - full benchmark only after the 2-prompt run is not obviously broken + - live `--dataset-id` dry-run only after Convex/env prerequisites are ready + - `--commit` only on a throwaway dataset first + +7. Keep runtime selection explicit. 
+ - Keep current Mastra runtime as default until collection runtime benchmark + evidence is better. + - Do not claim collection runtime quality from a direct, non-self-healing + benchmark lane. + +8. Decide merge order from evidence, not preference. + - If collection runtime is better, stack it after #37 and merge the stack + from bottom to top. + - If collection runtime is not better, keep it as a draft branch and use + benchmark artifacts to decide what to fix next. + +## Acceptance Gates + +Before any merge: + +- no real `.env` files or private notes in the diff +- `git diff --name-status main...HEAD` reviewed for public PR hygiene +- `make verify-self-healing` passes +- `npm --prefix backend test` passes +- `npm --prefix backend run build` passes +- adapter test proves `recipe.runtimeInstructions` reaches the collection + pipeline prompt/spec +- adapter or runner tests prove benchmark metadata and `requiredColumns` reach + the collection pipeline's spec generation path +- HTTP-route and CLI tests prove `POPULATE_AGENT_RUNTIME=collection` reaches + the selected runtime through real app entrypoints +- benchmark no-key smoke proves blocked with zero spend +- benchmark evidence comes from the collection runtime wrapped inside the + self-healing service, not the direct collection pipeline alone +- real benchmark artifacts are linked in the PR when runtime quality is claimed +- capability diagnostics are treated as warnings for healthy rows and as honest + benchmark failure messages when no-Agent mode cannot complete browser/form + follow-up +- live dataset commit is tested only on a throwaway dataset +- backend build does not depend on `frontend/convex/_generated` + +## Meteor Handoff Shape + +Meteor does not need to rebuild the self-healing wrapper. The socket is now: + +```text +runCollectionPopulatePipeline(CollectionPopulatePipelineInput) + -> Promise +``` + +`CollectionPopulatePipelineInput.recipeInstructions` is the self-healing signal. 
+`requiredColumns` and benchmark metadata are the scoring signal. If the +collection runner ignores `recipeInstructions`, repaired recipes cannot change +future behavior. If it ignores `requiredColumns` or benchmark metadata, the +benchmark can stop measuring the same task. + +The real benchmark command after a runner module exists is: + +```bash +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ +BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ +node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' +``` + +For prompts that likely require browser/detail follow-up, run the same lane with +Agent explicitly enabled: + +```bash +COLLECTION_AGENT_ENABLE_AGENT=true \ +COLLECTION_AGENT_POLL_TIMEOUT_MS=480000 \ +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ +BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ +node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids mcp-docs-pages \ + --timeout-ms 900000 \ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' +``` + +No-Agent `mcp-docs-pages` evidence from PR #46: + +- artifact: `benchmark-results/collection-capability-diagnostics-mcp-20260523-001` +- result: 3 rows, 6 evidence quotes, cost about `$0.007287` +- status: failed with +`Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up...`. +That is not a pass, but it is useful: it tells us the next benchmark should +turn Agent on and measure whether browser/detail follow-up fixes the source +evidence miss. 
+ +Agent-enabled `mcp-docs-pages` evidence from the stack-handoff branch: + +- artifact: `benchmark-results/collection-agent-canary-mcp-20260523-001` +- result: 3 rows, 12 evidence quotes, 10 source URLs, 3 Agent runs +- cost: about `$0.053552` +- status: failed, not blocked +- score: factual accuracy `0.933`, entity coverage `1.0`, claim support `1.0`, + domain accuracy `0.667` +- conclusion: Agent/browser follow-up runs successfully and improves claim + support, but source/domain evidence still misses. The next code target is + source coherence: keep each row's docs URL/evidence/source URLs aligned with + that entity's official docs domain instead of merging discovery/blog/course + evidence across vendors. + +## Next Engineering Move + +Create a fresh branch from `codex/collection-capability-diagnostics` and fix +source coherence before running the full benchmark: + +1. Keep `COLLECTION_AGENT_ENABLE_AGENT=false` as the default. +2. Add focused tests around record merge/source selection so a row does not gain + evidence for a populated field from another record unless the incoming row + value supports the existing value. +3. Tighten docs/official-source selection so docs prompts prefer docs/developers + pages over blogs, news, courses, directories, or third-party discovery pages. +4. Re-run the Agent-enabled `mcp-docs-pages` canary. +5. If domain accuracy reaches `1.0`, run the 4-prompt focused benchmark from + PR #45. +6. Run the full prompt pack only after the focused benchmark is not obviously + broken. + +When testing the real app or CLI path, set: + +```bash +POPULATE_AGENT_RUNTIME=collection +POPULATE_COLLECTION_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts +``` + +The BigSet runner keeps TinyFish Agent/browser calls disabled unless +`COLLECTION_AGENT_ENABLE_AGENT=true`. This makes cron and benchmark reruns cheap +and repeatable first. 
Agent-enabled runs should also set +`COLLECTION_AGENT_POLL_TIMEOUT_MS` or `AGENT_POLL_TIMEOUT_MS` so a browser run +cannot outlive the benchmark/job budget. + +Do not switch the default runtime from Mastra to collection until the +self-healing-wrapped collection benchmark has better evidence than the current +Mastra lane. diff --git a/frontend/components/ThemeToggle.tsx b/frontend/components/ThemeToggle.tsx index 89eebe5..52db527 100644 --- a/frontend/components/ThemeToggle.tsx +++ b/frontend/components/ThemeToggle.tsx @@ -37,8 +37,11 @@ export function ThemeToggle({ className = "" }: { className?: string }) { const [theme, setTheme] = useState("light"); useEffect(() => { - setTheme(readEffectiveTheme()); - setMounted(true); + const timeoutId = window.setTimeout(() => { + setTheme(readEffectiveTheme()); + setMounted(true); + }, 0); + return () => window.clearTimeout(timeoutId); }, []); function toggle() { diff --git a/frontend/components/table/ColumnHeader.tsx b/frontend/components/table/ColumnHeader.tsx index 27adfee..47a11d5 100644 --- a/frontend/components/table/ColumnHeader.tsx +++ b/frontend/components/table/ColumnHeader.tsx @@ -10,7 +10,6 @@ export function ColumnHeader({ header, column, isResizing, - tableContainerRef, }: { header: Header; column?: DatasetColumn; @@ -24,10 +23,7 @@ export function ColumnHeader({ > {isResizing && (
)} diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index 567f85e..a6420a8 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -137,3 +137,40 @@ export const insertBatch = internalMutation({ } }, }); + +export const replaceByDataset = internalMutation({ + args: { + datasetId: v.id("datasets"), + rows: v.array(v.object({ + data: v.record(v.string(), v.any()), + sources: v.optional(v.array(v.string())), + })), + }, + handler: async (ctx, args) => { + const dataset = await ctx.db.get(args.datasetId); + if (!dataset) { + throw new Error("Dataset not found"); + } + + const existingRows = await ctx.db + .query("datasetRows") + .withIndex("by_dataset", (q) => q.eq("datasetId", args.datasetId)) + .collect(); + + for (const row of existingRows) { + await ctx.db.delete(row._id); + } + for (const row of args.rows) { + await ctx.db.insert("datasetRows", { + datasetId: args.datasetId, + data: row.data, + sources: row.sources, + }); + } + + return { + clearedRowCount: existingRows.length, + insertedRowCount: args.rows.length, + }; + }, +}); diff --git a/frontend/convex/datasets.ts b/frontend/convex/datasets.ts index f0fb8cc..b240948 100644 --- a/frontend/convex/datasets.ts +++ b/frontend/convex/datasets.ts @@ -1,4 +1,4 @@ -import { query, mutation } from "./_generated/server.js"; +import { query, mutation, internalQuery } from "./_generated/server.js"; import type { QueryCtx } from "./_generated/server.js"; import { v } from "convex/values"; import type { Doc } from "./_generated/dataModel.js"; @@ -83,6 +83,13 @@ export const get = query({ }, }); +export const getForSystemPopulate = internalQuery({ + args: { id: v.id("datasets") }, + handler: async (ctx, args) => { + return await ctx.db.get(args.id); + }, +}); + export const create = mutation({ args: { name: v.string(), diff --git a/frontend/lib/analytics.ts b/frontend/lib/analytics.ts index 7b60702..93bed6f 100644 --- a/frontend/lib/analytics.ts +++ 
b/frontend/lib/analytics.ts @@ -84,8 +84,6 @@ export function initAnalytics(): boolean { // - maskInputOptions: every form input/textarea value is masked // unconditionally. Catches the search box, the wizard prompt, // Clerk's email + password fields. - // - recordConsole: console.error/warn shows up alongside the - // replay timeline — invaluable for "user says it broke". // - recordCrossOriginIframes: false → Clerk's hosted iframes // (if any) are not pierced into. session_recording: { @@ -97,7 +95,6 @@ export function initAnalytics(): boolean { email: true, }, recordCrossOriginIframes: false, - recordConsole: true, }, loaded: () => { diff --git a/makefiles/Makefile b/makefiles/Makefile index 497efef..633df80 100644 --- a/makefiles/Makefile +++ b/makefiles/Makefile @@ -1,4 +1,4 @@ -.PHONY: all dev down clean convex-push convex-env +.PHONY: all dev down clean convex-push convex-env verify-self-healing all: dev @@ -33,6 +33,9 @@ convex-push: --url http://127.0.0.1:3210 \ --admin-key "$$(grep CONVEX_SELF_HOSTED_ADMIN_KEY .env.local | cut -d= -f2-)" +verify-self-healing: + bash scripts/verify-self-healing-stack.sh + down: docker compose -f docker-compose.dev.yml down diff --git a/scripts/verify-self-healing-stack.sh b/scripts/verify-self-healing-stack.sh new file mode 100755 index 0000000..6e8eacf --- /dev/null +++ b/scripts/verify-self-healing-stack.sh @@ -0,0 +1,296 @@ +#!/usr/bin/env bash +set -uo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT_DIR" || exit 1 + +DATASET_ID="" +SHOULD_COMMIT_ROWS=0 +SHOULD_RUN_CONVEX_PUSH=0 +SHOULD_RUN_LOCAL_GATES=1 +SHOULD_RUN_BLOCKED_BENCHMARK_SMOKE=1 +SHOULD_RUN_REAL_BENCHMARK=0 +EXIT_STATUS=0 + +usage() { + cat <<'USAGE' +Usage: + bash scripts/verify-self-healing-stack.sh [options] + +Options: + --dataset-id Run a live self-healing populate smoke for one dataset. + --commit Commit rows for --dataset-id instead of dry-run. + --convex-push Deploy Convex functions before the live dataset smoke. 
+ --real-benchmark Run a 2-prompt real Mastra benchmark. May spend API credits. + --skip-local Skip backend test/build/node-check gates. + --no-blocked-smoke Skip the no-key benchmark blocked-contract smoke. + -h, --help Show this help. + +Default behavior runs only local checks and a no-key benchmark smoke. It does +not load secret files and does not spend OpenRouter or TinyFish credits. Live +dataset and benchmark modes require needed env vars to be exported already. +USAGE +} + +mark_pass() { + printf 'PASS %s\n' "$1" +} + +mark_fail() { + printf 'FAIL %s\n' "$1" + EXIT_STATUS=1 +} + +mark_blocked() { + printf 'BLOCK %s\n' "$1" + if [[ "$EXIT_STATUS" -eq 0 ]]; then + EXIT_STATUS=2 + fi +} + +run_required_step() { + local label="$1" + shift + + printf 'RUN %s\n' "$label" + if "$@"; then + mark_pass "$label" + else + mark_fail "$label" + fi +} + +require_command() { + local command_name="$1" + if command -v "$command_name" >/dev/null 2>&1; then + return 0 + fi + mark_blocked "missing command: ${command_name}" + return 1 +} + +require_env_var() { + local env_name="$1" + if [[ -n "${!env_name:-}" ]]; then + return 0 + fi + mark_blocked "missing env: ${env_name}" + return 1 +} + +check_docker_compose_ready() { + require_command docker || return 1 + docker compose -f docker-compose.dev.yml ps >/dev/null 2>&1 +} + +check_convex_ready() { + local convex_url="$1" + require_command curl || return 1 + curl -sf "${convex_url%/}/version" >/dev/null 2>&1 +} + +run_blocked_benchmark_smoke() { + local system_name="$1" + local system_command="$2" + local out_dir="benchmark-results/${system_name}-blocked-smoke-$(date +%Y%m%d-%H%M%S)" + local stdout_file="${out_dir}/runner-stdout.json" + + mkdir -p "$out_dir" + printf 'RUN %s benchmark no-key blocked smoke\n' "$system_name" + if ! 
env -u OPENROUTER_API_KEY -u TINYFISH_API_KEY node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids latest-ai-blog-posts \ + --timeout-ms 60000 \ + --out "$out_dir" \ + --system "${system_name}=${system_command}" \ + > "$stdout_file"; then + mark_fail "${system_name} benchmark no-key blocked smoke" + return + fi + + if node -e ' +const fs = require("fs"); +const summary = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); +const group = summary.aggregate?.[0]; +if (!group || group.total !== 1 || group.blocked !== 1 || group.failed !== 0) { + console.error("expected exactly one blocked benchmark result"); + process.exit(1); +} +const aggregateSpendFields = [ + "totalRows", + "totalPromptTokens", + "totalCompletionTokens", + "totalTokens", + "searchCallCount", + "fetchCallCount", + "browserCallCount", + "agentRunCount", + "agentStepCount", + "estimatedTotalCostUsd", +]; +const nonZeroAggregateFields = aggregateSpendFields.filter( + (field) => Number(group[field] ?? 0) !== 0 +); +if (nonZeroAggregateFields.length > 0) { + console.error(`expected zero spend/calls for blocked smoke: ${nonZeroAggregateFields.join(", ")}`); + process.exit(1); +} +for (const result of summary.laneResults ?? []) { + const laneSpendFields = [ + ["rowCount", result.rowCount], + ["promptTokens", result.usage?.promptTokens], + ["completionTokens", result.usage?.completionTokens], + ["totalTokens", result.usage?.totalTokens], + ["searchCallCount", result.searchCallCount], + ["fetchCallCount", result.fetchCallCount], + ["browserCallCount", result.browserCallCount], + ["agentRunCount", result.agentRunCount], + ["agentStepCount", result.agentStepCount], + ["estimatedTotalCostUsd", result.estimatedTotalCostUsd], + ]; + const nonZeroLaneFields = laneSpendFields + .filter(([, value]) => Number(value ?? 
0) !== 0) + .map(([field]) => field); + if (nonZeroLaneFields.length > 0) { + console.error(`expected zero spend/calls for blocked lane: ${nonZeroLaneFields.join(", ")}`); + process.exit(1); + } +} +' "${out_dir}/summary.json"; then + mark_pass "${system_name} benchmark no-key blocked smoke (${out_dir})" + else + mark_fail "${system_name} benchmark no-key blocked smoke" + fi +} + +run_real_benchmark() { + require_env_var OPENROUTER_API_KEY || return + require_env_var TINYFISH_API_KEY || return + + local out_dir="benchmark-results/self-healing-real-smoke-$(date +%Y%m%d-%H%M%S)" + local stdout_file="${out_dir}/runner-stdout.json" + + mkdir -p "$out_dir" + printf 'RUN mastra real benchmark smoke\n' + if node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ + --timeout-ms 900000 \ + --out "$out_dir" \ + --system "mastra=node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs" \ + > "$stdout_file"; then + mark_pass "mastra real benchmark smoke (${out_dir})" + else + mark_fail "mastra real benchmark smoke" + fi +} + +run_live_dataset_smoke() { + require_env_var CONVEX_URL || return + require_env_var CONVEX_SELF_HOSTED_ADMIN_KEY || return + require_env_var OPENROUTER_API_KEY || return + require_env_var TINYFISH_API_KEY || return + + if ! 
check_convex_ready "$CONVEX_URL"; then + mark_blocked "Convex is not reachable at ${CONVEX_URL%/}/version" + return + fi + + local populate_args=(--dataset-id "$DATASET_ID" --max-rows 3) + local label="self-healing dataset smoke dry-run" + if [[ "$SHOULD_COMMIT_ROWS" -eq 1 ]]; then + populate_args+=(--commit) + label="self-healing dataset smoke commit" + fi + + run_required_step "$label" npm --silent --prefix backend run populate:self-heal -- "${populate_args[@]}" +} + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --dataset-id) + DATASET_ID="${2:-}" + if [[ -z "$DATASET_ID" ]]; then + printf 'Error: --dataset-id requires a value.\n' >&2 + exit 1 + fi + shift 2 + ;; + --commit) + SHOULD_COMMIT_ROWS=1 + shift + ;; + --convex-push) + SHOULD_RUN_CONVEX_PUSH=1 + shift + ;; + --real-benchmark) + SHOULD_RUN_REAL_BENCHMARK=1 + shift + ;; + --skip-local) + SHOULD_RUN_LOCAL_GATES=0 + shift + ;; + --no-blocked-smoke) + SHOULD_RUN_BLOCKED_BENCHMARK_SMOKE=0 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + printf 'Error: unknown option: %s\n' "$1" >&2 + usage >&2 + exit 1 + ;; + esac +done + +if [[ "$SHOULD_COMMIT_ROWS" -eq 1 && -z "$DATASET_ID" ]]; then + printf 'Error: --commit requires --dataset-id.\n' >&2 + exit 1 +fi + +if [[ "$SHOULD_RUN_LOCAL_GATES" -eq 1 ]]; then + run_required_step "backend tests" npm --prefix backend test + run_required_step "backend build" npm --prefix backend run build + run_required_step "mastra adapter syntax" node --check benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs + run_required_step "collection adapter syntax" node --check benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs +fi + +if [[ "$SHOULD_RUN_BLOCKED_BENCHMARK_SMOKE" -eq 1 ]]; then + run_blocked_benchmark_smoke \ + "mastra" \ + "node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs" + run_blocked_benchmark_smoke \ + "collection-self-heal" \ + "node --import 
./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs" +fi + +if [[ "$SHOULD_RUN_CONVEX_PUSH" -eq 1 ]]; then + if [[ ! -f frontend/.env.local ]]; then + mark_blocked "frontend/.env.local missing; cannot run make convex-push" + elif ! check_docker_compose_ready; then + mark_blocked "Docker Compose is not ready; cannot run make convex-push" + elif ! check_convex_ready "http://127.0.0.1:3210"; then + mark_blocked "Convex is not reachable at http://127.0.0.1:3210/version" + else + run_required_step "convex push" make convex-push + fi +fi + +if [[ "$SHOULD_RUN_REAL_BENCHMARK" -eq 1 ]]; then + run_real_benchmark +fi + +if [[ -n "$DATASET_ID" ]]; then + run_live_dataset_smoke +fi + +case "$EXIT_STATUS" in + 0) printf 'DONE self-healing stack verification passed\n' ;; + 1) printf 'DONE self-healing stack verification failed\n' ;; + 2) printf 'DONE self-healing stack verification blocked by local prerequisites\n' ;; +esac + +exit "$EXIT_STATUS"