diff --git a/packages/scripts/package.json b/packages/scripts/package.json index 0388be4af..962ffe055 100644 --- a/packages/scripts/package.json +++ b/packages/scripts/package.json @@ -5,37 +5,38 @@ "main": "index.js", "private": true, "scripts": { + "assessCases": "npm run build && node ./build/main/assessCasesMain.js", + "analyzeMessages": "npm run build && node ./build/analyzeMessages.js", + "build": "npm run clean && tsc -b tsconfig.build.json", + "checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js", + "clean": "rm -rf ./build", "createQualityTestsYaml-aug-2023": "npm run build && node ./build/createAug2023QualityTestsYaml.js", "createQualityTestsYaml-sept-2023": "npm run build && node ./build/createSept2023QualityTestsYaml.js", - "scrubMessages": "npm run build && node ./build/scrubMessages.js", - "sampleAndSaveMarkdownPages": "npm run build && node ./build/sampleAndSaveMarkdownPages.js", - "upsertMetaDirective": "npm run build && node ./build/upsertMetaDirective.js", - "analyzeMessages": "npm run build && node ./build/analyzeMessages.js", - "findFaq": "npm run build && node ./build/main/findFaqMain.js", - "upgradeFaqEntries": "npm run build && node ./build/main/upgradeFaqEntriesMain.js", - "materializeScrubbedMessagesStats:all": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --all", - "materializeScrubbedMessagesStats:since": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --since", - "materializeScrubbedMessagesStats:latest": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js", "createView/messages_by_rating": "npm run build && node ./build/createView/messages_by_rating.js", - "createView/top_250_references": "npm run build && node ./build/createView/top_250_references.js", "createView/scrubbed_messages_by_rating": "npm run build && node ./build/createView/scrubbed_messages_by_rating.js", "createView/scrubbed_messages_stats": "npm run build && node ./build/createView/scrubbed_messages_stats.js", "createView/scrubbed_top_250_references": "npm run build && node ./build/createView/scrubbed_top_250_references.js", "createView/scrubbed_topics": "npm run build && node ./build/createView/scrubbed_topics.js", - "removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js", - "getConversationText": "npm run build && node ./build/getConversationText.js", + "createView/top_250_references": "npm run build && node ./build/createView/top_250_references.js", + "findFaq": "npm run build && node ./build/main/findFaqMain.js", "findPageTitles": "npm run build && node ./build/main/findPageTitlesMain.js", + "getConversationText": "npm run build && node ./build/getConversationText.js", + "getLLMAnswers": "npm run build && node ./build/profound/getAndProcessAnswers.js", + "lint:fix": "npm run lint -- --fix && prettier ./src --check --write", + "lint": "eslint ./src --ext ts,tsx,js,jsx --report-unused-disable-directives", "listSlackMessages": "npm run build && node ./build/main/listSlackMessagesMain.js", + "materializeScrubbedMessagesStats:all": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --all", + "materializeScrubbedMessagesStats:latest": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js", + "materializeScrubbedMessagesStats:since": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --since", + "release": "release-it", "removeSlackMessage": "npm run build && node ./build/main/removeSlackMessageMain.js", - "checkUrlsAgainstDB": 
"npm run build && node ./build/checkUrlsAgainstDB.js", - "getLLMAnswers": "npm run build && node ./build/profound/getAndProcessAnswers.js", - "verifyPagesInSource": "npm run build && node ./build/verifyPagesInSource.js", + "removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js", + "sampleAndSaveMarkdownPages": "npm run build && node ./build/sampleAndSaveMarkdownPages.js", + "scrubMessages": "npm run build && node ./build/scrubMessages.js", "test": "jest --forceExit", - "build": "npm run clean && tsc -b tsconfig.build.json", - "clean": "rm -rf ./build", - "release": "release-it", - "lint": "eslint ./src --ext ts,tsx,js,jsx --report-unused-disable-directives", - "lint:fix": "npm run lint -- --fix && prettier ./src --check --write" + "upgradeFaqEntries": "npm run build && node ./build/main/upgradeFaqEntriesMain.js", + "upsertMetaDirective": "npm run build && node ./build/upsertMetaDirective.js", + "verifyPagesInSource": "npm run build && node ./build/verifyPagesInSource.js" }, "keywords": [], "author": "", diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts new file mode 100644 index 000000000..5f2521473 --- /dev/null +++ b/packages/scripts/src/Case.ts @@ -0,0 +1,74 @@ +import z from "zod"; +import { PromptResponseRating } from "./generateRating"; + +// Map of embedding model name -> vector (array of numbers) +export const Embeddings = z.record(z.string(), z.number().array()); + +export type Embeddings = z.infer; + +export const PromptAndEmbeddings = z.object({ + prompt: z.string(), + embeddings: Embeddings, +}); + +export type PromptAndEmbeddings = z.infer; + +/** + Answer relevance: given prompt and expected answer pair, generate N possible + prompts that would elicit that answer, then compare their embeddings with the + embedding of the original prompt. + */ +export const RelevanceMetrics = z.object({ + /** + Normalized square magnitude difference. Lower = closer = better. This gives + an idea of how close the vectors are to each other in their N-dimensional + space, but doesn't seem to work as well as cos_similarity. + */ + norm_sq_mag_diff: z.number(), + /** + Cosine similarity: are vectors pointing the same way? Range [-1, 1]. 
+ */ + cos_similarity: z.number(), +}); + +export type RelevanceMetrics = z.infer<typeof RelevanceMetrics>; + +export const ScoredPromptAndEmbeddings = PromptAndEmbeddings.and( + z.object({ + relevance: + // embedding model name -> score + z.record(z.string(), RelevanceMetrics), + }) +); + +export type ScoredPromptAndEmbeddings = z.infer< + typeof ScoredPromptAndEmbeddings +>; + +export const Relevance = z.object({ + prompt_embeddings: Embeddings, + generated_prompts: ScoredPromptAndEmbeddings.array(), + averages: RelevanceMetrics, +}); + +export type Relevance = z.infer<typeof Relevance>; + +export const Case = z.object({ + type: z.string(), + tags: z.string().array(), + name: z.string(), + prompt: z + .object({ + content: z.string(), + role: z.string(), + }) + .array(), + expected: z.string(), + + // Fields to add + prompt_embeddings: Embeddings.optional(), + relevance: Relevance.optional(), + prompt_response_rating: PromptResponseRating.optional(), +}); + +export type Case = z.infer<typeof Case>; diff --git a/packages/scripts/src/SimpleTextGenerator.ts b/packages/scripts/src/SimpleTextGenerator.ts new file mode 100644 index 000000000..e47d0e38d --- /dev/null +++ b/packages/scripts/src/SimpleTextGenerator.ts @@ -0,0 +1,36 @@ +import { generateText, LanguageModel } from "mongodb-rag-core/aiSdk"; + +export const makeSimpleTextGenerator = ({ + model, + systemPrompt, +}: { + model: LanguageModel; + systemPrompt?: string; +}) => { + return async ({ + prompt, + temperature = 0, + n = 1, + }: { + prompt: string; + temperature?: number; + + n?: number; + }): Promise<string[]> => { + const result = await Promise.all( + Array(n) + .fill(0) + .map(async () => + generateText({ + model, + prompt, + system: systemPrompt, + temperature, + }) + ) + ); + return result.map(({ text }) => text); + }; +}; + +export type SimpleTextGenerator = ReturnType<typeof makeSimpleTextGenerator>; diff --git a/packages/scripts/src/assessRelevance.eval.ts b/packages/scripts/src/assessRelevance.eval.ts new file mode 100644 index 000000000..1c72b93c5 --- /dev/null +++ b/packages/scripts/src/assessRelevance.eval.ts @@ -0,0 +1,143 @@ +import "dotenv/config"; +import { Eval, BraintrustMiddleware } from "braintrust"; +import { Scorer } from "autoevals"; +import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import { + assertEnvVars, + BRAINTRUST_ENV_VARS, + makeOpenAiEmbedder, +} from "mongodb-rag-core"; +import { Relevance } from "./Case"; +import { assessRelevance } from "./assessRelevance"; +import { AzureOpenAI } from "mongodb-rag-core/openai"; +import { makeSimpleTextGenerator } from "./SimpleTextGenerator"; + +const { + OPENAI_API_KEY, + OPENAI_ENDPOINT, + OPENAI_API_VERSION, + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + BRAINTRUST_API_KEY, + BRAINTRUST_ENDPOINT, +} = assertEnvVars({ + OPENAI_API_KEY: "", + OPENAI_ENDPOINT: "", + OPENAI_API_VERSION: "", + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", + ...BRAINTRUST_ENV_VARS, +}); + +interface AssessRelevanceEvalCase { + input: { + prompt: string; + expectedResponse: string; + }; + expected: Pick<Relevance, "averages">; + tags?: MongoDbTag[]; +} + +const evalCases: AssessRelevanceEvalCase[] = [ + { + // High similarity + input: { + prompt: "When to use $pull and $push mongodb", + expectedResponse: + "Use the $pull operator when you want to remove a value or values that match specific conditions from an existing array. \nUse the $push operator when you want to add a specific value to an array.
", + }, + expected: { + averages: { + cos_similarity: 0.9639615103046141, + norm_sq_mag_diff: 1.876484720646915e-11, + }, + }, + }, + { + // Low similarity + input: { + prompt: "give me an example of how to use the $and operator", + expectedResponse: + "The following example returns inventory documents where the price is greater than 25 and the quantity is less than 20:\n\ndb.inventory.find( {\n $and: [\n { price: { $gt: 25 } },\n { quantity: { $lt: 20 } }\n ]\n} )", + }, + expected: { + averages: { + cos_similarity: 0.3442199438560915, + norm_sq_mag_diff: 1.0454893515591396e-10, + }, + }, + }, +]; + +const cosSimilarityScorer: Scorer, unknown> = ({ + output, + expected, +}) => ({ + name: `closeCosSimilarity`, + score: + expected === undefined + ? 0 + : 1 - + Math.abs( + output.averages.cos_similarity - expected.averages.cos_similarity + ), +}); + +const model = wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("o3"), + middleware: [BraintrustMiddleware({ debug: true })], +}); + +const openAiClient = new AzureOpenAI({ + apiKey: OPENAI_API_KEY, + endpoint: OPENAI_ENDPOINT, + apiVersion: OPENAI_API_VERSION, +}); + +const embedders = [ + makeOpenAiEmbedder({ + openAiClient, + deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + backoffOptions: { + numOfAttempts: 25, + startingDelay: 1000, + }, + }), +]; + +const generate = makeSimpleTextGenerator({ + model: wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("gpt-4.1"), + middleware: [BraintrustMiddleware({ debug: true })], + }), +}); + +Eval("assess-prompt-response-relevance", { + data: evalCases, + experimentName: "assess-relevance", + metadata: { + description: "Evaluates assessRelevance().", + model: model.modelId, + }, + maxConcurrency: 10, + async task({ prompt, expectedResponse }) { + try { + return await assessRelevance({ + prompt, + expectedResponse, + embedders, + generate, + }); + } catch (error) { + console.error(`Error evaluating input: ${prompt} - ${expectedResponse}`); + console.error(error); + throw error; + } + }, + scores: [cosSimilarityScorer], +}); diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessRelevance.ts new file mode 100644 index 000000000..4ecd42f96 --- /dev/null +++ b/packages/scripts/src/assessRelevance.ts @@ -0,0 +1,177 @@ +import { strict as assert } from "assert"; +import { normalizedSquareMagnitudeDifference } from "./squareMagnitude"; +import { calculateEmbeddings } from "./calculateEmbeddings"; +import { SimpleTextGenerator } from "./SimpleTextGenerator"; +import { Embedder } from "mongodb-rag-core"; +import { + PromptAndEmbeddings, + Relevance, + RelevanceMetrics, + ScoredPromptAndEmbeddings, +} from "./Case"; +import { cosineSimilarity } from "mongodb-rag-core/aiSdk"; + +/** + Given the expected answer, generate a number of possible prompts that could + elicit that expected answer. + */ +export const generatePromptsFromExpectedAnswer = async ({ + expectedResponse, + embedders, + generate, + howMany, +}: { + expectedResponse: string; + embedders: Embedder[]; + generate: SimpleTextGenerator; + howMany: number; +}): Promise => { + const variants = await generate({ + prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. +Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. +Just return the generated question. 
+ +Expected answer:\n\n${expectedResponse}`, + n: howMany, + temperature: 0.5, + }); + + return await Promise.all( + variants.map(async (text) => { + return { + prompt: text, + embeddings: await calculateEmbeddings({ embedders, text }), + }; + }) + ); +}; + +export const scoreVariants = ({ + original, + variants, +}: { + original: PromptAndEmbeddings; + variants: PromptAndEmbeddings[]; +}): ScoredPromptAndEmbeddings[] => { + return variants.map( + ({ embeddings: variantEmbeddings, prompt }): ScoredPromptAndEmbeddings => { + const relevance = Object.fromEntries( + Object.entries(original.embeddings).map( + ([model, originalEmbedding]): [string, RelevanceMetrics] => { + assert( + variantEmbeddings[model] !== undefined, + `No embedding for model ${model}!` + ); + return [ + model, + { + cos_similarity: cosineSimilarity( + originalEmbedding, + variantEmbeddings[model] + ), + norm_sq_mag_diff: normalizedSquareMagnitudeDifference( + originalEmbedding, + variantEmbeddings[model] + ), + }, + ]; + } + ) + ); + return { + embeddings: variantEmbeddings, + prompt, + relevance, + }; + } + ); +}; + +export const assessRelevance = async ({ + prompt, + embedders, + expectedResponse, + generate, +}: { + prompt: string; + expectedResponse: string; + embedders: Embedder[]; + generate: SimpleTextGenerator; +}): Promise<Relevance> => { + const shortName = makeShortName(prompt); + console.log(`Calculating embeddings for '${shortName}'...`); + + const promptEmbeddings = await calculateEmbeddings({ + text: prompt, + embedders, + }); + + console.log(`Generating variants for '${shortName}'...`); + const variants = await generatePromptsFromExpectedAnswer({ + embedders, + expectedResponse, + generate, + howMany: 3, + }); + + const variantCount = Object.values(variants).length; + if (variantCount === 0) { + throw new Error(`Unexpectedly got no variants for ${shortName}!`); + } + + const scoredVariants = scoreVariants({ + original: { prompt, embeddings: promptEmbeddings }, + variants, + }); + assert(variantCount === Object.values(scoredVariants).length); + + console.log( + `- Expected: "${expectedResponse}" +- Original: "${prompt}" +- Generated variants: +${scoredVariants.map(({ prompt }) => ` - "${prompt}"`).join("\n")}` + ); + + const summedMetrics = scoredVariants.reduce( + (outer, { relevance }): RelevanceMetrics => { + const relevanceValues = Object.values(relevance); + const modelCount = relevanceValues.length; + assert(modelCount > 0); + // Sum metrics across models + const crossModel = relevanceValues.reduce( + (acc, { cos_similarity, norm_sq_mag_diff }) => ({ + cos_similarity: acc.cos_similarity + cos_similarity, + norm_sq_mag_diff: acc.norm_sq_mag_diff + norm_sq_mag_diff, + }), + { cos_similarity: 0, norm_sq_mag_diff: 0 } + ); + + // Accumulate averages across models + return { + cos_similarity: + outer.cos_similarity + crossModel.cos_similarity / modelCount, + norm_sq_mag_diff: + outer.norm_sq_mag_diff + crossModel.norm_sq_mag_diff / modelCount, + }; + }, + { cos_similarity: 0, norm_sq_mag_diff: 0 } + ); + const averages: RelevanceMetrics = { + cos_similarity: summedMetrics.cos_similarity / variantCount, + norm_sq_mag_diff: summedMetrics.norm_sq_mag_diff / variantCount, + }; + + console.log(`Average score: ${JSON.stringify(averages)}`); + return { + prompt_embeddings: promptEmbeddings, + generated_prompts: scoredVariants, + averages, + }; +}; + +export const makeShortName = (prompt: string, ellipsizeAtLength = 64) => { + assert(ellipsizeAtLength > 0); + return prompt.length > ellipsizeAtLength + ?
prompt.slice(0, ellipsizeAtLength - 3) + "..." + : prompt; +}; diff --git a/packages/scripts/src/calculateEmbeddings.ts b/packages/scripts/src/calculateEmbeddings.ts new file mode 100644 index 000000000..d9b83c3f5 --- /dev/null +++ b/packages/scripts/src/calculateEmbeddings.ts @@ -0,0 +1,21 @@ +import assert from "node:assert/strict"; +import { Embedder } from "mongodb-rag-core"; + +export const calculateEmbeddings = async ({ + text, + embedders, +}: { + text: string; + embedders: Embedder[]; +}) => { + return Object.fromEntries( + await Promise.all( + embedders.map(async (embedder): Promise<[string, number[]]> => { + const { embedding } = await embedder.embed({ text }); + const model = embedder.modelName; + assert(model !== undefined, "Missing embedder model name!"); + return [model, embedding]; + }) + ) + ); +}; diff --git a/packages/scripts/src/generateRating.eval.ts b/packages/scripts/src/generateRating.eval.ts new file mode 100644 index 000000000..ddf556061 --- /dev/null +++ b/packages/scripts/src/generateRating.eval.ts @@ -0,0 +1,170 @@ +import "dotenv/config"; +import { Eval, BraintrustMiddleware } from "braintrust"; +import { Scorer } from "autoevals"; +import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import { makeGenerateRating, PromptResponseRating } from "./generateRating"; +import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core"; + +const { BRAINTRUST_API_KEY, BRAINTRUST_ENDPOINT } = + assertEnvVars(BRAINTRUST_ENV_VARS); + +interface GenerateRatingEvalCase { + input: { + prompt: string; + expectedResponse: string; + }; + expected: PromptResponseRating; + tags?: MongoDbTag[]; +} + +const evalCases: GenerateRatingEvalCase[] = [ + { + input: { + prompt: "Is there a limit for mongodb deletemany", + expectedResponse: + "db.collection.deleteMany() removes all documents that match the filter from a collection.\n\nNOTE: If you are deleting all documents in a large collection, it may be faster to drop the collection and recreate it. Before dropping the collection, note all indexes on the collection. You must recreate any indexes that existed in the original collection. If the original collection was sharded, you must also shard the recreated collection.\n\nFor more information on dropping a collection, see db.collection.drop().", + }, + expected: { + answer_fit: 3, + answer_reasonableness: 4, + business_impact: 3, + prompt_clarity: 3, + prompt_knowledge_assumption: 2, + answer_fit_rationale: + "Prompt asks specifically about a possible limit on deleteMany; expected answer implicitly says it removes all matching docs and advises using drop for large collections but never explicitly states whether a limit exists. 
Partially addresses question but not directly or definitively.", + answer_reasonableness_rationale: + "It is reasonable for an LLM to mention that deleteMany removes all matching documents and suggest dropping the collection for large-scale deletes; that is common guidance in MongoDB docs.", + business_impact_rationale: + "Understanding deleteMany behavior is moderately important for applications that manage large datasets, but not highly strategic.", + prompt_clarity_rationale: + "Prompt is understandable but contains a typo and lacks context or detail, making it only moderately clear.", + prompt_knowledge_assumption_rationale: + "Requires only basic MongoDB CRUD knowledge; little domain depth needed.", + }, + }, + { + input: { + prompt: "What are best practices for mongodb pagination", + expectedResponse: + "The following principles are best practices for paginating your Atlas Search query results:\n- Sort queries by unique fields to prevent ties in relevance scores.\n- If you want to primarily sort by a field that is not unique, add an additional sort clause on a unique field to act as a tiebreaker.\n- Sort your query results by an immutable field to ensure that your results reflect updates made between queries. \n", + }, + expected: { + answer_fit: 2, + answer_reasonableness: 3, + business_impact: 3, + prompt_clarity: 4, + prompt_knowledge_assumption: 3, + guidance: + "Answer omits several widely-recognized pagination practices (e.g., avoiding large skip, using range queries or ‘after’ cursors, leveraging _id for seek-based pagination). Either broaden the expected answer or narrow the prompt to “Atlas Search pagination sorting best practices.”", + answer_fit_rationale: + "Prompt asks for general MongoDB pagination best practices, but expected answer covers only Atlas Search sorting guidance; many core practices are missing.", + answer_reasonableness_rationale: + "An LLM could plausibly reply with the three listed bullets, but would usually add other common tips; partial but not implausible.", + business_impact_rationale: + "Pagination efficiency affects app performance; guidance has moderate practical value.", + prompt_clarity_rationale: + "Question is concise and unambiguous, though it doesn’t specify Atlas Search scope, causing mismatch.", + prompt_knowledge_assumption_rationale: + "Requires some database and MongoDB operational knowledge, but not deep specialist expertise.", + }, + }, + { + input: { + prompt: "How to use unset field in array mongodb", + expectedResponse: + 'To use the $unset operator on a field that contains an array, use the update method with a filter to identify the specific value in the array that you want to unset and the $ operator to unset that value. \n\n```\ndb.grades.insertMany([{user: "A", grades: [90, 30, 40]}, {user: "B", grades: [30, 70, 60]}])\n\ndb.grades.updateMany({grades: 30}, {$unset: {"grades.$": 1}})\n```\n\nWhen the operation is complete, the value that matches the filter in the array is changed to null.\n', + }, + expected: { + answer_fit: 1, + answer_reasonableness: 3, + business_impact: 3, + prompt_clarity: 3, + prompt_knowledge_assumption: 3, + guidance: + 'The answer is technically incorrect: $unset cannot be combined with the positional $ operator. To null-out an array element you must reference it by index (e.g., {$unset:{"grades.0":1}}) or use $set with the positional operator. 
Clarify the prompt (“How do I unset/remove a single element inside an array in MongoDB?”) and ensure the expected answer reflects MongoDB’s documented behavior.', + answer_fit_rationale: + '$unset with "grades.$" is invalid in MongoDB; the operation shown would fail. Therefore the answer does not correctly satisfy the prompt.', + answer_reasonableness_rationale: + "Given the vague prompt, an LLM might guess the misuse of $unset and $; although wrong, it’s a plausible but not authoritative response.", + business_impact_rationale: + "Array updates are common developer tasks; wrong guidance could cause wasted time but not critical business failure.", + prompt_clarity_rationale: + "Prompt is understandable but ungrammatical; intent is clear enough.", + prompt_knowledge_assumption_rationale: + "Requires basic MongoDB update knowledge; moderate domain specificity.", + }, + }, + { + input: { + prompt: "How can i combine vector search with lexical search?", + expectedResponse: "[TO FILL] update when $rankFusion syntax is released", + }, + expected: { + answer_fit: 1, + answer_reasonableness: 1, + business_impact: 3, + prompt_clarity: 4, + prompt_knowledge_assumption: 3, + guidance: + "The expected answer is missing, so scores for fit and reasonableness are 1. Provide a complete answer explaining concrete techniques (e.g., hybrid search, result fusion, the upcoming $rankFusion syntax) and practical examples. Clarify any prerequisites, such as the search engine or framework in use, to let respondents tailor detailed guidance.", + answer_fit_rationale: + "No substantive content; expected answer is a placeholder.", + answer_reasonableness_rationale: + "An LLM cannot derive a full answer from the placeholder; unreasonable to expect a correct output.", + business_impact_rationale: + "Hybrid search guidance is valuable for many search applications, but impact is undermined by missing answer.", + prompt_clarity_rationale: "Direct, concise question; understandable.", + prompt_knowledge_assumption_rationale: + "Assumes familiarity with search paradigms but not excessively specialized.", + }, + }, +]; + +const scores = ( + [ + "answer_fit", + "answer_reasonableness", + "business_impact", + "prompt_clarity", + "prompt_knowledge_assumption", + ] satisfies (keyof PromptResponseRating)[] +).map((key): Scorer<PromptResponseRating, unknown> => { + return ({ output, expected }) => ({ + name: `correct_${key}`, + score: 1 - Math.abs(output[key] - (expected?.[key] ??
0)) / 4, + }); +}); + +const model = wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("o3"), + middleware: [BraintrustMiddleware({ debug: true })], +}); + +const generateRating = makeGenerateRating({ + model, +}); + +Eval("generate-prompt-response-rating", { + data: evalCases, + experimentName: "response", + metadata: { + description: + "Evaluates the quality of LLM as judge in rating prompt & expected response pairs.", + model: model.modelId, + }, + maxConcurrency: 10, + async task(input) { + try { + return await generateRating(input); + } catch (error) { + console.error(`Error evaluating input: ${input}`); + console.error(error); + throw error; + } + }, + scores, +}); diff --git a/packages/scripts/src/generateRating.ts b/packages/scripts/src/generateRating.ts new file mode 100644 index 000000000..909eb8562 --- /dev/null +++ b/packages/scripts/src/generateRating.ts @@ -0,0 +1,95 @@ +import { generateObject, LanguageModel } from "mongodb-rag-core/aiSdk"; +import { makeShortName } from "./assessRelevance"; +import z from "zod"; + +const Rating = z.number().min(1).max(5); +const Rationale = (name: string) => + z + .string() + .optional() + .describe( + `A terse and direct chain of thought or rationale for why you gave the ${name} score.` + ); +export const PromptResponseRating = z.object({ + answer_fit: Rating.describe( + "How well the expected answer actually matches the prompt." + ), + answer_reasonableness: Rating.describe( + "How reasonable it would be to expect an LLM to produce the given response from the given prompt." + ), + business_impact: Rating.describe( + "The business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad examples: when was MongoDB founded?" + ), // TODO: Clarify this - it is intended to evaluate how useful the prompt actually is - something you can just google? Or something people might actually need help with? + prompt_clarity: Rating.describe( + "How well-formulated and clear the prompt is." + ), + prompt_knowledge_assumption: Rating.describe( + "How much domain-specific knowledge might be required to effectively answer the prompt." + ), + + guidance: z + .string() + .optional() + .describe( + "TERSELY and DIRECTLY detail the issue; suggest how to improve the prompt and/or expected response. Only include this if ANY of the above scores <= 2" + ), + + answer_fit_rationale: Rationale("answer fit"), + answer_reasonableness_rationale: Rationale("answer reasonableness"), + business_impact_rationale: Rationale("business impact"), + prompt_clarity_rationale: Rationale("prompt clarity"), + prompt_knowledge_assumption_rationale: Rationale( + "prompt knowledge assumption" + ), +}); + +export type PromptResponseRating = z.infer<typeof PromptResponseRating>; + +const schemaName = "prompt_response_rating"; + +/** + Creates a function that, given the prompt and expected response pair, uses the + LLM to assess the quality on a variety of metrics and provide recommendations + for improvement. + */ +export const makeGenerateRating = ({ model }: { model: LanguageModel }) => { + return async ({ + prompt, + expectedResponse, + }: { + prompt: string; + expectedResponse: string; + }): Promise<PromptResponseRating> => { + const shortName = makeShortName(prompt); + console.log(`Rating '${shortName}' with LLM...`); + + const result = await generateObject({ + // TODO: TOMAYBEDO: Add more context to 1-5, add more context on different scores...
+ // - Output chain of thought - helpful for debugging + prompt: ` +Evaluate the quality of the following prompt-expected answer pair across +multiple dimensions. Return your evaluation as a JSON object with numeric scores +from 1 (poor) to 5 (excellent). + +Now evaluate this prompt/expected answer pair: + + +${prompt} + + +${expectedResponse} + + +Format the response in a '${schemaName}' JSON object. +`.trim(), + model, + schema: PromptResponseRating, + schemaName, + schemaDescription: "Ratings for prompt response pair.", + }); + + return result.object; + }; +}; + +export type GenerateRating = ReturnType<typeof makeGenerateRating>; diff --git a/packages/scripts/src/main/assessCasesMain.ts b/packages/scripts/src/main/assessCasesMain.ts new file mode 100644 index 000000000..85bc4a6b2 --- /dev/null +++ b/packages/scripts/src/main/assessCasesMain.ts @@ -0,0 +1,124 @@ +import { AzureOpenAI } from "mongodb-rag-core/openai"; +import { + assertEnvVars, + BRAINTRUST_ENV_VARS, + makeOpenAiEmbedder, +} from "mongodb-rag-core"; +import { MongoClient } from "mongodb"; +import { Case } from "../Case"; +import { makeSimpleTextGenerator } from "../SimpleTextGenerator"; +import "dotenv/config"; +import { assessRelevance, makeShortName } from "../assessRelevance"; +import { makeGenerateRating } from "../generateRating"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import { BraintrustMiddleware } from "mongodb-rag-core/braintrust"; + +const assessRelevanceMain = async () => { + const { + FROM_CONNECTION_URI, + FROM_DATABASE_NAME, + OPENAI_API_KEY, + OPENAI_ENDPOINT, + OPENAI_API_VERSION, + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + CASE_COLLECTION_NAME, + BRAINTRUST_API_KEY, + BRAINTRUST_ENDPOINT, + } = assertEnvVars({ + FROM_CONNECTION_URI: "", + FROM_DATABASE_NAME: "", + OPENAI_API_KEY: "", + OPENAI_ENDPOINT: "", + OPENAI_API_VERSION: "", + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", + CASE_COLLECTION_NAME: "", + ...BRAINTRUST_ENV_VARS, + }); + + const openAiClient = new AzureOpenAI({ + apiKey: OPENAI_API_KEY, + endpoint: OPENAI_ENDPOINT, + apiVersion: OPENAI_API_VERSION, + }); + + const embedders = [ + makeOpenAiEmbedder({ + openAiClient, + deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + backoffOptions: { + numOfAttempts: 25, + startingDelay: 1000, + }, + }), + ]; + + const generate = makeSimpleTextGenerator({ + model: wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("gpt-4.1"), + middleware: [BraintrustMiddleware({ debug: true })], + }), + }); + + const judgmentModel = wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("o3"), + middleware: [BraintrustMiddleware({ debug: true })], + }); + const generateRating = makeGenerateRating({ model: judgmentModel }); + + const client = await MongoClient.connect(FROM_CONNECTION_URI); + try { + console.log( + `Fetching cases ${FROM_DATABASE_NAME}.${CASE_COLLECTION_NAME}...` + ); + const db = client.db(FROM_DATABASE_NAME); + const collection = db.collection<Case>(CASE_COLLECTION_NAME); + const cases = await collection.find().toArray(); + const relevancePromises = cases.map( + async ({ _id, name: prompt, expected: expectedResponse }) => { + const shortName = makeShortName(prompt); + + const relevance = await assessRelevance({ + prompt, + expectedResponse, + embedders, + generate, + }); + + const prompt_response_rating = await generateRating({ + prompt, + expectedResponse, + }); + console.log(`Updating '${shortName}'...`); + const
updateResult = await collection.updateOne( + { + _id, + }, + { + $set: { + relevance, + prompt_response_rating, + }, + } + ); + + if (updateResult.modifiedCount === 1) { + console.log(`Updated '${shortName}'.`); + } else { + console.warn(`Failed to update '${shortName}' (${_id})`); + } + } + ); + + await Promise.allSettled(relevancePromises); + } finally { + await client.close(); + } +}; + +assessRelevanceMain(); diff --git a/packages/scripts/src/squareMagnitude.test.ts b/packages/scripts/src/squareMagnitude.test.ts new file mode 100644 index 000000000..a4b5ebb8e --- /dev/null +++ b/packages/scripts/src/squareMagnitude.test.ts @@ -0,0 +1,108 @@ +import { + normalizedSquareMagnitude, + normalizedSquareMagnitudeDifference, + squareMagnitude, + squareMagnitudeDifference, +} from "./squareMagnitude"; + +describe("squareMagnitude", () => { + it("returns 0 for an empty vector", () => { + expect(squareMagnitude([])).toBe(0); + }); + + it("returns correct value for a single-element vector", () => { + expect(squareMagnitude([3])).toBe(9); + }); + + it("returns correct value for a multi-element vector", () => { + // [1, 2, 2] → 1^2 + 2^2 + 2^2 = 1 + 4 + 4 = 9 + expect(squareMagnitude([1, 2, 2])).toBe(9); + }); + + it("handles negative values correctly", () => { + // [-3, 4] → 9 + 16 = 25 + expect(squareMagnitude([-3, 4])).toBe(25); + }); +}); + +describe("squareMagnitudeDifference", () => { + it("returns 0 for identical vectors", () => { + expect(squareMagnitudeDifference([1, 2], [1, 2])).toBe(0); + }); + + it("returns correct difference for vectors of different magnitudes", () => { + const result = squareMagnitudeDifference([1, 2], [2, 2]); + // [1,2] → 1^2 + 2^2 = 5; [2,2] → 2^2 + 2^2 = 8; diff = 3 + expect(result).toBe(3); + }); + + it("is symmetric with respect to argument order", () => { + const a = [3, 4]; // mag^2 = 25 + const b = [0, 5]; // mag^2 = 25 + expect(squareMagnitudeDifference(a, b)).toBe(0); + expect(squareMagnitudeDifference(b, a)).toBe(0); + }); + + it("throws an error for vectors of different dimensions", () => { + expect(() => squareMagnitudeDifference([1, 2], [1, 2, 3])).toThrow( + "Vector length mismatch!" 
+ ); + }); +}); + +describe("normalizedSquareMagnitude", () => { + it("returns 0 for a zero vector", () => { + expect(normalizedSquareMagnitude([0, 0, 0])).toBe(0); + }); + + it("returns 1 for a vector of all ones", () => { + expect(normalizedSquareMagnitude([1, 1, 1, 1])).toBe(1); + }); + + it("returns correct value for mixed values", () => { + // [1, 0, -1] → squares: [1, 0, 1] → sum: 2 → norm: 2 / 3 + expect(normalizedSquareMagnitude([1, 0, -1])).toBeCloseTo(2 / 3); + }); + + it("returns correct value for single-element vector", () => { + expect(normalizedSquareMagnitude([0.5])).toBeCloseTo(0.25); + }); + + it("throws for out-of-bound values", () => { + expect(() => normalizedSquareMagnitude([1.5])).toThrow(); + expect(() => normalizedSquareMagnitude([-1.1])).toThrow(); + }); + + it("throws for empty vector", () => { + expect(() => normalizedSquareMagnitude([])).toThrow(); + }); +}); + +describe("normalizedSquareMagnitudeDifference", () => { + it("returns 0 for identical vectors", () => { + const v = [0.5, -0.5, 0]; + expect(normalizedSquareMagnitudeDifference(v, v)).toBe(0); + }); + + it("returns correct difference for different vectors", () => { + const a = [1, 0, 0]; // mag² = 1, norm = 1/3 + const b = [1, 1, 1]; // mag² = 3, norm = 1 + expect(normalizedSquareMagnitudeDifference(a, b)).toBeCloseTo(2 / 3); + }); + + it("handles negative values correctly", () => { + const a = [-1, 0]; // mag² = 1, norm = 0.5 + const b = [0, 0]; // mag² = 0, norm = 0 + expect(normalizedSquareMagnitudeDifference(a, b)).toBe(0.5); + }); + + it("throws for vectors with out-of-bound values", () => { + expect(() => normalizedSquareMagnitudeDifference([2], [0])).toThrow(); + expect(() => normalizedSquareMagnitudeDifference([0], [-2])).toThrow(); + }); + + it("throws for empty vectors", () => { + expect(() => normalizedSquareMagnitudeDifference([], [0])).toThrow(); + expect(() => normalizedSquareMagnitudeDifference([0], [])).toThrow(); + }); +}); diff --git a/packages/scripts/src/squareMagnitude.ts b/packages/scripts/src/squareMagnitude.ts new file mode 100644 index 000000000..2fba15dcb --- /dev/null +++ b/packages/scripts/src/squareMagnitude.ts @@ -0,0 +1,45 @@ +import assert from "node:assert/strict"; + +/** + Calculates the square magnitude of an N-dimensional vector. + */ +export const squareMagnitude = (vector: number[]): number => { + return vector.reduce((acc, cur) => acc + Math.pow(cur, 2), 0); +}; + +/** + For vectors where each dimension is in [−1,1], returns a square + magnitude normalized to [0, 1]. + */ +export const normalizedSquareMagnitude = (vector: number[]): number => { + assert( + vector.every((value) => -1 <= value && value <= 1), + "Vector dimension outside [-1, 1] will create meaningless normalization" + ); + assert(vector.length > 0, "Vector must have dimensionality!"); + return squareMagnitude(vector) / vector.length; +}; + +/** + Calculates the absolute difference between the square magnitudes of two + N-dimensional vectors. + */ +export const squareMagnitudeDifference = ( + vector0: number[], + vector1: number[] +): number => { + assert(vector0.length === vector1.length, "Vector length mismatch!"); + return Math.abs(squareMagnitude(vector1) - squareMagnitude(vector0)); +}; + +/** + For two vectors where each dimension is in [−1,1], returns the absolute + difference between their square magnitudes normalized to [0, 1].
+ */ +export const normalizedSquareMagnitudeDifference = ( + vector0: number[], + vector1: number[] +): number => { + return Math.abs( + normalizedSquareMagnitude(vector0) - normalizedSquareMagnitude(vector1) + ); +};
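Note on the new relevance metrics (illustrative sketch, not part of this diff): the snippet below shows how the two RelevanceMetrics values behave on a pair of made-up 3-dimensional vectors standing in for the original prompt's embedding and one generated variant's embedding. The inline cosineSimilarity helper is a stand-in for the function imported from mongodb-rag-core/aiSdk so the snippet runs on its own.

```
// Illustrative only -- toy vectors, and an inline cosine-similarity helper
// instead of the cosineSimilarity imported from mongodb-rag-core/aiSdk.
const cosineSimilarity = (a: number[], b: number[]): number => {
  const dot = a.reduce((acc, v, i) => acc + v * b[i], 0);
  const mag = (v: number[]) => Math.sqrt(v.reduce((acc, x) => acc + x * x, 0));
  return dot / (mag(a) * mag(b));
};

// Same normalization as normalizedSquareMagnitude in squareMagnitude.ts:
// sum of squares divided by dimensionality, so the result stays in [0, 1].
const normalizedSquareMagnitude = (v: number[]): number =>
  v.reduce((acc, x) => acc + x * x, 0) / v.length;

// Stand-ins for the embedding of the original prompt and of a generated variant.
const original = [0.1, -0.4, 0.8];
const variant = [0.12, -0.35, 0.75];

const metrics = {
  // Range [-1, 1]; closer to 1 means the two prompts point the same way.
  cos_similarity: cosineSimilarity(original, variant),
  // Range [0, 1]; lower means the vectors' magnitudes are closer.
  norm_sq_mag_diff: Math.abs(
    normalizedSquareMagnitude(original) - normalizedSquareMagnitude(variant)
  ),
};

console.log(metrics); // roughly { cos_similarity: ~0.999, norm_sq_mag_diff: ~0.037 }
```

For near-parallel vectors like these, cos_similarity lands near 1 while norm_sq_mag_diff stays near 0; assessRelevance averages both metrics across the generated variants and across embedding models.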