-
Notifications
You must be signed in to change notification settings - Fork 75
Implement case analysis script #844
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
12139ea
Case analysis wip
cbush 9efb650
WIP
cbush 0e9f998
WIP
cbush 25632a6
WIP
cbush 7a318fa
LLM as judge
cbush 822c4cd
WIP
cbush a7c2436
WIP
cbush ff92704
Merge remote-tracking branch 'origin/main' into case-analysis
cbush 7d8cefb
Eval WIP
cbush 0fe833d
WIP
cbush 26e57d8
Add other scores
cbush da4df5f
WIP
cbush 17abf77
Add evals
cbush 372ee47
Update docs
cbush df8da1f
Move dependent generate to assessRelevance
cbush File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import z from "zod"; | ||
import { PromptResponseRating } from "./generateRating"; | ||
|
||
// Map of embedding model name -> vector (array of numbers) | ||
export const Embeddings = z.record(z.string(), z.number().array()); | ||
|
||
export type Embeddings = z.infer<typeof Embeddings>; | ||
|
||
/**
  A prompt paired with its embeddings across one or more embedding models.
 */
export const PromptAndEmbeddings = z.object({
  // The prompt text that was embedded.
  prompt: z.string(),
  // embedding model name -> vector for `prompt`
  embeddings: Embeddings,
});

export type PromptAndEmbeddings = z.infer<typeof PromptAndEmbeddings>;
|
||
/**
  Answer relevance: given prompt and expected answer pair, generate N possible
  prompts that would elicit that answer, then compare their embeddings with the
  embedding of the original prompt.
 */
export const RelevanceMetrics = z.object({
  /**
    Normalized square magnitude difference. Lower = closer = better. This gives
    an idea of how close the vectors are to each other in their N-dimensional
    space, but doesn't seem to work as well as cos_similarity.
   */
  norm_sq_mag_diff: z.number(),
  /**
    Cosine similarity: are vectors pointing the same way? Range [-1, 1].
    Higher = more similar = better.
   */
  cos_similarity: z.number(),
});

export type RelevanceMetrics = z.infer<typeof RelevanceMetrics>;
|
||
export const ScoredPromptAndEmbeddings = PromptAndEmbeddings.and( | ||
z.object({ | ||
relevance: | ||
// embedding model name -> score | ||
z.record(z.string(), RelevanceMetrics), | ||
}) | ||
); | ||
|
||
export type ScoredPromptAndEmbeddings = z.infer< | ||
typeof ScoredPromptAndEmbeddings | ||
>; | ||
|
||
/**
  Full relevance assessment for one prompt/expected-answer pair: the original
  prompt's embeddings, the generated alternative prompts with their scores,
  and the metric averages across all generated prompts.
 */
export const Relevance = z.object({
  // Embeddings of the original prompt, keyed by embedding model name.
  prompt_embeddings: Embeddings,
  // Prompts generated to elicit the expected answer, each scored against the
  // original prompt's embeddings.
  generated_prompts: ScoredPromptAndEmbeddings.array(),
  // Averages of the metrics over all generated prompts.
  averages: RelevanceMetrics,
});

export type Relevance = z.infer<typeof Relevance>;
|
||
/**
  A single analysis case: a chat-style prompt with its expected answer, plus
  optional fields populated later by the analysis pipeline.
 */
export const Case = z.object({
  // Case category (free-form string).
  type: z.string(),
  tags: z.string().array(),
  name: z.string(),
  // Chat-style messages (role + content) that make up the prompt.
  prompt: z
    .object({
      content: z.string(),
      role: z.string(),
    })
    .array(),
  // The expected answer to the prompt.
  expected: z.string(),

  // Fields to add (populated by the analysis pipeline; absent on raw cases).
  prompt_embeddings: Embeddings.optional(),
  relevance: Relevance.optional(),
  prompt_response_rating: PromptResponseRating.optional(),
});

export type Case = z.infer<typeof Case>;
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import { generateText, LanguageModel } from "mongodb-rag-core/aiSdk"; | ||
|
||
export const makeSimpleTextGenerator = ({ | ||
model, | ||
systemPrompt, | ||
}: { | ||
model: LanguageModel; | ||
systemPrompt?: string; | ||
}) => { | ||
return async ({ | ||
prompt, | ||
temperature = 0, | ||
n = 1, | ||
}: { | ||
prompt: string; | ||
temperature?: number; | ||
|
||
n?: number; | ||
}): Promise<string[]> => { | ||
const result = await Promise.all( | ||
Array(n) | ||
.fill(0) | ||
.map(async () => | ||
generateText({ | ||
model, | ||
prompt, | ||
system: systemPrompt, | ||
temperature, | ||
}) | ||
) | ||
); | ||
return result.map(({ text }) => text); | ||
}; | ||
}; | ||
|
||
export type SimpleTextGenerator = ReturnType<typeof makeSimpleTextGenerator>; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
import "dotenv/config"; | ||
import { Eval, BraintrustMiddleware } from "braintrust"; | ||
import { Scorer } from "autoevals"; | ||
import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata"; | ||
import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; | ||
import { | ||
assertEnvVars, | ||
BRAINTRUST_ENV_VARS, | ||
makeOpenAiEmbedder, | ||
} from "mongodb-rag-core"; | ||
import { Relevance } from "./Case"; | ||
import { assessRelevance } from "./assessRelevance"; | ||
import { AzureOpenAI } from "mongodb-rag-core/openai"; | ||
import { makeSimpleTextGenerator } from "./SimpleTextGenerator"; | ||
|
||
// Validate and pull required configuration from the environment at startup;
// assertEnvVars throws if any variable is missing.
const {
  OPENAI_API_KEY,
  OPENAI_ENDPOINT,
  OPENAI_API_VERSION,
  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
  BRAINTRUST_API_KEY,
  BRAINTRUST_ENDPOINT,
} = assertEnvVars({
  OPENAI_API_KEY: "",
  OPENAI_ENDPOINT: "",
  OPENAI_API_VERSION: "",
  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "",
  ...BRAINTRUST_ENV_VARS,
});
|
||
/**
  One eval case for assessRelevance(): a prompt/expected-response pair and the
  relevance averages we expect the assessment to produce.
 */
interface AssessRelevanceEvalCase {
  input: {
    prompt: string;
    expectedResponse: string;
  };
  // Only the averages are compared; the per-prompt details are not asserted.
  expected: Pick<Relevance, "averages">;
  tags?: MongoDbTag[];
}
|
||
// Reference cases with previously observed metric values; the scorer measures
// how close a new run's averages come to these.
const evalCases: AssessRelevanceEvalCase[] = [
  {
    // High similarity
    input: {
      prompt: "When to use $pull and $push mongodb",
      expectedResponse:
        "Use the $pull operator when you want to remove a value or values that match specific conditions from an existing array. \nUse the $push operator when you want to add a specific value to an array. ",
    },
    expected: {
      averages: {
        cos_similarity: 0.9639615103046141,
        norm_sq_mag_diff: 1.876484720646915e-11,
      },
    },
  },
  {
    // Low similarity
    input: {
      prompt: "give me an example of how to use the $and operator",
      expectedResponse:
        "The following example returns inventory documents where the price is greater than 25 and the quantity is less than 20:\n\ndb.inventory.find( {\n $and: [\n { price: { $gt: 25 } },\n { quantity: { $lt: 20 } }\n ]\n} )",
    },
    expected: {
      averages: {
        cos_similarity: 0.3442199438560915,
        norm_sq_mag_diff: 1.0454893515591396e-10,
      },
    },
  },
];
|
||
const cosSimilarityScorer: Scorer<Pick<Relevance, "averages">, unknown> = ({ | ||
output, | ||
expected, | ||
}) => ({ | ||
name: `closeCosSimilarity`, | ||
score: | ||
expected === undefined | ||
? 0 | ||
: 1 - | ||
Math.abs( | ||
output.averages.cos_similarity - expected.averages.cos_similarity | ||
), | ||
}); | ||
|
||
// Judge/analysis model, routed through the Braintrust AI proxy with tracing
// middleware enabled.
const model = wrapLanguageModel({
  model: createOpenAI({
    apiKey: BRAINTRUST_API_KEY,
    baseURL: BRAINTRUST_ENDPOINT,
  }).chat("o3"),
  middleware: [BraintrustMiddleware({ debug: true })],
});

// Azure OpenAI client used for computing embeddings.
const openAiClient = new AzureOpenAI({
  apiKey: OPENAI_API_KEY,
  endpoint: OPENAI_ENDPOINT,
  apiVersion: OPENAI_API_VERSION,
});

// Embedders to score relevance with; retries with backoff to ride out rate
// limits.
const embedders = [
  makeOpenAiEmbedder({
    openAiClient,
    deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
    backoffOptions: {
      numOfAttempts: 25,
      startingDelay: 1000,
    },
  }),
];

// Text generator used to produce the alternative prompts, also routed through
// the Braintrust proxy (note: a different model than the judge above).
const generate = makeSimpleTextGenerator({
  model: wrapLanguageModel({
    model: createOpenAI({
      apiKey: BRAINTRUST_API_KEY,
      baseURL: BRAINTRUST_ENDPOINT,
    }).chat("gpt-4.1"),
    middleware: [BraintrustMiddleware({ debug: true })],
  }),
});
|
||
// Braintrust eval entry point: runs assessRelevance() over each case and
// scores how close the averages land to the reference values.
Eval("assess-prompt-response-relevance", {
  data: evalCases,
  experimentName: "assess-relevance",
  metadata: {
    description: "Evaluates assessRelevance().",
    model: model.modelId,
  },
  maxConcurrency: 10,
  async task({ prompt, expectedResponse }) {
    try {
      return await assessRelevance({
        prompt,
        expectedResponse,
        embedders,
        generate,
      });
    } catch (error) {
      // Log which case failed before rethrowing so Braintrust records the
      // error against the experiment.
      console.error(`Error evaluating input: ${prompt} - ${expectedResponse}`);
      console.error(error);
      throw error;
    }
  },
  scores: [cosSimilarityScorer],
});
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
alphabetized