From 12139ea3ce0260640cf76c135df51683b0bf63d4 Mon Sep 17 00:00:00 2001
From: Chris Bush
Date: Thu, 24 Jul 2025 14:51:23 -0400
Subject: [PATCH 01/14] Case analysis wip

---
 packages/scripts/src/Case.ts                  |  62 ++++++++
 packages/scripts/src/SimpleTextGenerator.ts   |  49 +++++++
 packages/scripts/src/assessRelevance.ts       | 135 ++++++++++++++++++
 packages/scripts/src/calculateEmbeddings.ts   |  21 +++
 packages/scripts/src/embedPrompts.ts          |  83 +++++++++++
 .../scripts/src/main/assessRelevanceMain.ts   |  93 ++++++++++++
 packages/scripts/src/main/llmJudgmentMain.ts  |  26 ++++
 packages/scripts/src/squareMagnitude.test.ts  | 108 ++++++++++++++
 packages/scripts/src/squareMagnitude.ts       |  45 ++++++
 9 files changed, 622 insertions(+)
 create mode 100644 packages/scripts/src/Case.ts
 create mode 100644 packages/scripts/src/SimpleTextGenerator.ts
 create mode 100644 packages/scripts/src/assessRelevance.ts
 create mode 100644 packages/scripts/src/calculateEmbeddings.ts
 create mode 100644 packages/scripts/src/embedPrompts.ts
 create mode 100644 packages/scripts/src/main/assessRelevanceMain.ts
 create mode 100644 packages/scripts/src/main/llmJudgmentMain.ts
 create mode 100644 packages/scripts/src/squareMagnitude.test.ts
 create mode 100644 packages/scripts/src/squareMagnitude.ts

diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts
new file mode 100644
index 000000000..da144e1cb
--- /dev/null
+++ b/packages/scripts/src/Case.ts
+import z from "zod";
+
+// Map of embedding model name -> vector (array of numbers)
+export const Embeddings = z.record(z.string(), z.number().array());
+
+export type Embeddings = z.infer<typeof Embeddings>;
+
+export const PromptAndEmbeddings = z.object({
+  prompt: z.string(),
+  embeddings: Embeddings,
+});
+
+export type PromptAndEmbeddings = z.infer<typeof PromptAndEmbeddings>;
+
+export const ScoredPromptAndEmbeddings = PromptAndEmbeddings.and(
+  z.object({
+    // embedding model name -> score
+    relevance: z.record(z.string(), z.number()),
+  })
+);
+
+export type ScoredPromptAndEmbeddings = z.infer<
+  typeof ScoredPromptAndEmbeddings
+>;
+
+export const LlmAsJudgment = z.object({
+  reasonableness: z.number(),
+  clarity: z.number(),
+  specificity: z.number(),
+  fit: z.number(),
+  assumption: z.number(),
+});
+
+export type LlmAsJudgment = z.infer<typeof LlmAsJudgment>;
+
+export const Relevance = z.object({
+  prompt_embeddings: Embeddings,
+  generated_prompts: ScoredPromptAndEmbeddings.array(),
+  average: z.number(),
+});
+
+export type Relevance = z.infer<typeof Relevance>;
+
+export const Case = z.object({
+  type: z.string(),
+  tags: z.string().array(),
+  name: z.string(),
+  prompt: z
+    .object({
+      content: z.string(),
+      role: z.string(),
+    })
+    .array(),
+  expected: z.string(),
+
+  // Fields to add
+  prompt_embeddings: Embeddings.optional(),
+  relevance: Relevance.optional(),
+  llm_as_judgment: LlmAsJudgment.optional(),
+});
+
+export type Case = z.infer<typeof Case>;
diff --git a/packages/scripts/src/SimpleTextGenerator.ts b/packages/scripts/src/SimpleTextGenerator.ts
new file mode 100644
index 000000000..2d05c8c13
--- /dev/null
+++ b/packages/scripts/src/SimpleTextGenerator.ts
+import { OpenAI } from "mongodb-rag-core/openai";
+
+export const makeSimpleTextGenerator = ({
+  client,
+  model,
+  systemPrompt,
+}: {
+  client: OpenAI;
+  model: string;
+  systemPrompt?: string;
+}) => {
+  return async ({
+    prompt,
+    temperature = 0,
+    maxTokens = 1500,
+    n = 1,
+  }: {
+    prompt: string;
+    temperature?: number;
+    maxTokens?: number;
+    n?: number;
+  }): Promise<string[]> => {
+    const messages = [
+      {
+        role: "system",
+        content: systemPrompt ?? "",
+      },
+      {
+        role: "user",
+        content: prompt,
+      },
+    ] satisfies OpenAI.ChatCompletionMessageParam[];
+    const result = await client.chat.completions.create({
+      model,
+      messages,
+      temperature,
+      max_tokens: maxTokens,
+      n,
+    });
+    return result.choices.map(({ message: { content } }) => {
+      if (content === null) {
+        throw new Error(`Failed to generate content!`);
+      }
+      return content;
+    });
+  };
+};
+
+export type SimpleTextGenerator = ReturnType<typeof makeSimpleTextGenerator>;
diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessRelevance.ts
new file mode 100644
index 000000000..1f751d422
--- /dev/null
+++ b/packages/scripts/src/assessRelevance.ts
+import { strict as assert } from "assert";
+import { normalizedSquareMagnitudeDifference } from "./squareMagnitude";
+import { calculateEmbeddings } from "./calculateEmbeddings";
+import { SimpleTextGenerator } from "./SimpleTextGenerator";
+import { Embedder } from "mongodb-rag-core";
+import {
+  PromptAndEmbeddings,
+  Relevance,
+  ScoredPromptAndEmbeddings,
+} from "./Case";
+
+export const makeShortName = (prompt: string) => {
+  return prompt.length > 32 ? prompt.slice(0, 29) + "..." : prompt;
+};
+
+/**
+  Given the expected answer, generate a number of possible prompts that could
+  elicit that expected answer.
+ */
+export const generatePromptsFromExpectedAnswer = async ({
+  expected,
+  embedders,
+  generate,
+  howMany,
+}: {
+  expected: string;
+  embedders: Embedder[];
+  generate: SimpleTextGenerator;
+  howMany: number;
+}): Promise<PromptAndEmbeddings[]> => {
+  const variants = await generate({
+    prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. Just return the generated question. Expected answer:\n\n${expected}`,
+    n: howMany,
+  });
+
+  return await Promise.all(
+    variants.map(async (text) => {
+      return {
+        prompt: text,
+        embeddings: await calculateEmbeddings({ embedders, text }),
+      };
+    })
+  );
+};
+
+export const scoreVariants = ({
+  original,
+  variants,
+}: {
+  original: PromptAndEmbeddings;
+  variants: PromptAndEmbeddings[];
+}): ScoredPromptAndEmbeddings[] => {
+  return variants.map(
+    ({ embeddings: variantEmbeddings, prompt }): ScoredPromptAndEmbeddings => {
+      const relevance = Object.fromEntries(
+        Object.entries(original.embeddings).map(
+          ([model, originalEmbedding]) => {
+            assert(
+              variantEmbeddings[model] !== undefined,
+              `No embedding for model ${model}!`
+            );
+            return [
+              model,
+              normalizedSquareMagnitudeDifference(
+                originalEmbedding,
+                variantEmbeddings[model]
+              ),
+            ];
+          }
+        )
+      );
+      return {
+        embeddings: variantEmbeddings,
+        prompt,
+        relevance,
+      };
+    }
+  );
+};
+
+export const assessRelevance = async ({
+  prompt,
+  embedders,
+  expected,
+  generate,
+}: {
+  prompt: string;
+  expected: string;
+  embedders: Embedder[];
+  generate: SimpleTextGenerator;
+}): Promise<Relevance> => {
+  const shortName = makeShortName(prompt);
+  console.log(`Calculating embeddings for '${shortName}'...`);
+
+  const promptEmbeddings = await calculateEmbeddings({
+    text: prompt,
+    embedders,
+  });
+
+  console.log(`Generating variants for '${shortName}'...`);
+  const variants = await generatePromptsFromExpectedAnswer({
+    embedders,
+    expected,
+    generate,
+    howMany: 3,
+  });
+  const variantCount = Object.values(variants).length;
+  if (variantCount === 0) {
+    throw new Error(`Unexpectedly without variants for ${shortName}!`);
+  }
+
+  console.log(`Calculating variant scores for '${shortName}'...`);
+  const scoredVariants = scoreVariants({
+    original: { prompt, embeddings: promptEmbeddings },
+ variants, + }); + assert(variantCount === Object.values(scoredVariants).length); + + const average = + scoredVariants.reduce((acc, { relevance }) => { + const relevanceValues = Object.values(relevance); + const modelCount = relevanceValues.length; + assert(modelCount > 0); + const averageAcrossModels = + relevanceValues.reduce((acc, cur) => acc + cur, 0) / modelCount; + return acc + averageAcrossModels; + }, 0) / variantCount; + + console.log(`Average score for '${shortName}': ${average}`); + return { + prompt_embeddings: promptEmbeddings, + generated_prompts: scoredVariants, + average, + }; +}; diff --git a/packages/scripts/src/calculateEmbeddings.ts b/packages/scripts/src/calculateEmbeddings.ts new file mode 100644 index 000000000..d9b83c3f5 --- /dev/null +++ b/packages/scripts/src/calculateEmbeddings.ts @@ -0,0 +1,21 @@ +import assert from "node:assert/strict"; +import { Embedder } from "mongodb-rag-core"; + +export const calculateEmbeddings = async ({ + text, + embedders, +}: { + text: string; + embedders: Embedder[]; +}) => { + return Object.fromEntries( + await Promise.all( + embedders.map(async (embedder): Promise<[string, number[]]> => { + const { embedding } = await embedder.embed({ text }); + const model = embedder.modelName; + assert(model !== undefined, "Missing embedder model name!"); + return [model, embedding]; + }) + ) + ); +}; diff --git a/packages/scripts/src/embedPrompts.ts b/packages/scripts/src/embedPrompts.ts new file mode 100644 index 000000000..5a37a9fde --- /dev/null +++ b/packages/scripts/src/embedPrompts.ts @@ -0,0 +1,83 @@ +import { AzureOpenAI } from "mongodb-rag-core/openai"; +import { assertEnvVars, makeOpenAiEmbedder } from "mongodb-rag-core"; +import { MongoClient } from "mongodb"; +import { Case } from "./Case"; + +import "dotenv/config"; + +/** + Find prompts in llm_cases that don't have embeddings populated yet and populate them. + */ +async function main() { + const { + FROM_CONNECTION_URI, + FROM_DATABASE_NAME, + OPENAI_API_KEY, + OPENAI_ENDPOINT, + OPENAI_API_VERSION, + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + } = assertEnvVars({ + FROM_CONNECTION_URI: "", + FROM_DATABASE_NAME: "", + OPENAI_API_KEY: "", + OPENAI_ENDPOINT: "", + OPENAI_API_VERSION: "", + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", + }); + + const embedder = makeOpenAiEmbedder({ + openAiClient: new AzureOpenAI({ + apiKey: OPENAI_API_KEY, + endpoint: OPENAI_ENDPOINT, + apiVersion: OPENAI_API_VERSION, + }), + deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + backoffOptions: { + numOfAttempts: 25, + startingDelay: 1000, + }, + }); + + const client = await MongoClient.connect(FROM_CONNECTION_URI); + try { + const db = client.db(FROM_DATABASE_NAME); + const collection = db.collection("llm_cases"); + + const cases = await collection + .find({ prompt_embeddings: undefined }) + .toArray(); + + const embedPromises = cases.map(async ({ name, _id }) => { + const shortName = name.length > 32 ? name.slice(0, 29) + "..." 
: name; + console.log(`Embedding '${shortName}'...`); + const { embedding } = await embedder.embed({ + text: name, + }); + console.log(`Updating '${shortName}'...`); + const updateResult = await collection.updateOne( + { + _id, + }, + { + $set: { + prompt_embeddings: { + [OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT]: embedding, + }, + }, + } + ); + + if (updateResult.modifiedCount === 1) { + console.log(`Updated '${shortName}'.`); + } else { + console.warn(`Failed to update '${shortName}' (${_id})`); + } + }); + + await Promise.allSettled(embedPromises); + } finally { + await client.close(); + } +} + +main(); diff --git a/packages/scripts/src/main/assessRelevanceMain.ts b/packages/scripts/src/main/assessRelevanceMain.ts new file mode 100644 index 000000000..2997693cb --- /dev/null +++ b/packages/scripts/src/main/assessRelevanceMain.ts @@ -0,0 +1,93 @@ +import { AzureOpenAI } from "mongodb-rag-core/openai"; +import { assertEnvVars, makeOpenAiEmbedder } from "mongodb-rag-core"; +import { MongoClient } from "mongodb"; +import { Case } from "../Case"; +import { makeSimpleTextGenerator } from "../SimpleTextGenerator"; +import "dotenv/config"; +import { assessRelevance, makeShortName } from "../assessRelevance"; + +const assessRelevanceMain = async () => { + const { + FROM_CONNECTION_URI, + FROM_DATABASE_NAME, + OPENAI_API_KEY, + OPENAI_ENDPOINT, + OPENAI_API_VERSION, + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + } = assertEnvVars({ + FROM_CONNECTION_URI: "", + FROM_DATABASE_NAME: "", + OPENAI_API_KEY: "", + OPENAI_ENDPOINT: "", + OPENAI_API_VERSION: "", + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", + }); + + const openAiClient = new AzureOpenAI({ + apiKey: OPENAI_API_KEY, + endpoint: OPENAI_ENDPOINT, + apiVersion: OPENAI_API_VERSION, + }); + + const embedders = [ + makeOpenAiEmbedder({ + openAiClient, + deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + backoffOptions: { + numOfAttempts: 25, + startingDelay: 1000, + }, + }), + ]; + + const generate = makeSimpleTextGenerator({ + client: openAiClient, + model: "gpt-4o", + }); + + const client = await MongoClient.connect(FROM_CONNECTION_URI); + try { + const db = client.db(FROM_DATABASE_NAME); + const collection = db.collection("llm_cases"); + + // Find cases where quality field has not been filled out yet + const cases = await collection + .find({ + relevance: { $exists: false }, + }) + .toArray(); + + const relevancePromises = cases.map(async ({ _id, name, expected }) => { + const shortName = makeShortName(name); + const relevance = await assessRelevance({ + prompt: name, + expected, + embedders, + generate, + }); + console.log(`Updating '${shortName}'...`); + const updateResult = await collection.updateOne( + { + _id, + }, + { + $set: { + relevance, + }, + } + ); + + if (updateResult.modifiedCount === 1) { + console.log(`Updated '${shortName}'.`); + } else { + console.warn(`Failed to update '${shortName}' (${_id})`); + } + }); + + await Promise.allSettled(relevancePromises); + } finally { + await client.close(); + } +}; + +assessRelevanceMain(); diff --git a/packages/scripts/src/main/llmJudgmentMain.ts b/packages/scripts/src/main/llmJudgmentMain.ts new file mode 100644 index 000000000..2940da75d --- /dev/null +++ b/packages/scripts/src/main/llmJudgmentMain.ts @@ -0,0 +1,26 @@ +/* + +Evaluate the quality of the following prompt-expected answer pair across +multiple dimensions. Return your evaluation as a JSON object with numeric scores +from 1 (poor) to 5 (excellent). Use the following keys: + +... 
+ +Now evaluate this pair: + +PROMPT: "Is there a limit for mongodb deletemany" EXPECTED ANSWER: +"db.collection.deleteMany() removes all documents that match the filter from a +collection. + +NOTE: If you are deleting all documents in a large collection, it may be faster +to drop the collection and recreate it. Before dropping the collection, note all +indexes on the collection. You must recreate any indexes that existed in the +original collection. If the original collection was sharded, you must also shard +the recreated collection. + +For more information on dropping a collection, see db.collection.drop()." + +Return only the JSON object. + + +*/ diff --git a/packages/scripts/src/squareMagnitude.test.ts b/packages/scripts/src/squareMagnitude.test.ts new file mode 100644 index 000000000..a4b5ebb8e --- /dev/null +++ b/packages/scripts/src/squareMagnitude.test.ts @@ -0,0 +1,108 @@ +import { + normalizedSquareMagnitude, + normalizedSquareMagnitudeDifference, + squareMagnitude, + squareMagnitudeDifference, +} from "./squareMagnitude"; + +describe("squareMagnitude", () => { + it("returns 0 for an empty vector", () => { + expect(squareMagnitude([])).toBe(0); + }); + + it("returns correct value for a single-element vector", () => { + expect(squareMagnitude([3])).toBe(9); + }); + + it("returns correct value for a multi-element vector", () => { + // [1, 2, 2] → 1^2 + 2^2 + 2^2 = 1 + 4 + 4 = 9 + expect(squareMagnitude([1, 2, 2])).toBe(9); + }); + + it("handles negative values correctly", () => { + // [-3, 4] → 9 + 16 = 25 + expect(squareMagnitude([-3, 4])).toBe(25); + }); +}); + +describe("squareMagnitudeDifference", () => { + it("returns 0 for identical vectors", () => { + expect(squareMagnitudeDifference([1, 2], [1, 2])).toBe(0); + }); + + it("returns correct difference for vectors of different magnitudes", () => { + const result = squareMagnitudeDifference([1, 2], [2, 2]); + // [1,2] → 1^2 + 2^2 = 5; [2,2] → 2^2 + 2^2 = 8; diff = 3 + expect(result).toBe(3); + }); + + it("is symmetric with respect to argument order", () => { + const a = [3, 4]; // mag^2 = 25 + const b = [0, 5]; // mag^2 = 25 + expect(squareMagnitudeDifference(a, b)).toBe(0); + expect(squareMagnitudeDifference(b, a)).toBe(0); + }); + + it("throws an error for vectors of different dimensions", () => { + expect(() => squareMagnitudeDifference([1, 2], [1, 2, 3])).toThrow( + "Vector length mismatch!" 
+ ); + }); +}); + +describe("normalizedSquareMagnitude", () => { + it("returns 0 for a zero vector", () => { + expect(normalizedSquareMagnitude([0, 0, 0])).toBe(0); + }); + + it("returns 1 for a vector of all ones", () => { + expect(normalizedSquareMagnitude([1, 1, 1, 1])).toBe(1); + }); + + it("returns correct value for mixed values", () => { + // [1, 0, -1] → squares: [1, 0, 1] → sum: 2 → norm: 2 / 3 + expect(normalizedSquareMagnitude([1, 0, -1])).toBeCloseTo(2 / 3); + }); + + it("returns correct value for single-element vector", () => { + expect(normalizedSquareMagnitude([0.5])).toBeCloseTo(0.25); + }); + + it("throws for out-of-bound values", () => { + expect(() => normalizedSquareMagnitude([1.5])).toThrow(); + expect(() => normalizedSquareMagnitude([-1.1])).toThrow(); + }); + + it("throws for empty vector", () => { + expect(() => normalizedSquareMagnitude([])).toThrow(); + }); +}); + +describe("normalizedSquareMagnitudeDifference", () => { + it("returns 0 for identical vectors", () => { + const v = [0.5, -0.5, 0]; + expect(normalizedSquareMagnitudeDifference(v, v)).toBe(0); + }); + + it("returns correct difference for different vectors", () => { + const a = [1, 0, 0]; // mag² = 1, norm = 1/3 + const b = [1, 1, 1]; // mag² = 3, norm = 1 + expect(normalizedSquareMagnitudeDifference(a, b)).toBeCloseTo(2 / 3); + }); + + it("handles negative values correctly", () => { + const a = [-1, 0]; // mag² = 1, norm = 0.5 + const b = [0, 0]; // mag² = 0, norm = 0 + expect(normalizedSquareMagnitudeDifference(a, b)).toBe(0.5); + }); + + it("throws for vectors with out-of-bound values", () => { + expect(() => normalizedSquareMagnitudeDifference([2], [0])).toThrow(); + expect(() => normalizedSquareMagnitudeDifference([0], [-2])).toThrow(); + }); + + it("throws for empty vectors", () => { + expect(() => normalizedSquareMagnitudeDifference([], [0])).toThrow(); + expect(() => normalizedSquareMagnitudeDifference([0], [])).toThrow(); + }); +}); diff --git a/packages/scripts/src/squareMagnitude.ts b/packages/scripts/src/squareMagnitude.ts new file mode 100644 index 000000000..2fba15dcb --- /dev/null +++ b/packages/scripts/src/squareMagnitude.ts @@ -0,0 +1,45 @@ +import assert from "node:assert/strict"; + +/** + Calculates the square magnitude of an N-dimensional vector. + */ +export const squareMagnitude = (vector: number[]): number => { + return vector.reduce((acc, cur) => acc + Math.pow(cur, 2), 0); +}; + +/** + For vectors where each dimension is in [−1,1], returns a square + magnitude normalized to [0, 1]. + */ +export const normalizedSquareMagnitude = (vector: number[]): number => { + assert( + vector.every((value) => -1 <= value && value <= 1), + "Vector dimension outside [-1, 1] will create meaningless normalization" + ); + assert(vector.length > 0, "Vector must have dimensionality!"); + return squareMagnitude(vector) / vector.length; +}; + +/** + Calculates the difference between square magnitude of an N-dimensional vector. + */ +export const squareMagnitudeDifference = ( + vector0: number[], + vector1: number[] +): number => { + assert(vector0.length === vector1.length, "Vector length mismatch!"); + return Math.abs(squareMagnitude(vector1) - squareMagnitude(vector0)); +}; + +/** + For two vectors where each dimension is in [−1,1], returns the absolute + difference between their square magnitudes normalized to [0, 1]. 
+ */ +export const normalizedSquareMagnitudeDifference = ( + vector0: number[], + vector1: number[] +): number => { + return Math.abs( + normalizedSquareMagnitude(vector0) - normalizedSquareMagnitude(vector1) + ); +}; From 9efb650d34e0b90abac408846771fe369cea52c7 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 24 Jul 2025 14:58:53 -0400 Subject: [PATCH 02/14] WIP --- packages/scripts/package.json | 41 ++++++++++--------- packages/scripts/src/assessRelevance.ts | 15 ++++++- .../scripts/src/main/assessRelevanceMain.ts | 7 +++- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/packages/scripts/package.json b/packages/scripts/package.json index 0388be4af..57bc1d503 100644 --- a/packages/scripts/package.json +++ b/packages/scripts/package.json @@ -5,37 +5,38 @@ "main": "index.js", "private": true, "scripts": { + "assessRelevance": "npm run build && node ./build/main/assessRelevanceMain.js", + "analyzeMessages": "npm run build && node ./build/analyzeMessages.js", + "build": "npm run clean && tsc -b tsconfig.build.json", + "checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js", + "clean": "rm -rf ./build", "createQualityTestsYaml-aug-2023": "npm run build && node ./build/createAug2023QualityTestsYaml.js", "createQualityTestsYaml-sept-2023": "npm run build && node ./build/createSept2023QualityTestsYaml.js", - "scrubMessages": "npm run build && node ./build/scrubMessages.js", - "sampleAndSaveMarkdownPages": "npm run build && node ./build/sampleAndSaveMarkdownPages.js", - "upsertMetaDirective": "npm run build && node ./build/upsertMetaDirective.js", - "analyzeMessages": "npm run build && node ./build/analyzeMessages.js", - "findFaq": "npm run build && node ./build/main/findFaqMain.js", - "upgradeFaqEntries": "npm run build && node ./build/main/upgradeFaqEntriesMain.js", - "materializeScrubbedMessagesStats:all": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --all", - "materializeScrubbedMessagesStats:since": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --since", - "materializeScrubbedMessagesStats:latest": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js", "createView/messages_by_rating": "npm run build && node ./build/createView/messages_by_rating.js", - "createView/top_250_references": "npm run build && node ./build/createView/top_250_references.js", "createView/scrubbed_messages_by_rating": "npm run build && node ./build/createView/scrubbed_messages_by_rating.js", "createView/scrubbed_messages_stats": "npm run build && node ./build/createView/scrubbed_messages_stats.js", "createView/scrubbed_top_250_references": "npm run build && node ./build/createView/scrubbed_top_250_references.js", "createView/scrubbed_topics": "npm run build && node ./build/createView/scrubbed_topics.js", - "removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js", - "getConversationText": "npm run build && node ./build/getConversationText.js", + "createView/top_250_references": "npm run build && node ./build/createView/top_250_references.js", + "findFaq": "npm run build && node ./build/main/findFaqMain.js", "findPageTitles": "npm run build && node ./build/main/findPageTitlesMain.js", + "getConversationText": "npm run build && node ./build/getConversationText.js", + "getLLMAnswers": "npm run build && node ./build/profound/getAndProcessAnswers.js", + "lint:fix": "npm run lint -- --fix && prettier ./src --check --write", + "lint": "eslint ./src --ext ts,tsx,js,jsx 
--report-unused-disable-directives", "listSlackMessages": "npm run build && node ./build/main/listSlackMessagesMain.js", + "materializeScrubbedMessagesStats:all": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --all", + "materializeScrubbedMessagesStats:latest": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js", + "materializeScrubbedMessagesStats:since": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --since", + "release": "release-it", "removeSlackMessage": "npm run build && node ./build/main/removeSlackMessageMain.js", - "checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js", - "getLLMAnswers": "npm run build && node ./build/profound/getAndProcessAnswers.js", - "verifyPagesInSource": "npm run build && node ./build/verifyPagesInSource.js", + "removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js", + "sampleAndSaveMarkdownPages": "npm run build && node ./build/sampleAndSaveMarkdownPages.js", + "scrubMessages": "npm run build && node ./build/scrubMessages.js", "test": "jest --forceExit", - "build": "npm run clean && tsc -b tsconfig.build.json", - "clean": "rm -rf ./build", - "release": "release-it", - "lint": "eslint ./src --ext ts,tsx,js,jsx --report-unused-disable-directives", - "lint:fix": "npm run lint -- --fix && prettier ./src --check --write" + "upgradeFaqEntries": "npm run build && node ./build/main/upgradeFaqEntriesMain.js", + "upsertMetaDirective": "npm run build && node ./build/upsertMetaDirective.js", + "verifyPagesInSource": "npm run build && node ./build/verifyPagesInSource.js" }, "keywords": [], "author": "", diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessRelevance.ts index 1f751d422..998c52abc 100644 --- a/packages/scripts/src/assessRelevance.ts +++ b/packages/scripts/src/assessRelevance.ts @@ -104,18 +104,29 @@ export const assessRelevance = async ({ generate, howMany: 3, }); + const variantCount = Object.values(variants).length; if (variantCount === 0) { throw new Error(`Unexpectedly without variants for ${shortName}!`); } - console.log(`Calculating variant scores for '${shortName}'...`); const scoredVariants = scoreVariants({ original: { prompt, embeddings: promptEmbeddings }, variants, }); assert(variantCount === Object.values(scoredVariants).length); + console.log( + `- Expected: "${expected}" +- Original: "${prompt}" +- Generated variants: +${scoredVariants + .map( + ({ prompt, relevance }) => ` - "${prompt}" (${JSON.stringify(relevance)})` + ) + .join("\n")}` + ); + const average = scoredVariants.reduce((acc, { relevance }) => { const relevanceValues = Object.values(relevance); @@ -126,7 +137,7 @@ export const assessRelevance = async ({ return acc + averageAcrossModels; }, 0) / variantCount; - console.log(`Average score for '${shortName}': ${average}`); + console.log(`Average score: ${average}`); return { prompt_embeddings: promptEmbeddings, generated_prompts: scoredVariants, diff --git a/packages/scripts/src/main/assessRelevanceMain.ts b/packages/scripts/src/main/assessRelevanceMain.ts index 2997693cb..f306e1976 100644 --- a/packages/scripts/src/main/assessRelevanceMain.ts +++ b/packages/scripts/src/main/assessRelevanceMain.ts @@ -14,6 +14,7 @@ const assessRelevanceMain = async () => { OPENAI_ENDPOINT, OPENAI_API_VERSION, OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + CASE_COLLECTION_NAME, } = assertEnvVars({ FROM_CONNECTION_URI: "", FROM_DATABASE_NAME: "", @@ -21,6 +22,7 @@ const assessRelevanceMain = async () => { 
OPENAI_ENDPOINT: "", OPENAI_API_VERSION: "", OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", + CASE_COLLECTION_NAME: "", }); const openAiClient = new AzureOpenAI({ @@ -47,8 +49,11 @@ const assessRelevanceMain = async () => { const client = await MongoClient.connect(FROM_CONNECTION_URI); try { + console.log( + `Fetching unscored cases from ${FROM_DATABASE_NAME}.${CASE_COLLECTION_NAME}...` + ); const db = client.db(FROM_DATABASE_NAME); - const collection = db.collection("llm_cases"); + const collection = db.collection(CASE_COLLECTION_NAME); // Find cases where quality field has not been filled out yet const cases = await collection From 0e9f99845cfef40429aaae5530a1830d2657590a Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 24 Jul 2025 15:22:32 -0400 Subject: [PATCH 03/14] WIP --- packages/scripts/src/assessRelevance.ts | 18 +++++++++++++----- .../scripts/src/main/assessRelevanceMain.ts | 9 +-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessRelevance.ts index 998c52abc..da6479b78 100644 --- a/packages/scripts/src/assessRelevance.ts +++ b/packages/scripts/src/assessRelevance.ts @@ -9,10 +9,6 @@ import { ScoredPromptAndEmbeddings, } from "./Case"; -export const makeShortName = (prompt: string) => { - return prompt.length > 32 ? prompt.slice(0, 29) + "..." : prompt; -}; - /** Given the expected answer, generate a number of possible prompts that could elicit that expected answer. @@ -29,8 +25,13 @@ export const generatePromptsFromExpectedAnswer = async ({ howMany: number; }): Promise => { const variants = await generate({ - prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. Just return the generated question. Expected answer:\n\n${expected}`, + prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. +Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. +Just return the generated question. + +Expected answer:\n\n${expected}`, n: howMany, + temperature: 0.5, }); return await Promise.all( @@ -144,3 +145,10 @@ ${scoredVariants average, }; }; + +export const makeShortName = (prompt: string, ellipsizeAtLength = 64) => { + assert(ellipsizeAtLength > 0); + return prompt.length > ellipsizeAtLength + ? prompt.slice(0, ellipsizeAtLength - 3) + "..." 
+ : prompt; +}; diff --git a/packages/scripts/src/main/assessRelevanceMain.ts b/packages/scripts/src/main/assessRelevanceMain.ts index f306e1976..3a3ac354c 100644 --- a/packages/scripts/src/main/assessRelevanceMain.ts +++ b/packages/scripts/src/main/assessRelevanceMain.ts @@ -54,14 +54,7 @@ const assessRelevanceMain = async () => { ); const db = client.db(FROM_DATABASE_NAME); const collection = db.collection(CASE_COLLECTION_NAME); - - // Find cases where quality field has not been filled out yet - const cases = await collection - .find({ - relevance: { $exists: false }, - }) - .toArray(); - + const cases = await collection.find().toArray(); const relevancePromises = cases.map(async ({ _id, name, expected }) => { const shortName = makeShortName(name); const relevance = await assessRelevance({ From 25632a66926a1c6ef340e4e7431f3b7c94e36998 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 24 Jul 2025 16:30:22 -0400 Subject: [PATCH 04/14] WIP --- packages/scripts/src/Case.ts | 16 ++++- packages/scripts/src/assessRelevance.ts | 59 ++++++++++++------ packages/scripts/src/embedPrompts.ts | 83 ------------------------- 3 files changed, 54 insertions(+), 104 deletions(-) delete mode 100644 packages/scripts/src/embedPrompts.ts diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts index da144e1cb..4a7561134 100644 --- a/packages/scripts/src/Case.ts +++ b/packages/scripts/src/Case.ts @@ -12,10 +12,20 @@ export const PromptAndEmbeddings = z.object({ export type PromptAndEmbeddings = z.infer; +export const RelevanceMetrics = z.object({ + // normalized square magnitude difference (lower = closer = better) + norm_sq_mag_diff: z.number(), + // cosine similarity (are vectors pointing the same way?) [-1, 1] + cos_similarity: z.number(), +}); + +export type RelevanceMetrics = z.infer; + export const ScoredPromptAndEmbeddings = PromptAndEmbeddings.and( z.object({ - // embedding model name -> score - relevance: z.record(z.string(), z.number()), + relevance: + // embedding model name -> score + z.record(z.string(), RelevanceMetrics), }) ); @@ -36,7 +46,7 @@ export type LlmAsJudgment = z.infer; export const Relevance = z.object({ prompt_embeddings: Embeddings, generated_prompts: ScoredPromptAndEmbeddings.array(), - average: z.number(), + averages: RelevanceMetrics, }); export type Relevance = z.infer; diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessRelevance.ts index da6479b78..3f21540f8 100644 --- a/packages/scripts/src/assessRelevance.ts +++ b/packages/scripts/src/assessRelevance.ts @@ -6,8 +6,10 @@ import { Embedder } from "mongodb-rag-core"; import { PromptAndEmbeddings, Relevance, + RelevanceMetrics, ScoredPromptAndEmbeddings, } from "./Case"; +import { cosineSimilarity } from "mongodb-rag-core/aiSdk"; /** Given the expected answer, generate a number of possible prompts that could @@ -55,17 +57,23 @@ export const scoreVariants = ({ ({ embeddings: variantEmbeddings, prompt }): ScoredPromptAndEmbeddings => { const relevance = Object.fromEntries( Object.entries(original.embeddings).map( - ([model, originalEmbedding]) => { + ([model, originalEmbedding]): [string, RelevanceMetrics] => { assert( variantEmbeddings[model] !== undefined, `No embedding for model ${model}!` ); return [ model, - normalizedSquareMagnitudeDifference( - originalEmbedding, - variantEmbeddings[model] - ), + { + cos_similarity: cosineSimilarity( + originalEmbedding, + variantEmbeddings[model] + ), + norm_sq_mag_diff: normalizedSquareMagnitudeDifference( + originalEmbedding, + 
variantEmbeddings[model] + ), + }, ]; } ) @@ -121,28 +129,43 @@ export const assessRelevance = async ({ `- Expected: "${expected}" - Original: "${prompt}" - Generated variants: -${scoredVariants - .map( - ({ prompt, relevance }) => ` - "${prompt}" (${JSON.stringify(relevance)})` - ) - .join("\n")}` +${scoredVariants.map(({ prompt }) => ` - "${prompt}"`).join("\n")}` ); - const average = - scoredVariants.reduce((acc, { relevance }) => { + const summedMetrics = scoredVariants.reduce( + (outer, { relevance }): RelevanceMetrics => { const relevanceValues = Object.values(relevance); const modelCount = relevanceValues.length; assert(modelCount > 0); - const averageAcrossModels = - relevanceValues.reduce((acc, cur) => acc + cur, 0) / modelCount; - return acc + averageAcrossModels; - }, 0) / variantCount; + // Sum metrics across models + const crossModel = relevanceValues.reduce( + (acc, { cos_similarity, norm_sq_mag_diff }) => ({ + cos_similarity: acc.cos_similarity + cos_similarity, + norm_sq_mag_diff: acc.norm_sq_mag_diff + norm_sq_mag_diff, + }), + { cos_similarity: 0, norm_sq_mag_diff: 0 } + ); + + // Accumulate averages across models + return { + cos_similarity: + outer.cos_similarity + crossModel.cos_similarity / modelCount, + norm_sq_mag_diff: + outer.norm_sq_mag_diff + crossModel.norm_sq_mag_diff / modelCount, + }; + }, + { cos_similarity: 0, norm_sq_mag_diff: 0 } + ); + const averages: RelevanceMetrics = { + cos_similarity: summedMetrics.cos_similarity / variantCount, + norm_sq_mag_diff: summedMetrics.norm_sq_mag_diff / variantCount, + }; - console.log(`Average score: ${average}`); + console.log(`Average score: ${JSON.stringify(averages)}`); return { prompt_embeddings: promptEmbeddings, generated_prompts: scoredVariants, - average, + averages, }; }; diff --git a/packages/scripts/src/embedPrompts.ts b/packages/scripts/src/embedPrompts.ts deleted file mode 100644 index 5a37a9fde..000000000 --- a/packages/scripts/src/embedPrompts.ts +++ /dev/null @@ -1,83 +0,0 @@ -import { AzureOpenAI } from "mongodb-rag-core/openai"; -import { assertEnvVars, makeOpenAiEmbedder } from "mongodb-rag-core"; -import { MongoClient } from "mongodb"; -import { Case } from "./Case"; - -import "dotenv/config"; - -/** - Find prompts in llm_cases that don't have embeddings populated yet and populate them. - */ -async function main() { - const { - FROM_CONNECTION_URI, - FROM_DATABASE_NAME, - OPENAI_API_KEY, - OPENAI_ENDPOINT, - OPENAI_API_VERSION, - OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, - } = assertEnvVars({ - FROM_CONNECTION_URI: "", - FROM_DATABASE_NAME: "", - OPENAI_API_KEY: "", - OPENAI_ENDPOINT: "", - OPENAI_API_VERSION: "", - OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", - }); - - const embedder = makeOpenAiEmbedder({ - openAiClient: new AzureOpenAI({ - apiKey: OPENAI_API_KEY, - endpoint: OPENAI_ENDPOINT, - apiVersion: OPENAI_API_VERSION, - }), - deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, - backoffOptions: { - numOfAttempts: 25, - startingDelay: 1000, - }, - }); - - const client = await MongoClient.connect(FROM_CONNECTION_URI); - try { - const db = client.db(FROM_DATABASE_NAME); - const collection = db.collection("llm_cases"); - - const cases = await collection - .find({ prompt_embeddings: undefined }) - .toArray(); - - const embedPromises = cases.map(async ({ name, _id }) => { - const shortName = name.length > 32 ? name.slice(0, 29) + "..." 
: name; - console.log(`Embedding '${shortName}'...`); - const { embedding } = await embedder.embed({ - text: name, - }); - console.log(`Updating '${shortName}'...`); - const updateResult = await collection.updateOne( - { - _id, - }, - { - $set: { - prompt_embeddings: { - [OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT]: embedding, - }, - }, - } - ); - - if (updateResult.modifiedCount === 1) { - console.log(`Updated '${shortName}'.`); - } else { - console.warn(`Failed to update '${shortName}' (${_id})`); - } - }); - - await Promise.allSettled(embedPromises); - } finally { - await client.close(); - } -} - -main(); From 7a318fa0664d123ea8869a8226f441659c2ea9a2 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Fri, 25 Jul 2025 16:57:01 -0400 Subject: [PATCH 05/14] LLM as judge --- packages/scripts/package.json | 2 +- packages/scripts/src/Case.ts | 17 +++--- .../{assessRelevance.ts => assessCases.ts} | 49 ++++++++++++++++ ...essRelevanceMain.ts => assessCasesMain.ts} | 56 +++++++++++-------- packages/scripts/src/main/llmJudgmentMain.ts | 26 --------- 5 files changed, 92 insertions(+), 58 deletions(-) rename packages/scripts/src/{assessRelevance.ts => assessCases.ts} (76%) rename packages/scripts/src/main/{assessRelevanceMain.ts => assessCasesMain.ts} (64%) delete mode 100644 packages/scripts/src/main/llmJudgmentMain.ts diff --git a/packages/scripts/package.json b/packages/scripts/package.json index 57bc1d503..962ffe055 100644 --- a/packages/scripts/package.json +++ b/packages/scripts/package.json @@ -5,7 +5,7 @@ "main": "index.js", "private": true, "scripts": { - "assessRelevance": "npm run build && node ./build/main/assessRelevanceMain.js", + "assessCases": "npm run build && node ./build/main/assessCasesMain.js", "analyzeMessages": "npm run build && node ./build/analyzeMessages.js", "build": "npm run clean && tsc -b tsconfig.build.json", "checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js", diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts index 4a7561134..4237be480 100644 --- a/packages/scripts/src/Case.ts +++ b/packages/scripts/src/Case.ts @@ -33,13 +33,16 @@ export type ScoredPromptAndEmbeddings = z.infer< typeof ScoredPromptAndEmbeddings >; -export const LlmAsJudgment = z.object({ - reasonableness: z.number(), - clarity: z.number(), - specificity: z.number(), - fit: z.number(), - assumption: z.number(), -}); +export const LlmAsJudgment = z + .object({ + reasonableness: z.number(), + clarity: z.number(), + specificity: z.number(), + fit: z.number(), + assumption: z.number(), + guidance: z.string(), + }) + .partial(); export type LlmAsJudgment = z.infer; diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessCases.ts similarity index 76% rename from packages/scripts/src/assessRelevance.ts rename to packages/scripts/src/assessCases.ts index 3f21540f8..2531180e4 100644 --- a/packages/scripts/src/assessRelevance.ts +++ b/packages/scripts/src/assessCases.ts @@ -4,6 +4,7 @@ import { calculateEmbeddings } from "./calculateEmbeddings"; import { SimpleTextGenerator } from "./SimpleTextGenerator"; import { Embedder } from "mongodb-rag-core"; import { + LlmAsJudgment, PromptAndEmbeddings, Relevance, RelevanceMetrics, @@ -169,6 +170,54 @@ ${scoredVariants.map(({ prompt }) => ` - "${prompt}"`).join("\n")}` }; }; +export const rateWithLlm = async ({ + prompt, + expected, + generate, +}: { + prompt: string; + expected: string; + generate: SimpleTextGenerator; +}): Promise => { + const shortName = makeShortName(prompt); + console.log(`Rating 
'${shortName}' with LLM...`); + + const [response] = await generate({ + prompt: ` +Evaluate the quality of the following prompt-expected answer pair across +multiple dimensions. Return your evaluation as a JSON object with numeric scores +from 1 (poor) to 10 (excellent). Return only a JSON object with the following keys: + +- reasonableness (1-10): how reasonable it would be to expect an LLM to produce the given response from the given prompt. +- clarity (1-10): how well formulated and clear the prompt is. +- fit (1-10): how well the expected answer actually matches the prompt. +- assumption (1-10): how much domain-specific knowledge is required to effectively answer the prompt. +- guidance (string, optional): a text string containing detailing the issue and suggesting how to improve. Only include this if the above scores are low. + +Now evaluate this pair, returning only the JSON object: + +PROMPT: ${prompt} +--- +EXPECTED ANSWER: ${expected} +`, + n: 1, + temperature: 0, + }); + + try { + const judgment = LlmAsJudgment.parse(JSON.parse(response)); + console.log(`Judgment of '${shortName}': +${JSON.stringify(judgment, undefined, 2)}`); + } catch (e) { + console.error( + `Failed to parse response "${response}" into LlmAsJudgment: ${ + (e as Error)?.message + }` + ); + return undefined; + } +}; + export const makeShortName = (prompt: string, ellipsizeAtLength = 64) => { assert(ellipsizeAtLength > 0); return prompt.length > ellipsizeAtLength diff --git a/packages/scripts/src/main/assessRelevanceMain.ts b/packages/scripts/src/main/assessCasesMain.ts similarity index 64% rename from packages/scripts/src/main/assessRelevanceMain.ts rename to packages/scripts/src/main/assessCasesMain.ts index 3a3ac354c..d84037914 100644 --- a/packages/scripts/src/main/assessRelevanceMain.ts +++ b/packages/scripts/src/main/assessCasesMain.ts @@ -4,7 +4,7 @@ import { MongoClient } from "mongodb"; import { Case } from "../Case"; import { makeSimpleTextGenerator } from "../SimpleTextGenerator"; import "dotenv/config"; -import { assessRelevance, makeShortName } from "../assessRelevance"; +import { assessRelevance, makeShortName, rateWithLlm } from "../assessCases"; const assessRelevanceMain = async () => { const { @@ -55,32 +55,40 @@ const assessRelevanceMain = async () => { const db = client.db(FROM_DATABASE_NAME); const collection = db.collection(CASE_COLLECTION_NAME); const cases = await collection.find().toArray(); - const relevancePromises = cases.map(async ({ _id, name, expected }) => { - const shortName = makeShortName(name); - const relevance = await assessRelevance({ - prompt: name, - expected, - embedders, - generate, - }); - console.log(`Updating '${shortName}'...`); - const updateResult = await collection.updateOne( - { - _id, - }, - { - $set: { - relevance, + const relevancePromises = cases.map( + async ({ _id, name: prompt, expected }) => { + const shortName = makeShortName(prompt); + const relevance = await assessRelevance({ + prompt, + expected, + embedders, + generate, + }); + const llm_as_judgment = await rateWithLlm({ + prompt, + expected, + generate, + }); + console.log(`Updating '${shortName}'...`); + const updateResult = await collection.updateOne( + { + _id, }, - } - ); + { + $set: { + relevance, + llm_as_judgment, + }, + } + ); - if (updateResult.modifiedCount === 1) { - console.log(`Updated '${shortName}'.`); - } else { - console.warn(`Failed to update '${shortName}' (${_id})`); + if (updateResult.modifiedCount === 1) { + console.log(`Updated '${shortName}'.`); + } else { + 
console.warn(`Failed to update '${shortName}' (${_id})`); + } } - }); + ); await Promise.allSettled(relevancePromises); } finally { diff --git a/packages/scripts/src/main/llmJudgmentMain.ts b/packages/scripts/src/main/llmJudgmentMain.ts deleted file mode 100644 index 2940da75d..000000000 --- a/packages/scripts/src/main/llmJudgmentMain.ts +++ /dev/null @@ -1,26 +0,0 @@ -/* - -Evaluate the quality of the following prompt-expected answer pair across -multiple dimensions. Return your evaluation as a JSON object with numeric scores -from 1 (poor) to 5 (excellent). Use the following keys: - -... - -Now evaluate this pair: - -PROMPT: "Is there a limit for mongodb deletemany" EXPECTED ANSWER: -"db.collection.deleteMany() removes all documents that match the filter from a -collection. - -NOTE: If you are deleting all documents in a large collection, it may be faster -to drop the collection and recreate it. Before dropping the collection, note all -indexes on the collection. You must recreate any indexes that existed in the -original collection. If the original collection was sharded, you must also shard -the recreated collection. - -For more information on dropping a collection, see db.collection.drop()." - -Return only the JSON object. - - -*/ From 822c4cd6f65ad4531e233a0097c2d00cdb80f9c6 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Wed, 30 Jul 2025 20:15:41 -0400 Subject: [PATCH 06/14] WIP --- packages/scripts/src/Case.ts | 1 + packages/scripts/src/assessCases.ts | 16 +++++++++------- packages/scripts/src/main/assessCasesMain.ts | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts index 4237be480..9f9e9d595 100644 --- a/packages/scripts/src/Case.ts +++ b/packages/scripts/src/Case.ts @@ -40,6 +40,7 @@ export const LlmAsJudgment = z specificity: z.number(), fit: z.number(), assumption: z.number(), + impact: z.number(), guidance: z.string(), }) .partial(); diff --git a/packages/scripts/src/assessCases.ts b/packages/scripts/src/assessCases.ts index 2531180e4..0e8db2ca8 100644 --- a/packages/scripts/src/assessCases.ts +++ b/packages/scripts/src/assessCases.ts @@ -185,14 +185,15 @@ export const rateWithLlm = async ({ const [response] = await generate({ prompt: ` Evaluate the quality of the following prompt-expected answer pair across -multiple dimensions. Return your evaluation as a JSON object with numeric scores -from 1 (poor) to 10 (excellent). Return only a JSON object with the following keys: +multiple dimensions. Return your evaluation as a JSON object with numeric percentage scores +from 0 (poor) to 1 (excellent) up to 3 decimal places. Return only a JSON object (NOT IN MARKDOWN) with the following keys: -- reasonableness (1-10): how reasonable it would be to expect an LLM to produce the given response from the given prompt. -- clarity (1-10): how well formulated and clear the prompt is. -- fit (1-10): how well the expected answer actually matches the prompt. -- assumption (1-10): how much domain-specific knowledge is required to effectively answer the prompt. -- guidance (string, optional): a text string containing detailing the issue and suggesting how to improve. Only include this if the above scores are low. +- reasonableness: how reasonable it would be to expect an LLM to produce the given response from the given prompt. +- clarity: how well formulated and clear the prompt is. +- fit: how well the expected answer actually matches the prompt. 
+- assumption: how much domain-specific knowledge is required to effectively answer the prompt. +- impact: the business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad exaples: when was MongoDB founded? +- guidance (string, optional): TERSELY and DIRECTLY detail the issue; suggest how to improve. Only include this if the above scores are low. Now evaluate this pair, returning only the JSON object: @@ -208,6 +209,7 @@ EXPECTED ANSWER: ${expected} const judgment = LlmAsJudgment.parse(JSON.parse(response)); console.log(`Judgment of '${shortName}': ${JSON.stringify(judgment, undefined, 2)}`); + return judgment; } catch (e) { console.error( `Failed to parse response "${response}" into LlmAsJudgment: ${ diff --git a/packages/scripts/src/main/assessCasesMain.ts b/packages/scripts/src/main/assessCasesMain.ts index d84037914..3da4d872f 100644 --- a/packages/scripts/src/main/assessCasesMain.ts +++ b/packages/scripts/src/main/assessCasesMain.ts @@ -50,7 +50,7 @@ const assessRelevanceMain = async () => { const client = await MongoClient.connect(FROM_CONNECTION_URI); try { console.log( - `Fetching unscored cases from ${FROM_DATABASE_NAME}.${CASE_COLLECTION_NAME}...` + `Fetching cases ${FROM_DATABASE_NAME}.${CASE_COLLECTION_NAME}...` ); const db = client.db(FROM_DATABASE_NAME); const collection = db.collection(CASE_COLLECTION_NAME); From a7c2436dbd1faff888ae52a52d710de5a3082fd9 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 13:37:58 -0400 Subject: [PATCH 07/14] WIP --- packages/scripts/src/assessCases.ts | 87 +------------------ .../src/generatePromptsFromExpectedAnswer.ts | 39 +++++++++ packages/scripts/src/generateRating.ts | 45 ++++++++++ packages/scripts/src/main/assessCasesMain.ts | 40 +++++++-- 4 files changed, 118 insertions(+), 93 deletions(-) create mode 100644 packages/scripts/src/generatePromptsFromExpectedAnswer.ts create mode 100644 packages/scripts/src/generateRating.ts diff --git a/packages/scripts/src/assessCases.ts b/packages/scripts/src/assessCases.ts index 0e8db2ca8..55157590e 100644 --- a/packages/scripts/src/assessCases.ts +++ b/packages/scripts/src/assessCases.ts @@ -4,48 +4,13 @@ import { calculateEmbeddings } from "./calculateEmbeddings"; import { SimpleTextGenerator } from "./SimpleTextGenerator"; import { Embedder } from "mongodb-rag-core"; import { - LlmAsJudgment, PromptAndEmbeddings, Relevance, RelevanceMetrics, ScoredPromptAndEmbeddings, } from "./Case"; import { cosineSimilarity } from "mongodb-rag-core/aiSdk"; - -/** - Given the expected answer, generate a number of possible prompts that could - elicit that expected answer. - */ -export const generatePromptsFromExpectedAnswer = async ({ - expected, - embedders, - generate, - howMany, -}: { - expected: string; - embedders: Embedder[]; - generate: SimpleTextGenerator; - howMany: number; -}): Promise => { - const variants = await generate({ - prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. -Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. -Just return the generated question. 
- -Expected answer:\n\n${expected}`, - n: howMany, - temperature: 0.5, - }); - - return await Promise.all( - variants.map(async (text) => { - return { - prompt: text, - embeddings: await calculateEmbeddings({ embedders, text }), - }; - }) - ); -}; +import { generatePromptsFromExpectedAnswer } from "./generatePromptsFromExpectedAnswer"; export const scoreVariants = ({ original, @@ -170,56 +135,6 @@ ${scoredVariants.map(({ prompt }) => ` - "${prompt}"`).join("\n")}` }; }; -export const rateWithLlm = async ({ - prompt, - expected, - generate, -}: { - prompt: string; - expected: string; - generate: SimpleTextGenerator; -}): Promise => { - const shortName = makeShortName(prompt); - console.log(`Rating '${shortName}' with LLM...`); - - const [response] = await generate({ - prompt: ` -Evaluate the quality of the following prompt-expected answer pair across -multiple dimensions. Return your evaluation as a JSON object with numeric percentage scores -from 0 (poor) to 1 (excellent) up to 3 decimal places. Return only a JSON object (NOT IN MARKDOWN) with the following keys: - -- reasonableness: how reasonable it would be to expect an LLM to produce the given response from the given prompt. -- clarity: how well formulated and clear the prompt is. -- fit: how well the expected answer actually matches the prompt. -- assumption: how much domain-specific knowledge is required to effectively answer the prompt. -- impact: the business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad exaples: when was MongoDB founded? -- guidance (string, optional): TERSELY and DIRECTLY detail the issue; suggest how to improve. Only include this if the above scores are low. - -Now evaluate this pair, returning only the JSON object: - -PROMPT: ${prompt} ---- -EXPECTED ANSWER: ${expected} -`, - n: 1, - temperature: 0, - }); - - try { - const judgment = LlmAsJudgment.parse(JSON.parse(response)); - console.log(`Judgment of '${shortName}': -${JSON.stringify(judgment, undefined, 2)}`); - return judgment; - } catch (e) { - console.error( - `Failed to parse response "${response}" into LlmAsJudgment: ${ - (e as Error)?.message - }` - ); - return undefined; - } -}; - export const makeShortName = (prompt: string, ellipsizeAtLength = 64) => { assert(ellipsizeAtLength > 0); return prompt.length > ellipsizeAtLength diff --git a/packages/scripts/src/generatePromptsFromExpectedAnswer.ts b/packages/scripts/src/generatePromptsFromExpectedAnswer.ts new file mode 100644 index 000000000..90b41d355 --- /dev/null +++ b/packages/scripts/src/generatePromptsFromExpectedAnswer.ts @@ -0,0 +1,39 @@ +import { Embedder } from "mongodb-rag-core"; +import { SimpleTextGenerator } from "./SimpleTextGenerator"; +import { PromptAndEmbeddings } from "./Case"; +import { calculateEmbeddings } from "./calculateEmbeddings"; + +/** + Given the expected answer, generate a number of possible prompts that could + elicit that expected answer. + */ +export const generatePromptsFromExpectedAnswer = async ({ + expected, + embedders, + generate, + howMany, +}: { + expected: string; + embedders: Embedder[]; + generate: SimpleTextGenerator; + howMany: number; +}): Promise => { + const variants = await generate({ + prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. +Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. +Just return the generated question. 
+ +Expected answer:\n\n${expected}`, + n: howMany, + temperature: 0.5, + }); + + return await Promise.all( + variants.map(async (text) => { + return { + prompt: text, + embeddings: await calculateEmbeddings({ embedders, text }), + }; + }) + ); +}; diff --git a/packages/scripts/src/generateRating.ts b/packages/scripts/src/generateRating.ts new file mode 100644 index 000000000..4d0c65691 --- /dev/null +++ b/packages/scripts/src/generateRating.ts @@ -0,0 +1,45 @@ +import { generateObject, LanguageModelV1 } from "mongodb-rag-core/aiSdk"; +import { makeShortName } from "./assessCases"; +import { LlmAsJudgment } from "./Case"; + +/** + Given the prompt and expected response pair, use the LLM to assess the quality + on a variety of metrics and provide recommendations for improvement. + */ +export const generateRating = async ({ + prompt, + expectedResponse, + model, +}: { + prompt: string; + expectedResponse: string; + model: LanguageModelV1; +}): Promise => { + const shortName = makeShortName(prompt); + console.log(`Rating '${shortName}' with LLM...`); + + const result = await generateObject({ + prompt: ` +Evaluate the quality of the following prompt-expected answer pair across +multiple dimensions. Return your evaluation as a JSON object with numeric percentage scores +from 1 (poor) to 5 (excellent) up to 3 decimal places. Return only a JSON object (NOT IN MARKDOWN) with the following keys: + +- answer_reasonableness: how reasonable it would be to expect an LLM to produce the given response from the given prompt. +- prompt_clarity: how well formulated and clear the prompt is. +- answer_fit: how well the expected answer actually matches the prompt. +- prompt_knowledge_assumption: how much domain-specific knowledge is required to effectively answer the prompt. +- business_impact: the business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad examples: when was MongoDB founded? +- guidance (string, optional): TERSELY and DIRECTLY detail the issue; suggest how to improve the prompt and/or expected response. Only include this if ANY of the above scores <= 2. 
+ +Now evaluate this pair, returning only the JSON object: + +PROMPT: ${prompt} +--- +EXPECTED ANSWER: ${expectedResponse} +`.trim(), + model, + schema: LlmAsJudgment, + }); + + return result.object; +}; diff --git a/packages/scripts/src/main/assessCasesMain.ts b/packages/scripts/src/main/assessCasesMain.ts index 3da4d872f..041fe6119 100644 --- a/packages/scripts/src/main/assessCasesMain.ts +++ b/packages/scripts/src/main/assessCasesMain.ts @@ -1,10 +1,17 @@ import { AzureOpenAI } from "mongodb-rag-core/openai"; -import { assertEnvVars, makeOpenAiEmbedder } from "mongodb-rag-core"; +import { + assertEnvVars, + BRAINTRUST_ENV_VARS, + makeOpenAiEmbedder, +} from "mongodb-rag-core"; import { MongoClient } from "mongodb"; import { Case } from "../Case"; import { makeSimpleTextGenerator } from "../SimpleTextGenerator"; import "dotenv/config"; -import { assessRelevance, makeShortName, rateWithLlm } from "../assessCases"; +import { assessRelevance, makeShortName } from "../assessCases"; +import { generateRating } from "../generateRating"; +import { models } from "mongodb-rag-core/models"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; const assessRelevanceMain = async () => { const { @@ -15,6 +22,8 @@ const assessRelevanceMain = async () => { OPENAI_API_VERSION, OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, CASE_COLLECTION_NAME, + BRAINTRUST_API_KEY, + BRAINTRUST_ENDPOINT, } = assertEnvVars({ FROM_CONNECTION_URI: "", FROM_DATABASE_NAME: "", @@ -23,6 +32,7 @@ const assessRelevanceMain = async () => { OPENAI_API_VERSION: "", OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", CASE_COLLECTION_NAME: "", + ...BRAINTRUST_ENV_VARS, }); const openAiClient = new AzureOpenAI({ @@ -47,6 +57,20 @@ const assessRelevanceMain = async () => { model: "gpt-4o", }); + const judgmentModel = wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("o3"), + middleware: [], + }); + + const openai = createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat(llmOptions.model, { + structuredOutputs: true, + }); const client = await MongoClient.connect(FROM_CONNECTION_URI); try { console.log( @@ -56,18 +80,20 @@ const assessRelevanceMain = async () => { const collection = db.collection(CASE_COLLECTION_NAME); const cases = await collection.find().toArray(); const relevancePromises = cases.map( - async ({ _id, name: prompt, expected }) => { + async ({ _id, name: prompt, expected: expectedResponse }) => { const shortName = makeShortName(prompt); const relevance = await assessRelevance({ prompt, - expected, + expectedResponse, embedders, generate, }); - const llm_as_judgment = await rateWithLlm({ + + models.find(({ deployment }) => deployment === "o3"); + const llm_as_judgment = await generateRating({ prompt, - expected, - generate, + expectedResponse, + model: judgmentModel, }); console.log(`Updating '${shortName}'...`); const updateResult = await collection.updateOne( From 7d8cefbfcf8cd95e0872fb092ea0bdebc772326e Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 14:31:30 -0400 Subject: [PATCH 08/14] Eval WIP --- packages/scripts/src/Case.ts | 17 +-- packages/scripts/src/assessCases.ts | 8 +- .../src/generatePromptsFromExpectedAnswer.ts | 6 +- packages/scripts/src/generateRating.eval.ts | 75 +++++++++++++ packages/scripts/src/generateRating.ts | 101 ++++++++++++------ packages/scripts/src/main/assessCasesMain.ts | 17 ++- 6 files changed, 157 insertions(+), 67 deletions(-) create mode 100644 
packages/scripts/src/generateRating.eval.ts diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts index 9f9e9d595..8bc3db5d5 100644 --- a/packages/scripts/src/Case.ts +++ b/packages/scripts/src/Case.ts @@ -1,4 +1,5 @@ import z from "zod"; +import { PromptResponseRating } from "./generateRating"; // Map of embedding model name -> vector (array of numbers) export const Embeddings = z.record(z.string(), z.number().array()); @@ -33,20 +34,6 @@ export type ScoredPromptAndEmbeddings = z.infer< typeof ScoredPromptAndEmbeddings >; -export const LlmAsJudgment = z - .object({ - reasonableness: z.number(), - clarity: z.number(), - specificity: z.number(), - fit: z.number(), - assumption: z.number(), - impact: z.number(), - guidance: z.string(), - }) - .partial(); - -export type LlmAsJudgment = z.infer; - export const Relevance = z.object({ prompt_embeddings: Embeddings, generated_prompts: ScoredPromptAndEmbeddings.array(), @@ -70,7 +57,7 @@ export const Case = z.object({ // Fields to add prompt_embeddings: Embeddings.optional(), relevance: Relevance.optional(), - llm_as_judgment: LlmAsJudgment.optional(), + prompt_response_rating: PromptResponseRating.optional(), }); export type Case = z.infer; diff --git a/packages/scripts/src/assessCases.ts b/packages/scripts/src/assessCases.ts index 55157590e..6a8f7e933 100644 --- a/packages/scripts/src/assessCases.ts +++ b/packages/scripts/src/assessCases.ts @@ -56,11 +56,11 @@ export const scoreVariants = ({ export const assessRelevance = async ({ prompt, embedders, - expected, + expectedResponse, generate, }: { prompt: string; - expected: string; + expectedResponse: string; embedders: Embedder[]; generate: SimpleTextGenerator; }): Promise => { @@ -75,7 +75,7 @@ export const assessRelevance = async ({ console.log(`Generating variants for '${shortName}'...`); const variants = await generatePromptsFromExpectedAnswer({ embedders, - expected, + expectedResponse, generate, howMany: 3, }); @@ -92,7 +92,7 @@ export const assessRelevance = async ({ assert(variantCount === Object.values(scoredVariants).length); console.log( - `- Expected: "${expected}" + `- Expected: "${expectedResponse}" - Original: "${prompt}" - Generated variants: ${scoredVariants.map(({ prompt }) => ` - "${prompt}"`).join("\n")}` diff --git a/packages/scripts/src/generatePromptsFromExpectedAnswer.ts b/packages/scripts/src/generatePromptsFromExpectedAnswer.ts index 90b41d355..58e68f888 100644 --- a/packages/scripts/src/generatePromptsFromExpectedAnswer.ts +++ b/packages/scripts/src/generatePromptsFromExpectedAnswer.ts @@ -8,12 +8,12 @@ import { calculateEmbeddings } from "./calculateEmbeddings"; elicit that expected answer. */ export const generatePromptsFromExpectedAnswer = async ({ - expected, + expectedResponse, embedders, generate, howMany, }: { - expected: string; + expectedResponse: string; embedders: Embedder[]; generate: SimpleTextGenerator; howMany: number; @@ -23,7 +23,7 @@ export const generatePromptsFromExpectedAnswer = async ({ Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. Just return the generated question. 
-Expected answer:\n\n${expected}`, +Expected answer:\n\n${expectedResponse}`, n: howMany, temperature: 0.5, }); diff --git a/packages/scripts/src/generateRating.eval.ts b/packages/scripts/src/generateRating.eval.ts new file mode 100644 index 000000000..423bc395a --- /dev/null +++ b/packages/scripts/src/generateRating.eval.ts @@ -0,0 +1,75 @@ +import "dotenv/config"; +import { Eval, BraintrustMiddleware } from "braintrust"; +import { Scorer } from "autoevals"; +import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import { makeGenerateRating, PromptResponseRating } from "./generateRating"; +import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core"; + +const { BRAINTRUST_API_KEY, BRAINTRUST_ENDPOINT } = + assertEnvVars(BRAINTRUST_ENV_VARS); + +interface GenerateRatingEvalCase { + input: { + prompt: string; + expectedResponse: string; + }; + expected: PromptResponseRating; + tags?: MongoDbTag[]; +} + +const evalCases: GenerateRatingEvalCase[] = [ + { + input: { + prompt: "What is the weather?", + expectedResponse: "", + }, + expected: { + answer_fit: 5, + answer_reasonableness: 5, + business_impact: 4, + prompt_clarity: 4, + prompt_knowledge_assumption: 4, + }, + }, +]; + +const CorrectAnswerFit: Scorer = (args) => { + return { + name: "CorrectAnswerFit", + score: args.output.answer_fit === args.expected?.answer_fit ? 1 : 0, + }; +}; + +const model = wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("o3"), + middleware: [BraintrustMiddleware({ debug: true })], +}); + +const generateRating = makeGenerateRating({ + model, +}); + +Eval("generate-prompt-response-rating", { + data: evalCases, + experimentName: "response", + metadata: { + description: + "Evaluates the quality of LLM as judge in rating prompt & expected response pairs.", + model: model.modelId, + }, + maxConcurrency: 10, + async task(input) { + try { + return await generateRating(input); + } catch (error) { + console.error(`Error evaluating input: ${input}`); + console.error(error); + throw error; + } + }, + scores: [CorrectAnswerFit], +}); diff --git a/packages/scripts/src/generateRating.ts b/packages/scripts/src/generateRating.ts index 4d0c65691..38cdf9064 100644 --- a/packages/scripts/src/generateRating.ts +++ b/packages/scripts/src/generateRating.ts @@ -1,45 +1,78 @@ -import { generateObject, LanguageModelV1 } from "mongodb-rag-core/aiSdk"; +import { generateObject, LanguageModel } from "mongodb-rag-core/aiSdk"; import { makeShortName } from "./assessCases"; -import { LlmAsJudgment } from "./Case"; +import z from "zod"; + +const Rating = z.number().min(1).max(5); + +export const PromptResponseRating = z.object({ + answer_reasonableness: Rating.describe( + "How reasonable it would be to expect an LLM to produce the given response from the given prompt." + ), + prompt_clarity: Rating.describe( + "How well-formulated and clear the prompt is." + ), + answer_fit: Rating.describe( + "How well the expected answer actually matches the prompt." + ), + prompt_knowledge_assumption: Rating.describe( + "How much domain-specific knowledge might be required to effectively answer the prompt." + ), + business_impact: Rating.describe( + "The business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad examples: when was MongoDB founded?" 
+  ), // TODO: Clarify this - it is intended to evaluate how useful the prompt actually is - something you can just google? Or something people might actually need help with?
+  guidance: z
+    .string()
+    .optional()
+    .describe(
+      "TERSELY and DIRECTLY detail the issue; suggest how to improve the prompt and/or expected response. Only include this if ANY of the above scores <= 2"
+    ),
+});
+
+export type PromptResponseRating = z.infer<typeof PromptResponseRating>;
+
+const schemaName = "prompt_response_rating";
 
 /**
- Given the prompt and expected response pair, use the LLM to assess the quality
- on a variety of metrics and provide recommendations for improvement.
+ Creates a function that, given the prompt and expected response pair, uses the
+ LLM to assess the quality on a variety of metrics and provide recommendations
+ for improvement.
  */
-export const generateRating = async ({
-  prompt,
-  expectedResponse,
-  model,
-}: {
-  prompt: string;
-  expectedResponse: string;
-  model: LanguageModelV1;
-}): Promise<LlmAsJudgment> => {
-  const shortName = makeShortName(prompt);
-  console.log(`Rating '${shortName}' with LLM...`);
-
-  const result = await generateObject({
-    prompt: `
+export const makeGenerateRating = ({ model }: { model: LanguageModel }) => {
+  return async ({
+    prompt,
+    expectedResponse,
+  }: {
+    prompt: string;
+    expectedResponse: string;
+  }): Promise<PromptResponseRating> => {
+    const shortName = makeShortName(prompt);
+    console.log(`Rating '${shortName}' with LLM...`);
+
+    const result = await generateObject({
+      prompt: `
 Evaluate the quality of the following prompt-expected answer pair across
-multiple dimensions. Return your evaluation as a JSON object with numeric percentage scores
-from 1 (poor) to 5 (excellent) up to 3 decimal places. Return only a JSON object (NOT IN MARKDOWN) with the following keys:
+multiple dimensions. Return your evaluation as a JSON object with numeric scores
+from 1 (poor) to 5 (excellent).
 
-- answer_reasonableness: how reasonable it would be to expect an LLM to produce the given response from the given prompt.
-- prompt_clarity: how well formulated and clear the prompt is.
-- answer_fit: how well the expected answer actually matches the prompt.
-- prompt_knowledge_assumption: how much domain-specific knowledge is required to effectively answer the prompt.
-- business_impact: the business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad examples: when was MongoDB founded?
-- guidance (string, optional): TERSELY and DIRECTLY detail the issue; suggest how to improve the prompt and/or expected response. Only include this if ANY of the above scores <= 2.
+Now evaluate this prompt/expected answer pair:
 
-Now evaluate this pair, returning only the JSON object:
+
+${prompt}
+
+
+${expectedResponse}
+
 
-PROMPT: ${prompt}
----
-EXPECTED ANSWER: ${expectedResponse}
+Format the response in a '${schemaName}' JSON object.
`.trim(), - model, - schema: LlmAsJudgment, - }); + model, + schema: PromptResponseRating, + schemaName, + schemaDescription: "Ratings for prompt response pair.", + }); - return result.object; + return result.object; + }; }; + +export type GenerateRating = ReturnType; diff --git a/packages/scripts/src/main/assessCasesMain.ts b/packages/scripts/src/main/assessCasesMain.ts index 041fe6119..f6f3fdcc9 100644 --- a/packages/scripts/src/main/assessCasesMain.ts +++ b/packages/scripts/src/main/assessCasesMain.ts @@ -9,9 +9,10 @@ import { Case } from "../Case"; import { makeSimpleTextGenerator } from "../SimpleTextGenerator"; import "dotenv/config"; import { assessRelevance, makeShortName } from "../assessCases"; -import { generateRating } from "../generateRating"; +import { makeGenerateRating } from "../generateRating"; import { models } from "mongodb-rag-core/models"; import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import { BraintrustMiddleware } from "mongodb-rag-core/braintrust"; const assessRelevanceMain = async () => { const { @@ -62,15 +63,10 @@ const assessRelevanceMain = async () => { apiKey: BRAINTRUST_API_KEY, baseURL: BRAINTRUST_ENDPOINT, }).chat("o3"), - middleware: [], + middleware: [BraintrustMiddleware({ debug: true })], }); + const generateRating = makeGenerateRating({ model: judgmentModel }); - const openai = createOpenAI({ - apiKey: BRAINTRUST_API_KEY, - baseURL: BRAINTRUST_ENDPOINT, - }).chat(llmOptions.model, { - structuredOutputs: true, - }); const client = await MongoClient.connect(FROM_CONNECTION_URI); try { console.log( @@ -90,10 +86,9 @@ const assessRelevanceMain = async () => { }); models.find(({ deployment }) => deployment === "o3"); - const llm_as_judgment = await generateRating({ + const prompt_response_rating = await generateRating({ prompt, expectedResponse, - model: judgmentModel, }); console.log(`Updating '${shortName}'...`); const updateResult = await collection.updateOne( @@ -103,7 +98,7 @@ const assessRelevanceMain = async () => { { $set: { relevance, - llm_as_judgment, + prompt_response_rating, }, } ); From 0fe833dae8c9e65f56294971a2117a402a2368c1 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 14:36:09 -0400 Subject: [PATCH 09/14] WIP --- packages/scripts/src/generateRating.eval.ts | 6 +++++- packages/scripts/src/generateRating.ts | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/scripts/src/generateRating.eval.ts b/packages/scripts/src/generateRating.eval.ts index 423bc395a..fad37ec14 100644 --- a/packages/scripts/src/generateRating.eval.ts +++ b/packages/scripts/src/generateRating.eval.ts @@ -18,6 +18,7 @@ interface GenerateRatingEvalCase { tags?: MongoDbTag[]; } +// TODO: More eval cases const evalCases: GenerateRatingEvalCase[] = [ { input: { @@ -34,10 +35,13 @@ const evalCases: GenerateRatingEvalCase[] = [ }, ]; +// TODO: More scorers const CorrectAnswerFit: Scorer = (args) => { return { name: "CorrectAnswerFit", - score: args.output.answer_fit === args.expected?.answer_fit ? 1 : 0, + score: + 1 - + Math.abs(args.output.answer_fit - (args.expected?.answer_fit ?? 
0)) / 5, // TODO: normalize }; }; diff --git a/packages/scripts/src/generateRating.ts b/packages/scripts/src/generateRating.ts index 38cdf9064..8fc1e6442 100644 --- a/packages/scripts/src/generateRating.ts +++ b/packages/scripts/src/generateRating.ts @@ -49,6 +49,8 @@ export const makeGenerateRating = ({ model }: { model: LanguageModel }) => { console.log(`Rating '${shortName}' with LLM...`); const result = await generateObject({ + // TODO: TOMAYBEDO: Add more context to 1-5, add more context on different scores... + // - Output chain of thought - helpful for debugging prompt: ` Evaluate the quality of the following prompt-expected answer pair across multiple dimensions. Return your evaluation as a JSON object with numeric scores From 26e57d806ff55894c92f368314b1bc8008542ce7 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 16:58:50 -0400 Subject: [PATCH 10/14] Add other scores --- packages/scripts/src/generateRating.eval.ts | 25 +++++++++++------- packages/scripts/src/generateRating.ts | 29 ++++++++++++++++----- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/packages/scripts/src/generateRating.eval.ts b/packages/scripts/src/generateRating.eval.ts index fad37ec14..d3597841f 100644 --- a/packages/scripts/src/generateRating.eval.ts +++ b/packages/scripts/src/generateRating.eval.ts @@ -35,15 +35,20 @@ const evalCases: GenerateRatingEvalCase[] = [ }, ]; -// TODO: More scorers -const CorrectAnswerFit: Scorer = (args) => { - return { - name: "CorrectAnswerFit", - score: - 1 - - Math.abs(args.output.answer_fit - (args.expected?.answer_fit ?? 0)) / 5, // TODO: normalize - }; -}; +const scores = ( + [ + "answer_fit", + "answer_reasonableness", + "business_impact", + "prompt_clarity", + "prompt_knowledge_assumption", + ] satisfies (keyof PromptResponseRating)[] +).map((key): Scorer => { + return ({ output, expected }) => ({ + name: `correct_${key}`, + score: 1 - Math.abs(output[key] - (expected?.[key] ?? 0)) / 4, + }); +}); const model = wrapLanguageModel({ model: createOpenAI({ @@ -75,5 +80,5 @@ Eval("generate-prompt-response-rating", { throw error; } }, - scores: [CorrectAnswerFit], + scores, }); diff --git a/packages/scripts/src/generateRating.ts b/packages/scripts/src/generateRating.ts index 8fc1e6442..9f9dbeb71 100644 --- a/packages/scripts/src/generateRating.ts +++ b/packages/scripts/src/generateRating.ts @@ -3,29 +3,44 @@ import { makeShortName } from "./assessCases"; import z from "zod"; const Rating = z.number().min(1).max(5); - +const Rationale = (name: string) => + z + .string() + .optional() + .describe( + `A terse and direct chain of thought or rationale for why you gave the ${name} score.` + ); export const PromptResponseRating = z.object({ + answer_fit: Rating.describe( + "How well the expected answer actually matches the prompt." + ), answer_reasonableness: Rating.describe( "How reasonable it would be to expect an LLM to produce the given response from the given prompt." ), + business_impact: Rating.describe( + "The business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad examples: when was MongoDB founded?" + ), // TODO: Clarify this - it is intended to evaluate how useful the prompt actually is - something you can just google? Or something people might actually need help with? prompt_clarity: Rating.describe( "How well-formulated and clear the prompt is." ), - answer_fit: Rating.describe( - "How well the expected answer actually matches the prompt." 
- ), prompt_knowledge_assumption: Rating.describe( "How much domain-specific knowledge might be required to effectively answer the prompt." ), - business_impact: Rating.describe( - "The business impact/relevance of the question and answer. Good examples: competitor questions, technical questions. Bad examples: when was MongoDB founded?" - ), // TODO: Clarify this - it is intended to evaluate how useful the prompt actually is - something you can just google? Or something people might actually need help with? + guidance: z .string() .optional() .describe( "TERSELY and DIRECTLY detail the issue; suggest how to improve the prompt and/or expected response. Only include this if ANY of the above scores <= 2" ), + + answer_fit_rationale: Rationale("answer fit"), + answer_reasonableness_rationale: Rationale("answer reasonableness"), + business_impact_rationale: Rationale("business impact"), + prompt_clarity_rationale: Rationale("prompt clarity"), + prompt_knowledge_assumption_rationale: Rationale( + "prompt knowledge assumption" + ), }); export type PromptResponseRating = z.infer; From da4df5fedef4ddb50ccb081414c40a3c9fcbc06c Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 17:33:11 -0400 Subject: [PATCH 11/14] WIP --- packages/scripts/src/Case.ts | 5 + packages/scripts/src/SimpleTextGenerator.ts | 45 +++--- packages/scripts/src/assessRelevance.eval.ts | 128 ++++++++++++++++++ .../{assessCases.ts => assessRelevance.ts} | 0 packages/scripts/src/generateRating.ts | 2 +- packages/scripts/src/main/assessCasesMain.ts | 14 +- 6 files changed, 159 insertions(+), 35 deletions(-) create mode 100644 packages/scripts/src/assessRelevance.eval.ts rename packages/scripts/src/{assessCases.ts => assessRelevance.ts} (100%) diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts index 8bc3db5d5..df1dc2923 100644 --- a/packages/scripts/src/Case.ts +++ b/packages/scripts/src/Case.ts @@ -13,6 +13,11 @@ export const PromptAndEmbeddings = z.object({ export type PromptAndEmbeddings = z.infer; +/** + Answer relevance: given prompt and expected answer pair, generate N possible + prompts that would elicit that answer, then compare their embeddings with the + embedding of the original prompt. + */ export const RelevanceMetrics = z.object({ // normalized square magnitude difference (lower = closer = better) norm_sq_mag_diff: z.number(), diff --git a/packages/scripts/src/SimpleTextGenerator.ts b/packages/scripts/src/SimpleTextGenerator.ts index 2d05c8c13..e47d0e38d 100644 --- a/packages/scripts/src/SimpleTextGenerator.ts +++ b/packages/scripts/src/SimpleTextGenerator.ts @@ -1,48 +1,35 @@ -import { OpenAI } from "mongodb-rag-core/openai"; +import { generateText, LanguageModel } from "mongodb-rag-core/aiSdk"; export const makeSimpleTextGenerator = ({ - client, model, systemPrompt, }: { - client: OpenAI; - model: string; + model: LanguageModel; systemPrompt?: string; }) => { return async ({ prompt, temperature = 0, - maxTokens = 1500, n = 1, }: { prompt: string; temperature?: number; - maxTokens?: number; + n?: number; }): Promise => { - const messages = [ - { - role: "system", - content: systemPrompt ?? 
"", - }, - { - role: "user", - content: prompt, - }, - ] satisfies OpenAI.ChatCompletionMessageParam[]; - const result = await client.chat.completions.create({ - model, - messages, - temperature, - max_tokens: maxTokens, - n, - }); - return result.choices.map(({ message: { content } }) => { - if (content === null) { - throw new Error(`Failed to generate content!`); - } - return content; - }); + const result = await Promise.all( + Array(n) + .fill(0) + .map(async () => + generateText({ + model, + prompt, + system: systemPrompt, + temperature, + }) + ) + ); + return result.map(({ text }) => text); }; }; diff --git a/packages/scripts/src/assessRelevance.eval.ts b/packages/scripts/src/assessRelevance.eval.ts new file mode 100644 index 000000000..ae1cd12c5 --- /dev/null +++ b/packages/scripts/src/assessRelevance.eval.ts @@ -0,0 +1,128 @@ +import "dotenv/config"; +import { Eval, BraintrustMiddleware } from "braintrust"; +import { Scorer } from "autoevals"; +import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import { + assertEnvVars, + BRAINTRUST_ENV_VARS, + makeOpenAiEmbedder, +} from "mongodb-rag-core"; +import { Relevance } from "./Case"; +import { assessRelevance } from "./assessRelevance"; +import { AzureOpenAI } from "mongodb-rag-core/openai"; +import { makeSimpleTextGenerator } from "./SimpleTextGenerator"; + +const { + OPENAI_API_KEY, + OPENAI_ENDPOINT, + OPENAI_API_VERSION, + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + BRAINTRUST_API_KEY, + BRAINTRUST_ENDPOINT, +} = assertEnvVars({ + OPENAI_API_KEY: "", + OPENAI_ENDPOINT: "", + OPENAI_API_VERSION: "", + OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "", + ...BRAINTRUST_ENV_VARS, +}); + +interface AssessRelevanceEvalCase { + input: { + prompt: string; + expectedResponse: string; + }; + expected: Pick; + tags?: MongoDbTag[]; +} + +// TODO: More eval cases +const evalCases: AssessRelevanceEvalCase[] = [ + { + input: { + prompt: "What is the weather?", + expectedResponse: "", + }, + expected: { + averages: { + cos_similarity: 0.8, + norm_sq_mag_diff: 0.9, + }, + }, + }, +]; + +const cosSimilarityScorer: Scorer, unknown> = ({ + output, + expected, +}) => ({ + name: `closeCosSimilarity`, + score: + expected === undefined + ? 
0 + : 1 - + Math.abs( + output.averages.cos_similarity - expected.averages.cos_similarity + ), +}); + +const model = wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("o3"), + middleware: [BraintrustMiddleware({ debug: true })], +}); + +const openAiClient = new AzureOpenAI({ + apiKey: OPENAI_API_KEY, + endpoint: OPENAI_ENDPOINT, + apiVersion: OPENAI_API_VERSION, +}); + +const embedders = [ + makeOpenAiEmbedder({ + openAiClient, + deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT, + backoffOptions: { + numOfAttempts: 25, + startingDelay: 1000, + }, + }), +]; + +const generate = makeSimpleTextGenerator({ + model: wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("gpt-4.1"), + middleware: [BraintrustMiddleware({ debug: true })], + }), +}); + +Eval("assess-prompt-response-relevance", { + data: evalCases, + experimentName: "assess-relevance", + metadata: { + description: "Evaluates assessRelevance().", + model: model.modelId, + }, + maxConcurrency: 10, + async task({ prompt, expectedResponse }) { + try { + return await assessRelevance({ + prompt, + expectedResponse, + embedders, + generate, + }); + } catch (error) { + console.error(`Error evaluating input: ${prompt} - ${expectedResponse}`); + console.error(error); + throw error; + } + }, + scores: [cosSimilarityScorer], +}); diff --git a/packages/scripts/src/assessCases.ts b/packages/scripts/src/assessRelevance.ts similarity index 100% rename from packages/scripts/src/assessCases.ts rename to packages/scripts/src/assessRelevance.ts diff --git a/packages/scripts/src/generateRating.ts b/packages/scripts/src/generateRating.ts index 9f9dbeb71..909eb8562 100644 --- a/packages/scripts/src/generateRating.ts +++ b/packages/scripts/src/generateRating.ts @@ -1,5 +1,5 @@ import { generateObject, LanguageModel } from "mongodb-rag-core/aiSdk"; -import { makeShortName } from "./assessCases"; +import { makeShortName } from "./assessRelevance"; import z from "zod"; const Rating = z.number().min(1).max(5); diff --git a/packages/scripts/src/main/assessCasesMain.ts b/packages/scripts/src/main/assessCasesMain.ts index f6f3fdcc9..85bc4a6b2 100644 --- a/packages/scripts/src/main/assessCasesMain.ts +++ b/packages/scripts/src/main/assessCasesMain.ts @@ -8,9 +8,8 @@ import { MongoClient } from "mongodb"; import { Case } from "../Case"; import { makeSimpleTextGenerator } from "../SimpleTextGenerator"; import "dotenv/config"; -import { assessRelevance, makeShortName } from "../assessCases"; +import { assessRelevance, makeShortName } from "../assessRelevance"; import { makeGenerateRating } from "../generateRating"; -import { models } from "mongodb-rag-core/models"; import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; import { BraintrustMiddleware } from "mongodb-rag-core/braintrust"; @@ -54,8 +53,13 @@ const assessRelevanceMain = async () => { ]; const generate = makeSimpleTextGenerator({ - client: openAiClient, - model: "gpt-4o", + model: wrapLanguageModel({ + model: createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, + }).chat("gpt-4.1"), + middleware: [BraintrustMiddleware({ debug: true })], + }), }); const judgmentModel = wrapLanguageModel({ @@ -78,6 +82,7 @@ const assessRelevanceMain = async () => { const relevancePromises = cases.map( async ({ _id, name: prompt, expected: expectedResponse }) => { const shortName = makeShortName(prompt); + const relevance = await assessRelevance({ prompt, 
expectedResponse, @@ -85,7 +90,6 @@ const assessRelevanceMain = async () => { generate, }); - models.find(({ deployment }) => deployment === "o3"); const prompt_response_rating = await generateRating({ prompt, expectedResponse, From 17abf7781819c98ba58967849404586b742f3197 Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 17:52:41 -0400 Subject: [PATCH 12/14] Add evals --- packages/scripts/src/assessRelevance.eval.ts | 25 ++++- packages/scripts/src/generateRating.eval.ts | 100 +++++++++++++++++-- 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/packages/scripts/src/assessRelevance.eval.ts b/packages/scripts/src/assessRelevance.eval.ts index ae1cd12c5..1c72b93c5 100644 --- a/packages/scripts/src/assessRelevance.eval.ts +++ b/packages/scripts/src/assessRelevance.eval.ts @@ -37,17 +37,32 @@ interface AssessRelevanceEvalCase { tags?: MongoDbTag[]; } -// TODO: More eval cases const evalCases: AssessRelevanceEvalCase[] = [ { + // High similarity input: { - prompt: "What is the weather?", - expectedResponse: "", + prompt: "When to use $pull and $push mongodb", + expectedResponse: + "Use the $pull operator when you want to remove a value or values that match specific conditions from an existing array. \nUse the $push operator when you want to add a specific value to an array. ", }, expected: { averages: { - cos_similarity: 0.8, - norm_sq_mag_diff: 0.9, + cos_similarity: 0.9639615103046141, + norm_sq_mag_diff: 1.876484720646915e-11, + }, + }, + }, + { + // Low similarity + input: { + prompt: "give me an example of how to use the $and operator", + expectedResponse: + "The following example returns inventory documents where the price is greater than 25 and the quantity is less than 20:\n\ndb.inventory.find( {\n $and: [\n { price: { $gt: 25 } },\n { quantity: { $lt: 20 } }\n ]\n} )", + }, + expected: { + averages: { + cos_similarity: 0.3442199438560915, + norm_sq_mag_diff: 1.0454893515591396e-10, }, }, }, diff --git a/packages/scripts/src/generateRating.eval.ts b/packages/scripts/src/generateRating.eval.ts index d3597841f..ddf556061 100644 --- a/packages/scripts/src/generateRating.eval.ts +++ b/packages/scripts/src/generateRating.eval.ts @@ -18,19 +18,105 @@ interface GenerateRatingEvalCase { tags?: MongoDbTag[]; } -// TODO: More eval cases const evalCases: GenerateRatingEvalCase[] = [ { input: { - prompt: "What is the weather?", - expectedResponse: "", + prompt: "Is there a limit for mongodb deletemany", + expectedResponse: + "db.collection.deleteMany() removes all documents that match the filter from a collection.\n\nNOTE: If you are deleting all documents in a large collection, it may be faster to drop the collection and recreate it. Before dropping the collection, note all indexes on the collection. You must recreate any indexes that existed in the original collection. If the original collection was sharded, you must also shard the recreated collection.\n\nFor more information on dropping a collection, see db.collection.drop().", }, expected: { - answer_fit: 5, - answer_reasonableness: 5, - business_impact: 4, + answer_fit: 3, + answer_reasonableness: 4, + business_impact: 3, + prompt_clarity: 3, + prompt_knowledge_assumption: 2, + answer_fit_rationale: + "Prompt asks specifically about a possible limit on deleteMany; expected answer implicitly says it removes all matching docs and advises using drop for large collections but never explicitly states whether a limit exists. 
Partially addresses question but not directly or definitively.", + answer_reasonableness_rationale: + "It is reasonable for an LLM to mention that deleteMany removes all matching documents and suggest dropping the collection for large-scale deletes; that is common guidance in MongoDB docs.", + business_impact_rationale: + "Understanding deleteMany behavior is moderately important for applications that manage large datasets, but not highly strategic.", + prompt_clarity_rationale: + "Prompt is understandable but contains a typo and lacks context or detail, making it only moderately clear.", + prompt_knowledge_assumption_rationale: + "Requires only basic MongoDB CRUD knowledge; little domain depth needed.", + }, + }, + { + input: { + prompt: "What are best practices for mongodb pagination", + expectedResponse: + "The following principles are best practices for paginating your Atlas Search query results:\n- Sort queries by unique fields to prevent ties in relevance scores.\n- If you want to primarily sort by a field that is not unique, add an additional sort clause on a unique field to act as a tiebreaker.\n- Sort your query results by an immutable field to ensure that your results reflect updates made between queries. \n", + }, + expected: { + answer_fit: 2, + answer_reasonableness: 3, + business_impact: 3, + prompt_clarity: 4, + prompt_knowledge_assumption: 3, + guidance: + "Answer omits several widely-recognized pagination practices (e.g., avoiding large skip, using range queries or ‘after’ cursors, leveraging _id for seek-based pagination). Either broaden the expected answer or narrow the prompt to “Atlas Search pagination sorting best practices.”", + answer_fit_rationale: + "Prompt asks for general MongoDB pagination best practices, but expected answer covers only Atlas Search sorting guidance; many core practices are missing.", + answer_reasonableness_rationale: + "An LLM could plausibly reply with the three listed bullets, but would usually add other common tips; partial but not implausible.", + business_impact_rationale: + "Pagination efficiency affects app performance; guidance has moderate practical value.", + prompt_clarity_rationale: + "Question is concise and unambiguous, though it doesn’t specify Atlas Search scope, causing mismatch.", + prompt_knowledge_assumption_rationale: + "Requires some database and MongoDB operational knowledge, but not deep specialist expertise.", + }, + }, + { + input: { + prompt: "How to use unset field in array mongodb", + expectedResponse: + 'To use the $unset operator on a field that contains an array, use the update method with a filter to identify the specific value in the array that you want to unset and the $ operator to unset that value. \n\n```\ndb.grades.insertMany([{user: "A", grades: [90, 30, 40]}, {user: "B", grades: [30, 70, 60]}])\n\ndb.grades.updateMany({grades: 30}, {$unset: {"grades.$": 1}})\n```\n\nWhen the operation is complete, the value that matches the filter in the array is changed to null.\n', + }, + expected: { + answer_fit: 1, + answer_reasonableness: 3, + business_impact: 3, + prompt_clarity: 3, + prompt_knowledge_assumption: 3, + guidance: + 'The answer is technically incorrect: $unset cannot be combined with the positional $ operator. To null-out an array element you must reference it by index (e.g., {$unset:{"grades.0":1}}) or use $set with the positional operator. 
Clarify the prompt (“How do I unset/remove a single element inside an array in MongoDB?”) and ensure the expected answer reflects MongoDB’s documented behavior.', + answer_fit_rationale: + '$unset with "grades.$" is invalid in MongoDB; the operation shown would fail. Therefore the answer does not correctly satisfy the prompt.', + answer_reasonableness_rationale: + "Given the vague prompt, an LLM might guess the misuse of $unset and $; although wrong, it’s a plausible but not authoritative response.", + business_impact_rationale: + "Array updates are common developer tasks; wrong guidance could cause wasted time but not critical business failure.", + prompt_clarity_rationale: + "Prompt is understandable but ungrammatical; intent is clear enough.", + prompt_knowledge_assumption_rationale: + "Requires basic MongoDB update knowledge; moderate domain specificity.", + }, + }, + { + input: { + prompt: "How can i combine vector search with lexical search?", + expectedResponse: "[TO FILL] update when $rankFusion syntax is released", + }, + expected: { + answer_fit: 1, + answer_reasonableness: 1, + business_impact: 3, prompt_clarity: 4, - prompt_knowledge_assumption: 4, + prompt_knowledge_assumption: 3, + guidance: + "The expected answer is missing, so scores for fit and reasonableness are 1. Provide a complete answer explaining concrete techniques (e.g., hybrid search, result fusion, the upcoming $rankFusion syntax) and practical examples. Clarify any prerequisites, such as the search engine or framework in use, to let respondents tailor detailed guidance.", + answer_fit_rationale: + "No substantive content; expected answer is a placeholder.", + answer_reasonableness_rationale: + "An LLM cannot derive a full answer from the placeholder; unreasonable to expect a correct output.", + business_impact_rationale: + "Hybrid search guidance is valuable for many search applications, but impact is undermined by missing answer.", + prompt_clarity_rationale: "Direct, concise question; understandable.", + prompt_knowledge_assumption_rationale: + "Assumes familiarity with search paradigms but not excessively specialized.", }, }, ]; From 372ee47e7e3dcfec6c01d0475b5dbec3c4e37e9e Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 17:57:39 -0400 Subject: [PATCH 13/14] Update docs --- packages/scripts/src/Case.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/scripts/src/Case.ts b/packages/scripts/src/Case.ts index df1dc2923..5f2521473 100644 --- a/packages/scripts/src/Case.ts +++ b/packages/scripts/src/Case.ts @@ -19,9 +19,15 @@ export type PromptAndEmbeddings = z.infer; embedding of the original prompt. */ export const RelevanceMetrics = z.object({ - // normalized square magnitude difference (lower = closer = better) + /** + Normalized square magnitude difference. Lower = closer = better. This gives + an idea of how close the vectors are to each other in their N-dimensional + space, but doesn't seem to work as well as cos_similarity. + */ norm_sq_mag_diff: z.number(), - // cosine similarity (are vectors pointing the same way?) [-1, 1] + /** + Cosine similarity: are vectors pointing the same way? Range [-1, 1]. 
+ */ cos_similarity: z.number(), }); From df8da1f13335809502fb980b5537387be74a125f Mon Sep 17 00:00:00 2001 From: Chris Bush Date: Thu, 31 Jul 2025 18:01:59 -0400 Subject: [PATCH 14/14] Move dependent generate to assessRelevance --- packages/scripts/src/assessRelevance.ts | 36 ++++++++++++++++- .../src/generatePromptsFromExpectedAnswer.ts | 39 ------------------- 2 files changed, 35 insertions(+), 40 deletions(-) delete mode 100644 packages/scripts/src/generatePromptsFromExpectedAnswer.ts diff --git a/packages/scripts/src/assessRelevance.ts b/packages/scripts/src/assessRelevance.ts index 6a8f7e933..4ecd42f96 100644 --- a/packages/scripts/src/assessRelevance.ts +++ b/packages/scripts/src/assessRelevance.ts @@ -10,7 +10,41 @@ import { ScoredPromptAndEmbeddings, } from "./Case"; import { cosineSimilarity } from "mongodb-rag-core/aiSdk"; -import { generatePromptsFromExpectedAnswer } from "./generatePromptsFromExpectedAnswer"; + +/** + Given the expected answer, generate a number of possible prompts that could + elicit that expected answer. + */ +export const generatePromptsFromExpectedAnswer = async ({ + expectedResponse, + embedders, + generate, + howMany, +}: { + expectedResponse: string; + embedders: Embedder[]; + generate: SimpleTextGenerator; + howMany: number; +}): Promise => { + const variants = await generate({ + prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. +Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. +Just return the generated question. + +Expected answer:\n\n${expectedResponse}`, + n: howMany, + temperature: 0.5, + }); + + return await Promise.all( + variants.map(async (text) => { + return { + prompt: text, + embeddings: await calculateEmbeddings({ embedders, text }), + }; + }) + ); +}; export const scoreVariants = ({ original, diff --git a/packages/scripts/src/generatePromptsFromExpectedAnswer.ts b/packages/scripts/src/generatePromptsFromExpectedAnswer.ts deleted file mode 100644 index 58e68f888..000000000 --- a/packages/scripts/src/generatePromptsFromExpectedAnswer.ts +++ /dev/null @@ -1,39 +0,0 @@ -import { Embedder } from "mongodb-rag-core"; -import { SimpleTextGenerator } from "./SimpleTextGenerator"; -import { PromptAndEmbeddings } from "./Case"; -import { calculateEmbeddings } from "./calculateEmbeddings"; - -/** - Given the expected answer, generate a number of possible prompts that could - elicit that expected answer. - */ -export const generatePromptsFromExpectedAnswer = async ({ - expectedResponse, - embedders, - generate, - howMany, -}: { - expectedResponse: string; - embedders: Embedder[]; - generate: SimpleTextGenerator; - howMany: number; -}): Promise => { - const variants = await generate({ - prompt: `Given the following "expected answer", formulate a question that is likely to elicit the expected answer. -Don't necessarily use proper grammar or punctuation; write like a user of a chatbot, search engine, or LLM would. -Just return the generated question. - -Expected answer:\n\n${expectedResponse}`, - n: howMany, - temperature: 0.5, - }); - - return await Promise.all( - variants.map(async (text) => { - return { - prompt: text, - embeddings: await calculateEmbeddings({ embedders, text }), - }; - }) - ); -};
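
The two RelevanceMetrics fields documented in Case.ts above come down to a pair of vector comparisons between the original prompt's embedding and each generated variant's embedding. A minimal, self-contained TypeScript sketch of both metrics follows; it is illustrative only, not the implementation used in this patch series (the real code relies on cosineSimilarity from mongodb-rag-core/aiSdk), and the normalization used in normSqMagDiff below is an assumption.

// Illustrative sketch only (not part of the patches above). Assumes plain
// number[] embedding vectors of equal length.

// cos_similarity: are the two vectors pointing the same way? Range [-1, 1].
function cosSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let magA = 0;
  let magB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    magA += a[i] * a[i];
    magB += b[i] * b[i];
  }
  return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}

// norm_sq_mag_diff: difference of squared magnitudes, scaled here to [0, 1] by
// dividing by their sum (lower = closer). The exact normalization in the repo
// may differ; this form is an assumption for illustration.
function normSqMagDiff(a: number[], b: number[]): number {
  const sqMag = (v: number[]) => v.reduce((sum, x) => sum + x * x, 0);
  const sqA = sqMag(a);
  const sqB = sqMag(b);
  return Math.abs(sqA - sqB) / (sqA + sqB);
}

// Hypothetical usage, mirroring how scoreVariants() compares the original
// prompt's embedding against one generated variant's embedding per model:
const originalEmbedding = [0.12, -0.53, 0.8];
const variantEmbedding = [0.1, -0.5, 0.82];
console.log({
  cos_similarity: cosSimilarity(originalEmbedding, variantEmbedding),
  norm_sq_mag_diff: normSqMagDiff(originalEmbedding, variantEmbedding),
});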