Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 21 additions & 20 deletions packages/scripts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,38 @@
"main": "index.js",
"private": true,
"scripts": {
"assessCases": "npm run build && node ./build/main/assessCasesMain.js",
"analyzeMessages": "npm run build && node ./build/analyzeMessages.js",
"build": "npm run clean && tsc -b tsconfig.build.json",
"checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js",
"clean": "rm -rf ./build",
"createQualityTestsYaml-aug-2023": "npm run build && node ./build/createAug2023QualityTestsYaml.js",
"createQualityTestsYaml-sept-2023": "npm run build && node ./build/createSept2023QualityTestsYaml.js",
"scrubMessages": "npm run build && node ./build/scrubMessages.js",
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alphabetized

"sampleAndSaveMarkdownPages": "npm run build && node ./build/sampleAndSaveMarkdownPages.js",
"upsertMetaDirective": "npm run build && node ./build/upsertMetaDirective.js",
"analyzeMessages": "npm run build && node ./build/analyzeMessages.js",
"findFaq": "npm run build && node ./build/main/findFaqMain.js",
"upgradeFaqEntries": "npm run build && node ./build/main/upgradeFaqEntriesMain.js",
"materializeScrubbedMessagesStats:all": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --all",
"materializeScrubbedMessagesStats:since": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --since",
"materializeScrubbedMessagesStats:latest": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js",
"createView/messages_by_rating": "npm run build && node ./build/createView/messages_by_rating.js",
"createView/top_250_references": "npm run build && node ./build/createView/top_250_references.js",
"createView/scrubbed_messages_by_rating": "npm run build && node ./build/createView/scrubbed_messages_by_rating.js",
"createView/scrubbed_messages_stats": "npm run build && node ./build/createView/scrubbed_messages_stats.js",
"createView/scrubbed_top_250_references": "npm run build && node ./build/createView/scrubbed_top_250_references.js",
"createView/scrubbed_topics": "npm run build && node ./build/createView/scrubbed_topics.js",
"removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js",
"getConversationText": "npm run build && node ./build/getConversationText.js",
"createView/top_250_references": "npm run build && node ./build/createView/top_250_references.js",
"findFaq": "npm run build && node ./build/main/findFaqMain.js",
"findPageTitles": "npm run build && node ./build/main/findPageTitlesMain.js",
"getConversationText": "npm run build && node ./build/getConversationText.js",
"getLLMAnswers": "npm run build && node ./build/profound/getAndProcessAnswers.js",
"lint:fix": "npm run lint -- --fix && prettier ./src --check --write",
"lint": "eslint ./src --ext ts,tsx,js,jsx --report-unused-disable-directives",
"listSlackMessages": "npm run build && node ./build/main/listSlackMessagesMain.js",
"materializeScrubbedMessagesStats:all": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --all",
"materializeScrubbedMessagesStats:latest": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js",
"materializeScrubbedMessagesStats:since": "npm run build && node ./build/main/materializeScrubbedMessagesStats.js --since",
"release": "release-it",
"removeSlackMessage": "npm run build && node ./build/main/removeSlackMessageMain.js",
"checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js",
"getLLMAnswers": "npm run build && node ./build/profound/getAndProcessAnswers.js",
"verifyPagesInSource": "npm run build && node ./build/verifyPagesInSource.js",
"removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js",
"sampleAndSaveMarkdownPages": "npm run build && node ./build/sampleAndSaveMarkdownPages.js",
"scrubMessages": "npm run build && node ./build/scrubMessages.js",
"test": "jest --forceExit",
"build": "npm run clean && tsc -b tsconfig.build.json",
"clean": "rm -rf ./build",
"release": "release-it",
"lint": "eslint ./src --ext ts,tsx,js,jsx --report-unused-disable-directives",
"lint:fix": "npm run lint -- --fix && prettier ./src --check --write"
"upgradeFaqEntries": "npm run build && node ./build/main/upgradeFaqEntriesMain.js",
"upsertMetaDirective": "npm run build && node ./build/upsertMetaDirective.js",
"verifyPagesInSource": "npm run build && node ./build/verifyPagesInSource.js"
},
"keywords": [],
"author": "",
Expand Down
74 changes: 74 additions & 0 deletions packages/scripts/src/Case.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import z from "zod";
import { PromptResponseRating } from "./generateRating";

/**
 Maps an embedding model name to that model's vector for a given text
 (a plain array of numbers).
 */
export const Embeddings = z.record(z.string(), z.array(z.number()));

export type Embeddings = z.infer<typeof Embeddings>;

/**
 A prompt string paired with its embeddings across one or more models.
 */
export const PromptAndEmbeddings = z.object({
  // The raw prompt text.
  prompt: z.string(),
  // Per-model embedding vectors for the prompt.
  embeddings: Embeddings,
});

export type PromptAndEmbeddings = z.infer<typeof PromptAndEmbeddings>;

/**
 Answer relevance metrics: given a prompt and expected answer pair, generate N
 possible prompts that would elicit that answer, then compare their embeddings
 with the embedding of the original prompt using these metrics.
 */
export const RelevanceMetrics = z.object({
  /**
  Normalized square magnitude difference between the two vectors. Lower =
  closer = better. Gives an idea of how close the vectors are to each other in
  their N-dimensional space, but empirically doesn't work as well as
  cos_similarity.
  */
  norm_sq_mag_diff: z.number(),
  /**
  Cosine similarity: are the vectors pointing the same way? Range [-1, 1],
  where 1 means identical direction.
  */
  cos_similarity: z.number(),
});

export type RelevanceMetrics = z.infer<typeof RelevanceMetrics>;

/**
 A generated prompt with its embeddings plus its relevance metrics relative to
 the original prompt, keyed by embedding model name.

 Uses `.extend()` rather than `.and()` so the schema stays a ZodObject
 (supports `.pick()`/`.omit()`/further `.extend()` and gives clearer parse
 errors) while validating the same shape as the intersection did.
 */
export const ScoredPromptAndEmbeddings = PromptAndEmbeddings.extend({
  // embedding model name -> score
  relevance: z.record(z.string(), RelevanceMetrics),
});

export type ScoredPromptAndEmbeddings = z.infer<
  typeof ScoredPromptAndEmbeddings
>;

/**
 Full relevance assessment for one prompt/expected-answer pair: the original
 prompt's embeddings, every generated prompt with its scores, and the metrics
 averaged across all generated prompts.
 */
export const Relevance = z.object({
  prompt_embeddings: Embeddings,
  generated_prompts: z.array(ScoredPromptAndEmbeddings),
  averages: RelevanceMetrics,
});

export type Relevance = z.infer<typeof Relevance>;

/**
 A single assessment case: a conversation-style prompt, the expected answer,
 and optional derived assessment fields.
 */
export const Case = z.object({
  type: z.string(),
  tags: z.array(z.string()),
  name: z.string(),
  // Conversation messages that make up the prompt.
  prompt: z.array(
    z.object({
      content: z.string(),
      role: z.string(),
    })
  ),
  expected: z.string(),

  // Fields to add — populated later by the assessment scripts.
  prompt_embeddings: Embeddings.optional(),
  relevance: Relevance.optional(),
  prompt_response_rating: PromptResponseRating.optional(),
});

export type Case = z.infer<typeof Case>;
36 changes: 36 additions & 0 deletions packages/scripts/src/SimpleTextGenerator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { generateText, LanguageModel } from "mongodb-rag-core/aiSdk";

/**
 Builds a simple text generator bound to a language model and an optional
 system prompt. The returned function requests `n` independent completions
 for a prompt in parallel and resolves to their texts.
 */
export const makeSimpleTextGenerator = ({
  model,
  systemPrompt,
}: {
  model: LanguageModel;
  systemPrompt?: string;
}) => {
  return async ({
    prompt,
    temperature = 0,
    n = 1,
  }: {
    prompt: string;
    temperature?: number;
    n?: number;
  }): Promise<string[]> => {
    // Fire off all n generation requests concurrently.
    const completions = await Promise.all(
      Array.from({ length: n }, () =>
        generateText({
          model,
          prompt,
          system: systemPrompt,
          temperature,
        })
      )
    );
    return completions.map((completion) => completion.text);
  };
};

export type SimpleTextGenerator = ReturnType<typeof makeSimpleTextGenerator>;
143 changes: 143 additions & 0 deletions packages/scripts/src/assessRelevance.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import "dotenv/config";
import { Eval, BraintrustMiddleware } from "braintrust";
import { Scorer } from "autoevals";
import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata";
import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk";
import {
assertEnvVars,
BRAINTRUST_ENV_VARS,
makeOpenAiEmbedder,
} from "mongodb-rag-core";
import { Relevance } from "./Case";
import { assessRelevance } from "./assessRelevance";
import { AzureOpenAI } from "mongodb-rag-core/openai";
import { makeSimpleTextGenerator } from "./SimpleTextGenerator";

// Environment variables this eval requires; assertEnvVars throws with a
// descriptive error if any of them is missing.
const REQUIRED_ENV_VARS = {
  OPENAI_API_KEY: "",
  OPENAI_ENDPOINT: "",
  OPENAI_API_VERSION: "",
  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "",
  ...BRAINTRUST_ENV_VARS,
};

const {
  OPENAI_API_KEY,
  OPENAI_ENDPOINT,
  OPENAI_API_VERSION,
  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
  BRAINTRUST_API_KEY,
  BRAINTRUST_ENDPOINT,
} = assertEnvVars(REQUIRED_ENV_VARS);

/**
 Shape of one eval case for the relevance assessment.
 */
interface AssessRelevanceEvalCase {
  /** The original user prompt and the response it is expected to elicit. */
  input: {
    prompt: string;
    expectedResponse: string;
  };
  /** Only the averaged relevance metrics are compared against the output. */
  expected: Pick<Relevance, "averages">;
  /** Optional MongoDB metadata tags for the case. */
  tags?: MongoDbTag[];
}

// Reference cases with previously observed average metrics. The scorer
// rewards outputs whose average cos_similarity lands close to these values.
// NOTE(review): the expected floats look like values recorded from a prior
// run — confirm they remain stable across embedding model versions.
const evalCases: AssessRelevanceEvalCase[] = [
  {
    // High similarity: prompt and expected answer are about the same topic.
    input: {
      prompt: "When to use $pull and $push mongodb",
      expectedResponse:
        "Use the $pull operator when you want to remove a value or values that match specific conditions from an existing array. \nUse the $push operator when you want to add a specific value to an array. ",
    },
    expected: {
      averages: {
        cos_similarity: 0.9639615103046141,
        norm_sq_mag_diff: 1.876484720646915e-11,
      },
    },
  },
  {
    // Low similarity: expected answer is a concrete code example, which
    // embeds far from the short natural-language prompt.
    input: {
      prompt: "give me an example of how to use the $and operator",
      expectedResponse:
        "The following example returns inventory documents where the price is greater than 25 and the quantity is less than 20:\n\ndb.inventory.find( {\n $and: [\n { price: { $gt: 25 } },\n { quantity: { $lt: 20 } }\n ]\n} )",
    },
    expected: {
      averages: {
        cos_similarity: 0.3442199438560915,
        norm_sq_mag_diff: 1.0454893515591396e-10,
      },
    },
  },
];

/**
 Scores how close the output's average cosine similarity is to the expected
 value: 1 when identical, decreasing linearly with the absolute difference.
 Because cos_similarity is in [-1, 1], the raw difference can reach 2, which
 would make `1 - diff` negative — scores are expected to be in [0, 1], so the
 result is clamped at 0. Returns 0 when no expected value is provided.
 */
const cosSimilarityScorer: Scorer<Pick<Relevance, "averages">, unknown> = ({
  output,
  expected,
}) => ({
  name: "closeCosSimilarity",
  score:
    expected === undefined
      ? 0
      : Math.max(
          0,
          1 -
            Math.abs(
              output.averages.cos_similarity - expected.averages.cos_similarity
            )
        ),
});

// Single Braintrust-proxied OpenAI provider shared by both language models
// (previously this configuration was duplicated for each model).
const braintrustOpenAi = createOpenAI({
  apiKey: BRAINTRUST_API_KEY,
  baseURL: BRAINTRUST_ENDPOINT,
});

// Model recorded in the experiment metadata.
const model = wrapLanguageModel({
  model: braintrustOpenAi.chat("o3"),
  middleware: [BraintrustMiddleware({ debug: true })],
});

// Azure OpenAI client used for computing embeddings.
const openAiClient = new AzureOpenAI({
  apiKey: OPENAI_API_KEY,
  endpoint: OPENAI_ENDPOINT,
  apiVersion: OPENAI_API_VERSION,
});

// Embedders whose vectors feed the relevance metrics. Retries with backoff
// to ride out rate limiting.
const embedders = [
  makeOpenAiEmbedder({
    openAiClient,
    deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
    backoffOptions: {
      numOfAttempts: 25,
      startingDelay: 1000,
    },
  }),
];

// Generator that produces candidate prompts for the relevance assessment.
const generate = makeSimpleTextGenerator({
  model: wrapLanguageModel({
    model: braintrustOpenAi.chat("gpt-4.1"),
    middleware: [BraintrustMiddleware({ debug: true })],
  }),
});

// Register the Braintrust eval. Each case runs assessRelevance() on the
// prompt/expectedResponse pair and is scored on how close its average cosine
// similarity lands to the recorded expected value.
Eval("assess-prompt-response-relevance", {
  data: evalCases,
  experimentName: "assess-relevance",
  metadata: {
    description: "Evaluates assessRelevance().",
    model: model.modelId,
  },
  // Run up to 10 cases in flight at once.
  maxConcurrency: 10,
  async task({ prompt, expectedResponse }) {
    try {
      return await assessRelevance({
        prompt,
        expectedResponse,
        embedders,
        generate,
      });
    } catch (error) {
      // Log the failing input before rethrowing so the failure is easy to
      // attribute in the eval output.
      console.error(`Error evaluating input: ${prompt} - ${expectedResponse}`);
      console.error(error);
      throw error;
    }
  },
  scores: [cosSimilarityScorer],
});
Loading