-
Notifications
You must be signed in to change notification settings - Fork 75
Implement case analysis script #844
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
12139ea
Case analysis wip
cbush 9efb650
WIP
cbush 0e9f998
WIP
cbush 25632a6
WIP
cbush 7a318fa
LLM as judge
cbush 822c4cd
WIP
cbush a7c2436
WIP
cbush ff92704
Merge remote-tracking branch 'origin/main' into case-analysis
cbush 7d8cefb
Eval WIP
cbush 0fe833d
WIP
cbush 26e57d8
Add other scores
cbush da4df5f
WIP
cbush 17abf77
Add evals
cbush 372ee47
Update docs
cbush df8da1f
Move dependent generate to assessRelevance
cbush File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import z from "zod"; | ||
import { PromptResponseRating } from "./generateRating"; | ||
|
||
// Map of embedding model name -> vector (array of numbers) | ||
export const Embeddings = z.record(z.string(), z.number().array()); | ||
|
||
export type Embeddings = z.infer<typeof Embeddings>; | ||
|
||
/**
  A prompt paired with its embeddings across one or more embedding models.
 */
export const PromptAndEmbeddings = z.object({
  // The prompt text that was embedded.
  prompt: z.string(),
  // embedding model name -> vector for `prompt`
  embeddings: Embeddings,
});

export type PromptAndEmbeddings = z.infer<typeof PromptAndEmbeddings>;
|
||
/**
  Answer relevance: given prompt and expected answer pair, generate N possible
  prompts that would elicit that answer, then compare their embeddings with the
  embedding of the original prompt.
 */
export const RelevanceMetrics = z.object({
  /**
    Normalized square magnitude difference. Lower = closer = better. This gives
    an idea of how close the vectors are to each other in their N-dimensional
    space, but doesn't seem to work as well as cos_similarity.
   */
  norm_sq_mag_diff: z.number(),
  /**
    Cosine similarity: are vectors pointing the same way? Range [-1, 1].
    Higher = more similar = better.
   */
  cos_similarity: z.number(),
});

export type RelevanceMetrics = z.infer<typeof RelevanceMetrics>;
|
||
export const ScoredPromptAndEmbeddings = PromptAndEmbeddings.and( | ||
z.object({ | ||
relevance: | ||
// embedding model name -> score | ||
z.record(z.string(), RelevanceMetrics), | ||
}) | ||
); | ||
|
||
export type ScoredPromptAndEmbeddings = z.infer< | ||
typeof ScoredPromptAndEmbeddings | ||
>; | ||
|
||
/**
  Full relevance assessment for one prompt/expected-answer pair: the original
  prompt's embeddings, the generated alternative prompts with their scores,
  and the metric averages across all generated prompts.
 */
export const Relevance = z.object({
  // Embeddings of the original prompt, keyed by embedding model name.
  prompt_embeddings: Embeddings,
  // Prompts generated to elicit the expected answer, each scored against the
  // original prompt's embeddings.
  generated_prompts: ScoredPromptAndEmbeddings.array(),
  // Averages of the metrics over all generated prompts.
  averages: RelevanceMetrics,
});

export type Relevance = z.infer<typeof Relevance>;
|
||
/**
  A single analysis case: a chat-style prompt with its expected answer, plus
  optional fields populated later by the analysis pipeline.
 */
export const Case = z.object({
  // Case category (free-form string).
  type: z.string(),
  tags: z.string().array(),
  name: z.string(),
  // Chat-style messages (role + content) that make up the prompt.
  prompt: z
    .object({
      content: z.string(),
      role: z.string(),
    })
    .array(),
  // The expected answer to the prompt.
  expected: z.string(),

  // Fields to add (populated by the analysis pipeline; absent on raw cases).
  prompt_embeddings: Embeddings.optional(),
  relevance: Relevance.optional(),
  prompt_response_rating: PromptResponseRating.optional(),
});

export type Case = z.infer<typeof Case>;
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import { generateText, LanguageModel } from "mongodb-rag-core/aiSdk"; | ||
|
||
export const makeSimpleTextGenerator = ({ | ||
model, | ||
systemPrompt, | ||
}: { | ||
model: LanguageModel; | ||
systemPrompt?: string; | ||
}) => { | ||
return async ({ | ||
prompt, | ||
temperature = 0, | ||
n = 1, | ||
}: { | ||
prompt: string; | ||
temperature?: number; | ||
|
||
n?: number; | ||
}): Promise<string[]> => { | ||
const result = await Promise.all( | ||
Array(n) | ||
.fill(0) | ||
.map(async () => | ||
generateText({ | ||
model, | ||
prompt, | ||
system: systemPrompt, | ||
temperature, | ||
}) | ||
) | ||
); | ||
return result.map(({ text }) => text); | ||
}; | ||
}; | ||
|
||
export type SimpleTextGenerator = ReturnType<typeof makeSimpleTextGenerator>; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
import "dotenv/config"; | ||
import { Eval, BraintrustMiddleware } from "braintrust"; | ||
import { Scorer } from "autoevals"; | ||
import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata"; | ||
import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; | ||
import { | ||
assertEnvVars, | ||
BRAINTRUST_ENV_VARS, | ||
makeOpenAiEmbedder, | ||
} from "mongodb-rag-core"; | ||
import { Relevance } from "./Case"; | ||
import { assessRelevance } from "./assessRelevance"; | ||
import { AzureOpenAI } from "mongodb-rag-core/openai"; | ||
import { makeSimpleTextGenerator } from "./SimpleTextGenerator"; | ||
|
||
// Validate and pull required configuration from the environment at startup;
// assertEnvVars throws if any variable is missing.
const {
  OPENAI_API_KEY,
  OPENAI_ENDPOINT,
  OPENAI_API_VERSION,
  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
  BRAINTRUST_API_KEY,
  BRAINTRUST_ENDPOINT,
} = assertEnvVars({
  OPENAI_API_KEY: "",
  OPENAI_ENDPOINT: "",
  OPENAI_API_VERSION: "",
  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "",
  ...BRAINTRUST_ENV_VARS,
});
|
||
/**
  One eval case for assessRelevance(): a prompt/expected-response pair and the
  relevance averages we expect the assessment to produce.
 */
interface AssessRelevanceEvalCase {
  input: {
    prompt: string;
    expectedResponse: string;
  };
  // Only the averages are compared; the per-prompt details are not asserted.
  expected: Pick<Relevance, "averages">;
  tags?: MongoDbTag[];
}
|
||
// Reference cases with previously observed metric values; the scorer measures
// how close a new run's averages come to these.
const evalCases: AssessRelevanceEvalCase[] = [
  {
    // High similarity
    input: {
      prompt: "When to use $pull and $push mongodb",
      expectedResponse:
        "Use the $pull operator when you want to remove a value or values that match specific conditions from an existing array. \nUse the $push operator when you want to add a specific value to an array. ",
    },
    expected: {
      averages: {
        cos_similarity: 0.9639615103046141,
        norm_sq_mag_diff: 1.876484720646915e-11,
      },
    },
  },
  {
    // Low similarity
    input: {
      prompt: "give me an example of how to use the $and operator",
      expectedResponse:
        "The following example returns inventory documents where the price is greater than 25 and the quantity is less than 20:\n\ndb.inventory.find( {\n $and: [\n { price: { $gt: 25 } },\n { quantity: { $lt: 20 } }\n ]\n} )",
    },
    expected: {
      averages: {
        cos_similarity: 0.3442199438560915,
        norm_sq_mag_diff: 1.0454893515591396e-10,
      },
    },
  },
];
|
||
const cosSimilarityScorer: Scorer<Pick<Relevance, "averages">, unknown> = ({ | ||
output, | ||
expected, | ||
}) => ({ | ||
name: `closeCosSimilarity`, | ||
score: | ||
expected === undefined | ||
? 0 | ||
: 1 - | ||
Math.abs( | ||
output.averages.cos_similarity - expected.averages.cos_similarity | ||
), | ||
}); | ||
|
||
// Judge/analysis model, routed through the Braintrust AI proxy with tracing
// middleware enabled.
const model = wrapLanguageModel({
  model: createOpenAI({
    apiKey: BRAINTRUST_API_KEY,
    baseURL: BRAINTRUST_ENDPOINT,
  }).chat("o3"),
  middleware: [BraintrustMiddleware({ debug: true })],
});

// Azure OpenAI client used for computing embeddings.
const openAiClient = new AzureOpenAI({
  apiKey: OPENAI_API_KEY,
  endpoint: OPENAI_ENDPOINT,
  apiVersion: OPENAI_API_VERSION,
});

// Embedders to score relevance with; retries with backoff to ride out rate
// limits.
const embedders = [
  makeOpenAiEmbedder({
    openAiClient,
    deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
    backoffOptions: {
      numOfAttempts: 25,
      startingDelay: 1000,
    },
  }),
];

// Text generator used to produce the alternative prompts, also routed through
// the Braintrust proxy (note: a different model than the judge above).
const generate = makeSimpleTextGenerator({
  model: wrapLanguageModel({
    model: createOpenAI({
      apiKey: BRAINTRUST_API_KEY,
      baseURL: BRAINTRUST_ENDPOINT,
    }).chat("gpt-4.1"),
    middleware: [BraintrustMiddleware({ debug: true })],
  }),
});
|
||
// Braintrust eval entry point: runs assessRelevance() over each case and
// scores how close the averages land to the reference values.
Eval("assess-prompt-response-relevance", {
  data: evalCases,
  experimentName: "assess-relevance",
  metadata: {
    description: "Evaluates assessRelevance().",
    model: model.modelId,
  },
  maxConcurrency: 10,
  async task({ prompt, expectedResponse }) {
    try {
      return await assessRelevance({
        prompt,
        expectedResponse,
        embedders,
        generate,
      });
    } catch (error) {
      // Log which case failed before rethrowing so Braintrust records the
      // error against the experiment.
      console.error(`Error evaluating input: ${prompt} - ${expectedResponse}`);
      console.error(error);
      throw error;
    }
  },
  scores: [cosSimilarityScorer],
});
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
alphabetized