Skip to content

Commit 80b0a62

Browse files
committed
Added basic evals
1 parent 2ece703 commit 80b0a62

File tree

5 files changed

+692
-62
lines changed

5 files changed

+692
-62
lines changed

apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.ai-filter.tsx

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ import { VersionListPresenter } from "~/presenters/v3/VersionListPresenter.serve
1919
import { TaskListPresenter } from "~/presenters/v3/TaskListPresenter.server";
2020
import { getAllTaskIdentifiers } from "~/models/task.server";
2121
import { $replica } from "~/db.server";
22+
import { env } from "~/env.server";
2223

2324
const RequestSchema = z.object({
2425
text: z.string().min(1),
@@ -130,14 +131,21 @@ export async function action({ request, params }: ActionFunctionArgs) {
130131
},
131132
};
132133

134+
if (!env.OPENAI_API_KEY) {
135+
return {
136+
success: false,
137+
error: "OpenAI API key is not configured",
138+
};
139+
}
140+
133141
const service = new AIRunFilterService({
134142
queryTags,
135143
queryVersions,
136144
queryQueues,
137145
queryTasks,
138146
});
139147

140-
const [error, result] = await tryCatch(service.call(text, environment));
148+
const [error, result] = await tryCatch(service.call(text, environment.id));
141149
if (error) {
142150
return json({ success: false, error: error.message }, { status: 400 });
143151
}

apps/webapp/app/v3/services/aiRunFilterService.server.ts

Lines changed: 4 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -3,18 +3,13 @@ import { type TaskTriggerSource } from "@trigger.dev/database";
33
import { generateText, Output, tool } from "ai";
44
import { z } from "zod";
55
import { TaskRunListSearchFilters } from "~/components/runs/v3/RunFilters";
6-
import { env } from "~/env.server";
7-
import { type AuthenticatedEnvironment } from "~/services/apiAuth.server";
86
import { logger } from "~/services/logger.server";
97

108
const AIFilterResponseSchema = z
119
.discriminatedUnion("success", [
1210
z.object({
1311
success: z.literal(true),
1412
filters: TaskRunListSearchFilters.omit({ environments: true }),
15-
explanation: z
16-
.string()
17-
.describe("A short human-readable explanation of what filters were applied"),
1813
}),
1914
z.object({
2015
success: z.literal(false),
@@ -62,7 +57,6 @@ export type AIFilterResult =
6257
| {
6358
success: true;
6459
filters: TaskRunListSearchFilters;
65-
explanation: string;
6660
}
6761
| {
6862
success: false;
@@ -79,14 +73,7 @@ export class AIRunFilterService {
7973
}
8074
) {}
8175

82-
async call(text: string, environment: AuthenticatedEnvironment): Promise<AIFilterResult> {
83-
if (!env.OPENAI_API_KEY) {
84-
return {
85-
success: false,
86-
error: "OpenAI API key is not configured",
87-
};
88-
}
89-
76+
async call(text: string, environmentId: string): Promise<AIFilterResult> {
9077
try {
9178
const result = await generateText({
9279
model: openai("gpt-4o-mini"),
@@ -205,17 +192,15 @@ export class AIRunFilterService {
205192
experimental_telemetry: {
206193
isEnabled: true,
207194
metadata: {
208-
environmentId: environment.id,
209-
projectId: environment.projectId,
210-
organizationId: environment.organizationId,
195+
environmentId,
211196
},
212197
},
213198
});
214199

215200
// Add debugging to see what the AI returned
216201
logger.info("AI filter response", {
217202
text,
218-
environmentId: environment.id,
203+
environmentId,
219204
result: result.experimental_output,
220205
});
221206

@@ -245,14 +230,13 @@ export class AIRunFilterService {
245230
return {
246231
success: true,
247232
filters: validationResult.data,
248-
explanation: result.experimental_output.explanation,
249233
};
250234
} catch (error) {
251235
logger.error("AI filter processing failed", {
252236
error,
253237
errorMessage: error instanceof Error ? error.message : String(error),
254238
text,
255-
environmentId: environment.id,
239+
environmentId,
256240
});
257241

258242
// If it's a schema validation error, provide more specific feedback

apps/webapp/evals/aiRunFilter.eval.ts

Lines changed: 99 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,99 @@
1+
import { evalite } from "evalite";
2+
import { Levenshtein } from "autoevals";
3+
import {
4+
AIRunFilterService,
5+
type QueryQueues,
6+
type QueryTags,
7+
type QueryTasks,
8+
type QueryVersions,
9+
} from "~/v3/services/aiRunFilterService.server";
10+
import dotenv from "dotenv";
11+
12+
dotenv.config({ path: "../../.env" });
13+
14+
const queryTags: QueryTags = {
15+
query: async (search) => {
16+
return {
17+
tags: ["user_1", "user_2", "org_1", "org_2"],
18+
};
19+
},
20+
};
21+
22+
const queryVersions: QueryVersions = {
23+
query: async (versionPrefix, isCurrent) => {
24+
if (isCurrent) {
25+
return {
26+
version: "20250721.1",
27+
};
28+
}
29+
30+
return {
31+
versions: ["20250721.1", "20250720.2", "20250720.1"],
32+
};
33+
},
34+
};
35+
36+
const queryQueues: QueryQueues = {
37+
query: async (query, type) => {
38+
return {
39+
queues: ["shared", "paid"],
40+
};
41+
},
42+
};
43+
44+
const queryTasks: QueryTasks = {
45+
query: async () => {
46+
return {
47+
tasks: [
48+
{ slug: "task1", triggerSource: "STANDARD" },
49+
{ slug: "task2", triggerSource: "SCHEDULED" },
50+
],
51+
};
52+
},
53+
};
54+
55+
evalite("AI Run Filter", {
56+
data: async () => {
57+
return [
58+
{
59+
input: "Completed runs",
60+
expected: JSON.stringify({
61+
success: true,
62+
filters: {
63+
statuses: ["COMPLETED_SUCCESSFULLY"],
64+
},
65+
}),
66+
},
67+
{
68+
input: "Failed runs",
69+
expected: JSON.stringify({
70+
success: true,
71+
filters: {
72+
statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
73+
},
74+
}),
75+
},
76+
{
77+
input: "Executing runs",
78+
expected: JSON.stringify({
79+
success: true,
80+
filters: {
81+
statuses: ["EXECUTING", "RETRYING_AFTER_FAILURE", "WAITING_TO_RESUME"],
82+
},
83+
}),
84+
},
85+
];
86+
},
87+
task: async (input) => {
88+
const service = new AIRunFilterService({
89+
queryTags,
90+
queryVersions,
91+
queryQueues,
92+
queryTasks,
93+
});
94+
95+
const result = await service.call(input, "123456");
96+
return JSON.stringify(result);
97+
},
98+
scorers: [Levenshtein],
99+
});

apps/webapp/package.json

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,8 @@
2323
"clean:sourcemaps": "run-s clean:sourcemaps:*",
2424
"clean:sourcemaps:public": "rimraf ./build/**/*.map",
2525
"clean:sourcemaps:build": "rimraf ./public/build/**/*.map",
26-
"test": "vitest --no-file-parallelism"
26+
"test": "vitest --no-file-parallelism",
27+
"eval:dev": "evalite watch"
2728
},
2829
"eslintIgnore": [
2930
"/node_modules",
@@ -248,6 +249,7 @@
248249
"@types/ws": "^8.5.3",
249250
"@typescript-eslint/eslint-plugin": "^5.59.6",
250251
"@typescript-eslint/parser": "^5.59.6",
252+
"autoevals": "^0.0.130",
251253
"autoprefixer": "^10.4.13",
252254
"css-loader": "^6.10.0",
253255
"datepicker": "link:@types/@react-aria/datepicker",
@@ -258,6 +260,7 @@
258260
"eslint-plugin-import": "^2.29.1",
259261
"eslint-plugin-react-hooks": "^4.6.2",
260262
"eslint-plugin-turbo": "^2.0.4",
263+
"evalite": "^0.11.4",
261264
"npm-run-all": "^4.1.5",
262265
"postcss-import": "^16.0.1",
263266
"postcss-loader": "^8.1.1",

0 commit comments

Comments
 (0)