Skip to content

Commit 01ffc88

Browse files
committed
Better time inputs and evals
1 parent e476838 commit 01ffc88

File tree

3 files changed

+171
-16
lines changed

3 files changed

+171
-16
lines changed

apps/webapp/app/v3/services/aiRunFilterService.server.ts

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,20 @@ import { z } from "zod";
55
import { TaskRunListSearchFilters } from "~/components/runs/v3/RunFilters";
66
import { logger } from "~/services/logger.server";
77

8+
const AIFilters = TaskRunListSearchFilters.omit({
9+
environments: true,
10+
from: true,
11+
to: true,
12+
}).extend({
13+
from: z.string().optional().describe("The ISO datetime to filter from"),
14+
to: z.string().optional().describe("The ISO datetime to filter to"),
15+
});
16+
817
const AIFilterResponseSchema = z
918
.discriminatedUnion("success", [
1019
z.object({
1120
success: z.literal(true),
12-
filters: TaskRunListSearchFilters.omit({ environments: true }),
21+
filters: AIFilters,
1322
}),
1423
z.object({
1524
success: z.literal(false),
@@ -137,7 +146,7 @@ export class AIRunFilterService {
137146
Available filter options:
138147
- statuses: Array of run statuses (PENDING, EXECUTING, COMPLETED_SUCCESSFULLY, COMPLETED_WITH_ERRORS, CANCELED, TIMED_OUT, CRASHED, etc.)
139148
- period: Time period string (e.g., "1h", "7d", "30d", "1y")
140-
- from/to: Unix ms timestamps for specific time ranges. You'll need to use a converter if they give you a date. Today's date is ${new Date().toISOString()}, if they only specify a day use the current month. If they don't specify a year use the current year. If they don't specify a time of day use midnight to midnight.
149+
- from/to: ISO date string. Today's date is ${new Date().toISOString()}, if they only specify a day use the current month. If they don't specify a year use the current year. If they don't specify a time of day use midnight.
141150
- tags: Array of tag names to filter by. Use the lookupTags tool to get the tags.
142151
- tasks: Array of task identifiers to filter by. Use the lookupTasks tool to get the tasks.
143152
- machines: Array of machine presets (micro, small, small-2x, medium, large, xlarge, etc.)
@@ -148,6 +157,7 @@ export class AIRunFilterService {
148157
- batchId: Specific batch ID to filter by
149158
- scheduleId: Specific schedule ID to filter by
150159
160+
151161
Common patterns to recognize:
152162
- "failed runs" → statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"].
153163
- "runs not dequeued yet" → statuses: ["PENDING", "PENDING_VERSION", "DELAYED"]
@@ -158,6 +168,7 @@ export class AIRunFilterService {
158168
- "past 7 days" → period: "7d"
159169
- "last hour" → period: "1h"
160170
- "this month" → period: "30d"
171+
- "June 16" -> return a from/to filter.
161172
- "with tag X" → tags: ["X"]
162173
- "from task Y" → tasks: ["Y"]
163174
- "using large machine" → machines: ["large-1x", "large-2x"]
@@ -197,13 +208,6 @@ export class AIRunFilterService {
197208
},
198209
});
199210

200-
// Add debugging to see what the AI returned
201-
logger.info("AI filter response", {
202-
text,
203-
environmentId,
204-
result: result.experimental_output,
205-
});
206-
207211
if (!result.experimental_output.success) {
208212
return {
209213
success: false,
@@ -212,9 +216,7 @@ export class AIRunFilterService {
212216
}
213217

214218
// Validate the filters against the schema to catch any issues
215-
const validationResult = TaskRunListSearchFilters.omit({ environments: true }).safeParse(
216-
result.experimental_output.filters
217-
);
219+
const validationResult = AIFilters.safeParse(result.experimental_output.filters);
218220
if (!validationResult.success) {
219221
logger.error("AI filter validation failed", {
220222
errors: validationResult.error.errors,
@@ -229,7 +231,13 @@ export class AIRunFilterService {
229231

230232
return {
231233
success: true,
232-
filters: validationResult.data,
234+
filters: {
235+
...validationResult.data,
236+
from: validationResult.data.from
237+
? new Date(validationResult.data.from).getTime()
238+
: undefined,
239+
to: validationResult.data.to ? new Date(validationResult.data.to).getTime() : undefined,
240+
},
233241
};
234242
} catch (error) {
235243
logger.error("AI filter processing failed", {

apps/webapp/evals/aiRunFilter.eval.ts

Lines changed: 149 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,8 +45,8 @@ const queryTasks: QueryTasks = {
4545
query: async () => {
4646
return {
4747
tasks: [
48-
{ slug: "task1", triggerSource: "STANDARD" },
49-
{ slug: "task2", triggerSource: "SCHEDULED" },
48+
{ slug: "email-sender", triggerSource: "STANDARD" },
49+
{ slug: "email-sender-scheduled", triggerSource: "SCHEDULED" },
5050
],
5151
};
5252
},
@@ -55,6 +55,7 @@ const queryTasks: QueryTasks = {
5555
evalite("AI Run Filter", {
5656
data: async () => {
5757
return [
58+
// Basic status filtering
5859
{
5960
input: "Completed runs",
6061
expected: JSON.stringify({
@@ -82,6 +83,152 @@ evalite("AI Run Filter", {
8283
},
8384
}),
8485
},
86+
// Time filters
87+
{
88+
input: "Runs from the past 7 days",
89+
expected: JSON.stringify({
90+
success: true,
91+
filters: {
92+
period: "7d",
93+
},
94+
}),
95+
},
96+
{
97+
input: "Runs from the last hour",
98+
expected: JSON.stringify({
99+
success: true,
100+
filters: {
101+
period: "1h",
102+
},
103+
}),
104+
},
105+
{
106+
input: "Runs from this month",
107+
expected: JSON.stringify({
108+
success: true,
109+
filters: {
110+
period: "30d",
111+
},
112+
}),
113+
},
114+
{
115+
input: "June 16",
116+
expected: JSON.stringify({
117+
success: true,
118+
filters: {
119+
from: new Date("2025-06-16").getTime(),
120+
to: new Date("2025-06-17").getTime(),
121+
},
122+
}),
123+
},
124+
// Combined filters
125+
{
126+
input: "Failed runs from the past week",
127+
expected: JSON.stringify({
128+
success: true,
129+
filters: {
130+
statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
131+
period: "7d",
132+
},
133+
}),
134+
},
135+
{
136+
input: "Successful runs from the last 24 hours",
137+
expected: JSON.stringify({
138+
success: true,
139+
filters: {
140+
statuses: ["COMPLETED_SUCCESSFULLY"],
141+
period: "1d",
142+
},
143+
}),
144+
},
145+
// Root-only filtering
146+
{
147+
input: "Root runs only",
148+
expected: JSON.stringify({
149+
success: true,
150+
filters: {
151+
rootOnly: true,
152+
},
153+
}),
154+
},
155+
{
156+
input: "Failed root runs from yesterday",
157+
expected: JSON.stringify({
158+
success: true,
159+
filters: {
160+
statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
161+
rootOnly: true,
162+
period: "1d",
163+
},
164+
}),
165+
},
166+
// Machine filtering
167+
{
168+
input: "Runs using large machines",
169+
expected: JSON.stringify({
170+
success: true,
171+
filters: {
172+
machines: ["large-1x", "large-2x"],
173+
},
174+
}),
175+
},
176+
// Edge cases and error handling
177+
{
178+
input: "Runs with tag production",
179+
expected: JSON.stringify({
180+
success: true,
181+
filters: {
182+
tags: ["production"],
183+
},
184+
}),
185+
},
186+
{
187+
input: "Runs from task email-sender",
188+
expected: JSON.stringify({
189+
success: true,
190+
filters: {
191+
tasks: ["email-sender"],
192+
},
193+
}),
194+
},
195+
{
196+
input: "Runs in the shared queue",
197+
expected: JSON.stringify({
198+
success: true,
199+
filters: {
200+
queues: ["shared"],
201+
},
202+
}),
203+
},
204+
// Complex combinations
205+
{
206+
input: "Failed production runs from the past 3 days using large machines",
207+
expected: JSON.stringify({
208+
success: true,
209+
filters: {
210+
statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
211+
tags: ["production"],
212+
period: "3d",
213+
machines: ["large-1x", "large-2x"],
214+
},
215+
}),
216+
},
217+
// Ambiguous cases that should return errors
218+
{
219+
input: "Show me something",
220+
expected: JSON.stringify({
221+
success: false,
222+
error: "Unclear what to filter",
223+
}),
224+
},
225+
{
226+
input: "Runs with unknown status",
227+
expected: JSON.stringify({
228+
success: false,
229+
error: "Unknown status specified",
230+
}),
231+
},
85232
];
86233
},
87234
task: async (input) => {

apps/webapp/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -279,4 +279,4 @@
279279
"engines": {
280280
"node": ">=16.0.0"
281281
}
282-
}
282+
}

0 commit comments

Comments
 (0)