Better time inputs and evals

matt-aitken · matt-aitken · commit 01ffc88cbdcc · 2025-07-21T14:07:43.000+01:00
diff --git a/apps/webapp/app/v3/services/aiRunFilterService.server.ts b/apps/webapp/app/v3/services/aiRunFilterService.server.ts
@@ -5,11 +5,20 @@ import { z } from "zod";
 import { TaskRunListSearchFilters } from "~/components/runs/v3/RunFilters";
 import { logger } from "~/services/logger.server";
 
+const AIFilters = TaskRunListSearchFilters.omit({ // Filter schema for the AI output: the run-list filters minus the keys omitted here.
+  environments: true, // removed from the AI-facing schema
+  from: true, // removed here, re-declared as a string in .extend() below
+  to: true, // removed here, re-declared as a string in .extend() below
+}).extend({
+  from: z.string().optional().describe("The ISO datetime to filter from"), // optional string; turned into a number via new Date(...).getTime() before being returned
+  to: z.string().optional().describe("The ISO datetime to filter to"), // NOTE(review): z.string() does not enforce ISO format; an unparseable value becomes NaN after getTime() — confirm this is acceptable
+});
+
 const AIFilterResponseSchema = z
   .discriminatedUnion("success", [
     z.object({
       success: z.literal(true),
-      filters: TaskRunListSearchFilters.omit({ environments: true }),
+      filters: AIFilters,
     }),
     z.object({
       success: z.literal(false),
@@ -137,7 +146,7 @@ export class AIRunFilterService {
   Available filter options:
   - statuses: Array of run statuses (PENDING, EXECUTING, COMPLETED_SUCCESSFULLY, COMPLETED_WITH_ERRORS, CANCELED, TIMED_OUT, CRASHED, etc.)
   - period: Time period string (e.g., "1h", "7d", "30d", "1y")
-  - from/to: Unix ms timestamps for specific time ranges. You'll need to use a converter if they give you a date. Today's date is ${new Date().toISOString()}, if they only specify a day use the current month. If they don't specify a year use the current year. If they don't specify a time of day use midnight to midnight.
+  - from/to: ISO date string. Today's date is ${new Date().toISOString()}, if they only specify a day use the current month. If they don't specify a year use the current year. If they don't specify a time of day use midnight.
   - tags: Array of tag names to filter by. Use the lookupTags tool to get the tags.
   - tasks: Array of task identifiers to filter by. Use the lookupTasks tool to get the tasks.
   - machines: Array of machine presets (micro, small, small-2x, medium, large, xlarge, etc.)
@@ -148,6 +157,7 @@ export class AIRunFilterService {
   - batchId: Specific batch ID to filter by
   - scheduleId: Specific schedule ID to filter by
   
+
   Common patterns to recognize:
   - "failed runs" → statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"].
   - "runs not dequeued yet" → statuses: ["PENDING", "PENDING_VERSION", "DELAYED"]
@@ -158,6 +168,7 @@ export class AIRunFilterService {
   - "past 7 days" → period: "7d"
   - "last hour" → period: "1h"
   - "this month" → period: "30d"
+  - "June 16" -> return a from/to filter.
   - "with tag X" → tags: ["X"]
   - "from task Y" → tasks: ["Y"]
   - "using large machine" → machines: ["large-1x", "large-2x"]
@@ -197,13 +208,6 @@ export class AIRunFilterService {
         },
       });
 
-      // Add debugging to see what the AI returned
-      logger.info("AI filter response", {
-        text,
-        environmentId,
-        result: result.experimental_output,
-      });
-
       if (!result.experimental_output.success) {
         return {
           success: false,
@@ -212,9 +216,7 @@ export class AIRunFilterService {
       }
 
       // Validate the filters against the schema to catch any issues
-      const validationResult = TaskRunListSearchFilters.omit({ environments: true }).safeParse(
-        result.experimental_output.filters
-      );
+      const validationResult = AIFilters.safeParse(result.experimental_output.filters);
       if (!validationResult.success) {
         logger.error("AI filter validation failed", {
           errors: validationResult.error.errors,
@@ -229,7 +231,13 @@ export class AIRunFilterService {
 
       return {
         success: true,
-        filters: validationResult.data,
+        filters: {
+          ...validationResult.data,
+          from: validationResult.data.from
+            ? new Date(validationResult.data.from).getTime()
+            : undefined,
+          to: validationResult.data.to ? new Date(validationResult.data.to).getTime() : undefined,
+        },
       };
     } catch (error) {
       logger.error("AI filter processing failed", {
diff --git a/apps/webapp/evals/aiRunFilter.eval.ts b/apps/webapp/evals/aiRunFilter.eval.ts
@@ -45,8 +45,8 @@ const queryTasks: QueryTasks = {
   query: async () => {
     return {
       tasks: [
-        { slug: "task1", triggerSource: "STANDARD" },
-        { slug: "task2", triggerSource: "SCHEDULED" },
+        { slug: "email-sender", triggerSource: "STANDARD" },
+        { slug: "email-sender-scheduled", triggerSource: "SCHEDULED" },
       ],
     };
   },
@@ -55,6 +55,7 @@ const queryTasks: QueryTasks = {
 evalite("AI Run Filter", {
   data: async () => {
     return [
+      // Basic status filtering
       {
         input: "Completed runs",
         expected: JSON.stringify({
@@ -82,6 +83,152 @@ evalite("AI Run Filter", {
           },
         }),
       },
+      // Time filters
+      {
+        input: "Runs from the past 7 days",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            period: "7d",
+          },
+        }),
+      },
+      {
+        input: "Runs from the last hour",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            period: "1h",
+          },
+        }),
+      },
+      {
+        input: "Runs from this month",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            period: "30d",
+          },
+        }),
+      },
+      {
+        input: "June 16",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            from: new Date("2025-06-16").getTime(),
+            to: new Date("2025-06-17").getTime(),
+          },
+        }),
+      },
+      // Combined filters
+      {
+        input: "Failed runs from the past week",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
+            period: "7d",
+          },
+        }),
+      },
+      {
+        input: "Successful runs from the last 24 hours",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            statuses: ["COMPLETED_SUCCESSFULLY"],
+            period: "1d",
+          },
+        }),
+      },
+      // Root-only filtering
+      {
+        input: "Root runs only",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            rootOnly: true,
+          },
+        }),
+      },
+      {
+        input: "Failed root runs from yesterday",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
+            rootOnly: true,
+            period: "1d",
+          },
+        }),
+      },
+      // Machine filtering
+      {
+        input: "Runs using large machines",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            machines: ["large-1x", "large-2x"],
+          },
+        }),
+      },
+      // Tag, task and queue filtering
+      {
+        input: "Runs with tag production",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            tags: ["production"],
+          },
+        }),
+      },
+      {
+        input: "Runs from task email-sender",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            tasks: ["email-sender"],
+          },
+        }),
+      },
+      {
+        input: "Runs in the shared queue",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            queues: ["shared"],
+          },
+        }),
+      },
+      // Complex combinations
+      {
+        input: "Failed production runs from the past 3 days using large machines",
+        expected: JSON.stringify({
+          success: true,
+          filters: {
+            statuses: ["COMPLETED_WITH_ERRORS", "CRASHED", "TIMED_OUT", "SYSTEM_FAILURE"],
+            tags: ["production"],
+            period: "3d",
+            machines: ["large-1x", "large-2x"],
+          },
+        }),
+      },
+      // Ambiguous cases that should return errors
+      {
+        input: "Show me something",
+        expected: JSON.stringify({
+          success: false,
+          error: "Unclear what to filter",
+        }),
+      },
+      {
+        input: "Runs with unknown status",
+        expected: JSON.stringify({
+          success: false,
+          error: "Unknown status specified",
+        }),
+      },
     ];
   },
   task: async (input) => {
diff --git a/apps/webapp/package.json b/apps/webapp/package.json
@@ -279,4 +279,4 @@
   "engines": {
     "node": ">=16.0.0"
   }
-}
+}

Original file line number	Diff line number	Diff line change
`@@ -279,4 +279,4 @@`
`279`	`279`	`"engines": {`
`280`	`280`	`"node": ">=16.0.0"`
`281`	`281`	`}`
`282`		`-}`
	`282`	`+}`