BrowserOperator
diff --git a/‎config/gni/devtools_grd_files.gni
Lines changed: 6 additions & 2 deletions b/‎config/gni/devtools_grd_files.gni
Lines changed: 6 additions & 2 deletions
diff --git a/‎eval-server/README.md
Lines changed: 20 additions & 0 deletions b/‎eval-server/README.md
Lines changed: 20 additions & 0 deletions
diff --git a/‎eval-server/docs/TRIGGERING_EVALUATIONS.md
Lines changed: 3 additions & 31 deletions b/‎eval-server/docs/TRIGGERING_EVALUATIONS.md
Lines changed: 3 additions & 31 deletions
diff --git a/‎eval-server/docs/YAML_SCHEMA.md
Lines changed: 0 additions & 13 deletions b/‎eval-server/docs/YAML_SCHEMA.md
Lines changed: 0 additions & 13 deletions
diff --git a/‎eval-server/evals/action-agent/a11y-001.yaml
Lines changed: 46 additions & 0 deletions b/‎eval-server/evals/action-agent/a11y-001.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/‎eval-server/evals/action-agent/accordion-001.yaml
Lines changed: 46 additions & 0 deletions b/‎eval-server/evals/action-agent/accordion-001.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/‎eval-server/evals/action-agent/autocomplete-001.yaml
Lines changed: 46 additions & 0 deletions b/‎eval-server/evals/action-agent/autocomplete-001.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/‎eval-server/evals/action-agent/checkbox-001.yaml
Lines changed: 46 additions & 0 deletions b/‎eval-server/evals/action-agent/checkbox-001.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/‎eval-server/evals/action-agent/checkbox-002.yaml
Lines changed: 47 additions & 0 deletions b/‎eval-server/evals/action-agent/checkbox-002.yaml
Lines changed: 47 additions & 0 deletions
@@ -608,6 +608,10 @@ grd_files_bundled_sources = [
   "front_end/panels/ai_chat/ui/PromptEditDialog.js",
   "front_end/panels/ai_chat/ui/SettingsDialog.js",
   "front_end/panels/ai_chat/ui/EvaluationDialog.js",
+  "front_end/panels/ai_chat/ui/components/TracingConfig.js",
+  "front_end/panels/ai_chat/ui/components/EvaluationConfig.js",
+  "front_end/panels/ai_chat/ui/components/VectorDatabaseConfig.js",
+  "front_end/panels/ai_chat/ui/components/ProviderConfig.js",
   "front_end/panels/ai_chat/core/AgentService.js",
   "front_end/panels/ai_chat/core/State.js",
   "front_end/panels/ai_chat/core/Graph.js",
@@ -650,8 +654,8 @@ grd_files_bundled_sources = [
   "front_end/panels/ai_chat/common/page.js",
   "front_end/panels/ai_chat/common/WebSocketRPCClient.js",
   "front_end/panels/ai_chat/common/EvaluationConfig.js",
-  "front_end/panels/ai_chat/evaluation/EvaluationProtocol.js",
-  "front_end/panels/ai_chat/evaluation/EvaluationAgent.js",
+  "front_end/panels/ai_chat/evaluation/remote/EvaluationProtocol.js",
+  "front_end/panels/ai_chat/evaluation/remote/EvaluationAgent.js",
   "front_end/panels/ai_chat/tracing/TracingProvider.js",
   "front_end/panels/ai_chat/tracing/LangfuseProvider.js",
   "front_end/panels/ai_chat/tracing/TracingConfig.js",
 
@@ -34,6 +34,26 @@ A WebSocket-based evaluation server for LLM agents using LLM-as-a-judge methodol
 - 🖥️ Interactive CLI for testing and management
 - ⚡ Support for concurrent agent evaluations
 
+## OpenAI Compatible API
+
+The server provides an OpenAI-compatible `/v1/responses` endpoint for direct API access:
+
+```bash
+curl -X POST 'http://localhost:8081/v1/responses' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "input": "What is 2+2?",
+    "main_model": "gpt-4.1",
+    "mini_model": "gpt-4.1-nano",
+    "nano_model": "gpt-4.1-nano",
+    "provider": "openai"
+  }'
+```
+
+**Model Precedence:**
+1. Models specified in the **API request** or in an **individual test's YAML file** (highest priority)
+2. **config.yaml defaults** (fallback, used only when neither the API request nor the test specifies models)
+
 ## Agent Protocol
 
 Your agent needs to:
 
@@ -98,36 +98,8 @@ curl -X POST http://localhost:8081/evaluate \\
   }'
 ```
 
-## Method 3: Automatic Scheduling (YAML Configuration)
 
-Evaluations can be configured to run automatically based on their schedule in the YAML file.
-
-### Schedule Types
-
-#### On-Demand (Manual Only)
-```yaml
-schedule:
-  type: "on_demand"
-```
-Only runs when manually triggered.
-
-#### Periodic (Automatic)
-```yaml
-schedule:
-  type: "periodic"
-  interval: 86400000  # Run every 24 hours (in milliseconds)
-```
-Runs automatically at the specified interval.
-
-#### One-Time (Automatic)
-```yaml
-schedule:
-  type: "once"
-  run_at: "2024-12-25T09:00:00Z"  # Run once at specific time
-```
-Runs once at the specified time.
-
-## Method 4: Programmatic Integration
+## Method 3: Programmatic Integration
 
 You can integrate the evaluation system into your own applications:
 
@@ -186,7 +158,7 @@ result = trigger_evaluation(
 print(json.dumps(result, indent=2))
 ```
 
-## Method 5: Webhook Integration
+## Method 4: Webhook Integration
 
 You can set up webhooks to trigger evaluations from external systems:
 
@@ -299,7 +271,7 @@ WebSocket connection failed
 
 ## Best Practices
 
-1. **Start Simple**: Begin with on-demand evaluations before setting up automation
+1. **Start Simple**: Begin with manual evaluations before setting up automation
 2. **Monitor Logs**: Always monitor logs when running evaluations
 3. **Test Connections**: Use the `status` command to verify everything is connected
 4. **Gradual Rollout**: Test individual evaluations before running batch operations
 
@@ -73,13 +73,6 @@ Each evaluation in the `evaluations` array follows this structure:
         summary:
           type: "string"
 
-  # Scheduling configuration
-  schedule:
-    type: "on_demand"          # on_demand|periodic|once
-    # For periodic:
-    interval: 3600000          # Interval in milliseconds
-    # For once:
-    run_at: "2024-01-01T00:00:00Z"  # ISO timestamp
 
   # Validation configuration
   validation:
@@ -233,9 +226,6 @@ evaluations:
           lastModified:
             type: "string"
 
-    schedule:
-      type: "periodic"
-      interval: 86400000  # Daily
 
     validation:
       type: "hybrid"
@@ -275,8 +265,6 @@ evaluations:
       include_sources: true
       depth: "moderate"
 
-    schedule:
-      type: "on_demand"
 
     validation:
       type: "llm-judge"
@@ -301,7 +289,6 @@ evaluations:
 3. **Tool names**: Must match registered tools in the client
 4. **URLs**: Must be valid HTTP/HTTPS URLs
 5. **Timeouts**: Must be positive integers (milliseconds)
-6. **Schedule intervals**: Must be at least 60000ms (1 minute)
 
 ## YAML Best Practices
 
 
@@ -0,0 +1,46 @@
+# Action-agent eval: click a button that is identified by its ARIA label
+id: 'a11y-001'
+name: 'Click Using ARIA Label'
+description: 'Test clicking an element identified primarily by ARIA attributes'
+enabled: true
+
+target:
+  url: 'https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/'
+  wait_for: 'networkidle'
+  wait_timeout: 5000
+
+tool: 'action_agent'
+timeout: 60000
+
+input:
+  objective: 'Click the button with aria-label "Print Page"'
+  reasoning: 'Testing action selection using accessibility attributes'
+
+validation:
+  type: 'llm-judge'
+  llm_judge:
+    model: 'gpt-4.1-mini'
+    temperature: 0.3
+    criteria:  # statements the judge model scores the run against
+      - 'Used accessibility tree to find elements'
+      - 'Correctly identified element by ARIA label'
+      - 'Successfully clicked the target button'
+      - 'Demonstrated understanding of accessibility attributes'
+      - 'No reliance on visual appearance alone'
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - 'Verify the Print Page button was successfully clicked'
+        - 'Check if any print dialog or print preview appeared'
+        - 'Confirm the button showed visual feedback (pressed state)'
+        - 'Ensure the action was performed on the correct accessibility-labeled element'
+
+metadata:
+  tags: ['action', 'accessibility', 'aria', 'click', 'a11y']
+  priority: 'high'
+  timeout: 60000  # same value as the top-level timeout
+  retries: 2
+  flaky: false
+  owner: 'devtools-team'
@@ -0,0 +1,46 @@
+# Action-agent eval: expand one panel of the jQuery UI accordion demo
+id: 'accordion-001'
+name: 'Expand Accordion Section'
+description: 'Test clicking to expand an accordion panel'
+enabled: true
+
+target:
+  url: 'https://jqueryui.com/accordion/'
+  wait_for: 'networkidle'
+  wait_timeout: 5000
+
+tool: 'action_agent'
+timeout: 60000
+
+input:
+  objective: 'Click to expand the "Section 2" accordion panel'
+  reasoning: 'Testing accordion expand/collapse interaction'
+
+validation:
+  type: 'llm-judge'
+  llm_judge:
+    model: 'gpt-4.1-mini'
+    temperature: 0.3
+    criteria:  # statements the judge model scores the run against
+      - 'Located the Section 2 accordion header'
+      - 'Successfully clicked to expand the section'
+      - 'Section 2 content became visible'
+      - 'Other sections collapsed appropriately'
+      - 'Accordion animation completed smoothly'
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - 'Verify Section 2 is now expanded and content visible'
+        - 'Check if other accordion sections collapsed'
+        - 'Confirm the expansion animation completed'
+        - 'Ensure Section 2 header shows expanded state'
+
+metadata:
+  tags: ['action', 'accordion', 'expand', 'collapse', 'ui']
+  priority: 'high'
+  timeout: 60000  # same value as the top-level timeout
+  retries: 2
+  flaky: false
+  owner: 'devtools-team'
@@ -0,0 +1,46 @@
+# Action-agent eval: type into an autocomplete field and pick a suggestion
+id: 'autocomplete-001'
+name: 'Use Autocomplete Search'
+description: 'Test typing in autocomplete field and selecting from suggestions'
+enabled: true
+
+target:
+  url: 'https://jqueryui.com/autocomplete/'
+  wait_for: 'networkidle'
+  wait_timeout: 5000
+
+tool: 'action_agent'
+timeout: 60000
+
+input:
+  objective: 'Type "Java" in the autocomplete field and select "JavaScript" from suggestions'
+  reasoning: 'Testing autocomplete/typeahead interaction patterns'
+
+validation:
+  type: 'llm-judge'
+  llm_judge:
+    model: 'gpt-4.1-mini'
+    temperature: 0.3
+    criteria:  # statements the judge model scores the run against
+      - 'Located the autocomplete input field'
+      - 'Typed "Java" to trigger suggestions'
+      - 'Autocomplete dropdown appeared with suggestions'
+      - 'Selected "JavaScript" from the suggestion list'
+      - 'Input field shows the selected value'
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - 'Verify "JavaScript" appears in the input field'
+        - 'Check if autocomplete suggestions appeared'
+        - 'Confirm the correct suggestion was selected'
+        - 'Ensure dropdown closed after selection'
+
+metadata:
+  tags: ['action', 'autocomplete', 'typeahead', 'search', 'suggestions']
+  priority: 'high'
+  timeout: 60000  # same value as the top-level timeout
+  retries: 2
+  flaky: false
+  owner: 'devtools-team'
@@ -0,0 +1,46 @@
+# Action-agent eval: check the "I have a bike" checkbox on the W3Schools try-it page
+id: "checkbox-001"
+name: "Check Bike Checkbox"
+description: "Test clicking checkbox elements for form options"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 45000  # NOTE(review): metadata.timeout below is 60000 — confirm which value is intended
+
+input:
+  objective: "Click the checkbox labeled \"I have a bike\" to check it"
+  reasoning: "Testing interaction with checkbox form elements"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:  # statements the judge model scores the run against
+      - "Identified the correct checkbox among multiple options"
+      - "Used click action on the checkbox element"
+      - "Checkbox state changed from unchecked to checked"
+      - "Handled the iframe structure if present"
+      - "No errors with form element interaction"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify the checkbox state changed from unchecked to checked"
+        - "Confirm the \"I have a bike\" checkbox now shows a checkmark"
+        - "Verify the checkbox visual indicator (checkmark) is clearly visible"
+        - "Ensure no other checkboxes were accidentally modified"
+
+metadata:
+  tags: ["action", "checkbox", "form", "w3schools", "input"]
+  priority: "high"
+  timeout: 60000  # NOTE(review): top-level timeout is 45000 — sibling evals keep both equal; confirm
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
@@ -0,0 +1,47 @@
+# Action-agent eval: check the "Extra Cheese" checkbox on the httpbin form page
+id: 'checkbox-002'
+name: 'Check Extra Cheese Checkbox'
+description: 'Test checking a specific checkbox using the check method'
+enabled: true
+
+target:
+  url: 'https://httpbin.org/forms/post'
+  wait_for: 'networkidle'
+  wait_timeout: 5000
+
+tool: 'action_agent'
+timeout: 45000
+
+input:
+  objective: 'Find and check the "Extra Cheese" checkbox in the Pizza Toppings section'
+  reasoning: 'Testing checkbox interaction functionality using check method'
+  hint: 'Look for the Extra Cheese checkbox and use the check method to select it'
+
+validation:
+  type: 'llm-judge'
+  llm_judge:
+    model: 'gpt-4.1-mini'
+    temperature: 0.3
+    criteria:  # statements the judge model scores the run against
+      - 'Located the Extra Cheese checkbox in the Pizza Toppings section'
+      - 'Used the check method instead of click for better reliability'
+      - "Checkbox became checked (if it wasn't already)"
+      - 'No errors occurred during checkbox interaction'
+      - 'Form maintained its structure after checkbox selection'
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - 'Verify the Extra Cheese checkbox is now checked (shows checkmark)'
+        - 'Check that the checkbox shows proper visual feedback for checked state'
+        - 'Confirm the form structure remained intact'
+        - 'Ensure the checkbox for Extra Cheese was specifically targeted and checked'
+
+metadata:
+  tags: ['action', 'checkbox', 'check', 'form', 'httpbin']
+  priority: 'high'
+  timeout: 45000  # same value as the top-level timeout
+  retries: 2
+  flaky: false
+  owner: 'devtools-team'