overseek944 · overseek944 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/.geval/approval.json b/.geval/approval.json
@@ -0,0 +1,7 @@
+{
+  "version": "1",
+  "approved_by": "manavpatel",
+  "reason": "Looks good enough for beta",
+  "timestamp": "2026-03-28T19:03:34Z",
+  "approved": true
+}
diff --git a/.geval/contract.yaml b/.geval/contract.yaml
@@ -0,0 +1,4 @@
+name: text2sql-quality-gate
+version: "1.0.0"
+policies:
+  - path: policies/quality.yaml
diff --git a/.geval/decisions/2026-03-28T18-33-39Z.json b/.geval/decisions/2026-03-28T18-33-39Z.json
@@ -0,0 +1,39 @@
+{
+  "artifact_version": "4",
+  "geval_version": "0.1.2",
+  "bundle_hash": "c47bc2ea9a04100aceb205db07fcd09744c2bfe484e4b0686aea930962a0d8dc",
+  "contracts_combine_rule": "worst_case",
+  "contracts": [
+    {
+      "contract_path": ".geval/contract.yaml",
+      "contract_name": "text2sql-quality-gate",
+      "contract_version": "1.0.0",
+      "contract_hash": "1314dfd196332388ab64303e208e47a79d5aba112c64e2a5772c94835e7b666c",
+      "combine_rule": "worst_case",
+      "policy_results": [
+        {
+          "policy_path": "policies/quality.yaml",
+          "policy_name": "quality-gate",
+          "policy_version": "1.0.0",
+          "policy_hash": "2a2ef6cad081aa13541d916aad82a8a78555db436d1e3e942726da85121b0e37",
+          "outcome": "REQUIRE_APPROVAL",
+          "matched_rule": "response_quality_gate",
+          "matching_rules": [
+            "response_quality_gate"
+          ]
+        }
+      ],
+      "combined_decision": "REQUIRE_APPROVAL",
+      "combined_matched_rule": "policies/quality.yaml:response_quality_gate",
+      "combined_reason": "Response quality dropped below 2.5/5"
+    }
+  ],
+  "overall_combined_decision": "REQUIRE_APPROVAL",
+  "overall_matched_rule": ".geval/contract.yaml:policies/quality.yaml:response_quality_gate",
+  "overall_reason": "Response quality dropped below 2.5/5",
+  "signals_name": "langsmith-evals",
+  "signals_version": "1.0.0",
+  "signals_hash": "a41910c481b05e1774bc1621365ef1b40aae101fec043a120962ed1dfb3972e1",
+  "timestamp": "2026-03-28T18:33:39Z",
+  "approval": null
+}
diff --git a/.geval/decisions/2026-03-28T19-03-34Z.json b/.geval/decisions/2026-03-28T19-03-34Z.json
@@ -0,0 +1,39 @@
+{
+  "artifact_version": "4",
+  "geval_version": "0.1.2",
+  "bundle_hash": "c47bc2ea9a04100aceb205db07fcd09744c2bfe484e4b0686aea930962a0d8dc",
+  "contracts_combine_rule": "worst_case",
+  "contracts": [
+    {
+      "contract_path": ".geval/contract.yaml",
+      "contract_name": "text2sql-quality-gate",
+      "contract_version": "1.0.0",
+      "contract_hash": "1314dfd196332388ab64303e208e47a79d5aba112c64e2a5772c94835e7b666c",
+      "combine_rule": "worst_case",
+      "policy_results": [
+        {
+          "policy_path": "policies/quality.yaml",
+          "policy_name": "quality-gate",
+          "policy_version": "1.0.0",
+          "policy_hash": "2a2ef6cad081aa13541d916aad82a8a78555db436d1e3e942726da85121b0e37",
+          "outcome": "REQUIRE_APPROVAL",
+          "matched_rule": "response_quality_gate",
+          "matching_rules": [
+            "response_quality_gate"
+          ]
+        }
+      ],
+      "combined_decision": "REQUIRE_APPROVAL",
+      "combined_matched_rule": "policies/quality.yaml:response_quality_gate",
+      "combined_reason": "Response quality dropped below 2.5/5"
+    }
+  ],
+  "overall_combined_decision": "REQUIRE_APPROVAL",
+  "overall_matched_rule": ".geval/contract.yaml:policies/quality.yaml:response_quality_gate",
+  "overall_reason": "Response quality dropped below 2.5/5",
+  "signals_name": "langsmith-evals",
+  "signals_version": "1.0.0",
+  "signals_hash": "a41910c481b05e1774bc1621365ef1b40aae101fec043a120962ed1dfb3972e1",
+  "timestamp": "2026-03-28T19:03:34Z",
+  "approval": null
+}
diff --git a/.geval/decisions/2026-03-28T19-18-31Z.json b/.geval/decisions/2026-03-28T19-18-31Z.json
@@ -0,0 +1,39 @@
+{
+  "artifact_version": "4",
+  "geval_version": "0.1.2",
+  "bundle_hash": "c47bc2ea9a04100aceb205db07fcd09744c2bfe484e4b0686aea930962a0d8dc",
+  "contracts_combine_rule": "worst_case",
+  "contracts": [
+    {
+      "contract_path": ".geval/contract.yaml",
+      "contract_name": "text2sql-quality-gate",
+      "contract_version": "1.0.0",
+      "contract_hash": "1314dfd196332388ab64303e208e47a79d5aba112c64e2a5772c94835e7b666c",
+      "combine_rule": "worst_case",
+      "policy_results": [
+        {
+          "policy_path": "policies/quality.yaml",
+          "policy_name": "quality-gate",
+          "policy_version": "1.0.0",
+          "policy_hash": "2a2ef6cad081aa13541d916aad82a8a78555db436d1e3e942726da85121b0e37",
+          "outcome": "REQUIRE_APPROVAL",
+          "matched_rule": "response_quality_gate",
+          "matching_rules": [
+            "response_quality_gate"
+          ]
+        }
+      ],
+      "combined_decision": "REQUIRE_APPROVAL",
+      "combined_matched_rule": "policies/quality.yaml:response_quality_gate",
+      "combined_reason": "Response quality dropped below 2.5/5"
+    }
+  ],
+  "overall_combined_decision": "REQUIRE_APPROVAL",
+  "overall_matched_rule": ".geval/contract.yaml:policies/quality.yaml:response_quality_gate",
+  "overall_reason": "Response quality dropped below 2.5/5",
+  "signals_name": "langsmith-evals",
+  "signals_version": "1.0.0",
+  "signals_hash": "a41910c481b05e1774bc1621365ef1b40aae101fec043a120962ed1dfb3972e1",
+  "timestamp": "2026-03-28T19:18:31Z",
+  "approval": null
+}
diff --git a/.geval/policies/quality.yaml b/.geval/policies/quality.yaml
@@ -0,0 +1,47 @@
+name: quality-gate
+version: "1.0.0"
+policy:
+  rules:
+    - priority: 1
+      name: sql_correctness_gate
+      when:
+        component: sql_eval
+        metric: sql_correctness
+        operator: "<"
+        threshold: 0.7
+      then:
+        action: block
+        reason: "SQL correctness dropped below 70%"
+
+    - priority: 2
+      name: sql_quality_gate
+      when:
+        component: sql_eval
+        metric: sql_quality
+        operator: "<"
+        threshold: 3.0
+      then:
+        action: block
+        reason: "SQL quality dropped below 3.0/5"
+
+    - priority: 3
+      name: e2e_correctness_gate
+      when:
+        component: e2e_eval
+        metric: correctness
+        operator: "<"
+        threshold: 0.6
+      then:
+        action: block
+        reason: "E2E correctness dropped below 60%"
+
+    - priority: 4
+      name: response_quality_gate
+      when:
+        component: e2e_eval
+        metric: response_quality
+        operator: "<"
+        threshold: 2.5
+      then:
+        action: require_approval
+        reason: "Response quality dropped below 2.5/5"
diff --git a/.github/scripts/generate_signals.py b/.github/scripts/generate_signals.py
@@ -0,0 +1,104 @@
+"""Generate signals.json from the latest LangSmith experiments."""
+
+import json
+import os
+import sys
+
+from dotenv import load_dotenv
+
+load_dotenv(override=True)
+
+from langsmith import Client
+
+DATASET_NAME = "text2sql-agent"
+
+
+def get_latest_experiment_stats(client: Client, prefix: str):
+    """Return {metric: average score} for the newest experiment named with ``prefix``.
+
+    Experiments are the projects attached to the DATASET_NAME dataset; the one
+    with the greatest start_time wins. Pre-aggregated feedback_stats are used
+    when present, otherwise scores are averaged from the root runs' feedback.
+
+    Returns None when the dataset cannot be read or no experiment matches, and
+    an empty dict when the experiment has no root runs. A metric maps to None
+    when its pre-aggregated entry is a dict with no "avg" key.
+    """
+    try:
+        dataset = client.read_dataset(dataset_name=DATASET_NAME)
+    except Exception as e:
+        print(f"Error reading dataset: {e}", file=sys.stderr)
+        return None
+
+    experiments = client.list_projects(reference_dataset_id=dataset.id)
+    prefix_experiments = [e for e in experiments if e.name.startswith(prefix)]
+    if not prefix_experiments:
+        print(f"No experiments found with prefix {prefix}", file=sys.stderr)
+        return None
+
+    latest_exp = max(prefix_experiments, key=lambda x: x.start_time)
+    print(f"Latest {prefix} experiment: {latest_exp.name}", file=sys.stderr)
+
+    # First try getting pre-aggregated stats
+    stats = getattr(latest_exp, "feedback_stats", None)
+    if stats:
+        # A dict entry without "avg" yields None (which main skips) instead of
+        # the dict itself, which main's float() conversion cannot handle.
+        return {k: v.get("avg") if isinstance(v, dict) else v for k, v in stats.items()}
+
+    # If missing, aggregate manually from individual runs
+    print("Pre-aggregated feedback_stats missing or empty, calculating from runs...", file=sys.stderr)
+    runs = list(client.list_runs(project_name=latest_exp.name, is_root=True))
+    if not runs:
+        return {}
+
+    metric_values = {}
+    for f in client.list_feedback(run_ids=[r.id for r in runs]):
+        if f.score is not None:
+            metric_values.setdefault(f.key, []).append(f.score)
+
+    return {k: sum(v) / len(v) for k, v in metric_values.items()}
+
+
+def main():
+    """Print a signals JSON document built from the latest LangSmith experiments.
+
+    Each metric average becomes one signal tagged with its component; values
+    that are not finite numbers are skipped with a warning so the output stays
+    valid JSON. Exits with status 1 when no signal could be produced.
+    """
+    import math  # local import: only this function needs it
+
+    client = Client()
+    signals = []
+
+    # (experiment name prefix, component label): SQL evaluation, then E2E agent.
+    sources = (("text2sql-agent-sql", "sql_eval"), ("text2sql-agent-e2e", "e2e_eval"))
+    for prefix, component in sources:
+        stats = get_latest_experiment_stats(client, prefix) or {}
+        for metric, avg in stats.items():
+            if avg is None:
+                continue
+            try:
+                value = float(avg)
+            except (TypeError, ValueError):
+                value = math.nan
+            if not math.isfinite(value):
+                print(f"Skipping {component}.{metric}: {avg!r} is not a finite number", file=sys.stderr)
+                continue
+            signals.append({"system": "text2sql", "component": component, "metric": metric, "value": value})
+
+    if not signals:
+        print("No signals found from LangSmith experiments.", file=sys.stderr)
+        sys.exit(1)
+
+    output = {
+        "name": "langsmith-evals",
+        "version": "1.0.0",
+        "signals": signals
+    }
+    print(json.dumps(output, indent=2))
+
+
+if __name__ == "__main__":
+    main()