Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .geval/approval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"version": "1",
"approved_by": "manavpatel",
"reason": "Looks good enough for beta",
"timestamp": "2026-03-28T19:03:34Z",
"approved": true
}
4 changes: 4 additions & 0 deletions .geval/contract.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Contract for the text2sql quality gate: names the contract, pins its version,
# and lists the policy files it applies.
# NOTE(review): the policy path looks relative to the .geval/ directory
# (the policy file in this change is .geval/policies/quality.yaml) — confirm.
name: text2sql-quality-gate
version: "1.0.0"
policies:
- path: policies/quality.yaml
39 changes: 39 additions & 0 deletions .geval/decisions/2026-03-28T18-33-39Z.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"artifact_version": "4",
"geval_version": "0.1.2",
"bundle_hash": "c47bc2ea9a04100aceb205db07fcd09744c2bfe484e4b0686aea930962a0d8dc",
"contracts_combine_rule": "worst_case",
"contracts": [
{
"contract_path": ".geval/contract.yaml",
"contract_name": "text2sql-quality-gate",
"contract_version": "1.0.0",
"contract_hash": "1314dfd196332388ab64303e208e47a79d5aba112c64e2a5772c94835e7b666c",
"combine_rule": "worst_case",
"policy_results": [
{
"policy_path": "policies/quality.yaml",
"policy_name": "quality-gate",
"policy_version": "1.0.0",
"policy_hash": "2a2ef6cad081aa13541d916aad82a8a78555db436d1e3e942726da85121b0e37",
"outcome": "REQUIRE_APPROVAL",
"matched_rule": "response_quality_gate",
"matching_rules": [
"response_quality_gate"
]
}
],
"combined_decision": "REQUIRE_APPROVAL",
"combined_matched_rule": "policies/quality.yaml:response_quality_gate",
"combined_reason": "Response quality dropped below 2.5/5"
}
],
"overall_combined_decision": "REQUIRE_APPROVAL",
"overall_matched_rule": ".geval/contract.yaml:policies/quality.yaml:response_quality_gate",
"overall_reason": "Response quality dropped below 2.5/5",
"signals_name": "langsmith-evals",
"signals_version": "1.0.0",
"signals_hash": "a41910c481b05e1774bc1621365ef1b40aae101fec043a120962ed1dfb3972e1",
"timestamp": "2026-03-28T18:33:39Z",
"approval": null
}
39 changes: 39 additions & 0 deletions .geval/decisions/2026-03-28T19-03-34Z.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"artifact_version": "4",
"geval_version": "0.1.2",
"bundle_hash": "c47bc2ea9a04100aceb205db07fcd09744c2bfe484e4b0686aea930962a0d8dc",
"contracts_combine_rule": "worst_case",
"contracts": [
{
"contract_path": ".geval/contract.yaml",
"contract_name": "text2sql-quality-gate",
"contract_version": "1.0.0",
"contract_hash": "1314dfd196332388ab64303e208e47a79d5aba112c64e2a5772c94835e7b666c",
"combine_rule": "worst_case",
"policy_results": [
{
"policy_path": "policies/quality.yaml",
"policy_name": "quality-gate",
"policy_version": "1.0.0",
"policy_hash": "2a2ef6cad081aa13541d916aad82a8a78555db436d1e3e942726da85121b0e37",
"outcome": "REQUIRE_APPROVAL",
"matched_rule": "response_quality_gate",
"matching_rules": [
"response_quality_gate"
]
}
],
"combined_decision": "REQUIRE_APPROVAL",
"combined_matched_rule": "policies/quality.yaml:response_quality_gate",
"combined_reason": "Response quality dropped below 2.5/5"
}
],
"overall_combined_decision": "REQUIRE_APPROVAL",
"overall_matched_rule": ".geval/contract.yaml:policies/quality.yaml:response_quality_gate",
"overall_reason": "Response quality dropped below 2.5/5",
"signals_name": "langsmith-evals",
"signals_version": "1.0.0",
"signals_hash": "a41910c481b05e1774bc1621365ef1b40aae101fec043a120962ed1dfb3972e1",
"timestamp": "2026-03-28T19:03:34Z",
"approval": null
}
39 changes: 39 additions & 0 deletions .geval/decisions/2026-03-28T19-18-31Z.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"artifact_version": "4",
"geval_version": "0.1.2",
"bundle_hash": "c47bc2ea9a04100aceb205db07fcd09744c2bfe484e4b0686aea930962a0d8dc",
"contracts_combine_rule": "worst_case",
"contracts": [
{
"contract_path": ".geval/contract.yaml",
"contract_name": "text2sql-quality-gate",
"contract_version": "1.0.0",
"contract_hash": "1314dfd196332388ab64303e208e47a79d5aba112c64e2a5772c94835e7b666c",
"combine_rule": "worst_case",
"policy_results": [
{
"policy_path": "policies/quality.yaml",
"policy_name": "quality-gate",
"policy_version": "1.0.0",
"policy_hash": "2a2ef6cad081aa13541d916aad82a8a78555db436d1e3e942726da85121b0e37",
"outcome": "REQUIRE_APPROVAL",
"matched_rule": "response_quality_gate",
"matching_rules": [
"response_quality_gate"
]
}
],
"combined_decision": "REQUIRE_APPROVAL",
"combined_matched_rule": "policies/quality.yaml:response_quality_gate",
"combined_reason": "Response quality dropped below 2.5/5"
}
],
"overall_combined_decision": "REQUIRE_APPROVAL",
"overall_matched_rule": ".geval/contract.yaml:policies/quality.yaml:response_quality_gate",
"overall_reason": "Response quality dropped below 2.5/5",
"signals_name": "langsmith-evals",
"signals_version": "1.0.0",
"signals_hash": "a41910c481b05e1774bc1621365ef1b40aae101fec043a120962ed1dfb3972e1",
"timestamp": "2026-03-28T19:18:31Z",
"approval": null
}
47 changes: 47 additions & 0 deletions .geval/policies/quality.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Quality-gate policy: four threshold rules over evaluation signals.
# Each rule selects one signal by `component` + `metric`, compares its value to
# `threshold` with `operator`, and, when the comparison holds, reports the
# `action` and `reason` under `then`.
# NOTE(review): `priority` presumably controls rule ordering (1 = first) —
# confirm against the geval documentation.
name: quality-gate
version: "1.0.0"
policy:
rules:
# Block when SQL correctness is under 0.7 (the reason text treats this as 70%).
- priority: 1
name: sql_correctness_gate
when:
component: sql_eval
metric: sql_correctness
operator: "<"
threshold: 0.7
then:
action: block
reason: "SQL correctness dropped below 70%"

# Block when SQL quality is under 3.0 (the reason text gives a 5-point scale).
- priority: 2
name: sql_quality_gate
when:
component: sql_eval
metric: sql_quality
operator: "<"
threshold: 3.0
then:
action: block
reason: "SQL quality dropped below 3.0/5"

# Block when end-to-end correctness is under 0.6 (60% per the reason text).
- priority: 3
name: e2e_correctness_gate
when:
component: e2e_eval
metric: correctness
operator: "<"
threshold: 0.6
then:
action: block
reason: "E2E correctness dropped below 60%"

# Softer gate: low response quality asks for a human approval instead of
# blocking outright.
- priority: 4
name: response_quality_gate
when:
component: e2e_eval
metric: response_quality
operator: "<"
threshold: 2.5
then:
action: require_approval
reason: "Response quality dropped below 2.5/5"
104 changes: 104 additions & 0 deletions .github/scripts/generate_signals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Generate signals.json from the latest LangSmith experiments."""

import json
import os
import sys

from dotenv import load_dotenv

load_dotenv(override=True)

from langsmith import Client

# Name of the LangSmith dataset whose experiments are inspected.
DATASET_NAME = "text2sql-agent"


def get_latest_experiment_stats(client: Client, prefix: str):
    """Return average feedback scores for the newest experiment matching *prefix*.

    Experiments are the projects attached to the ``DATASET_NAME`` dataset whose
    name starts with ``prefix``; the one with the greatest ``start_time`` wins
    (experiments without a ``start_time`` are treated as oldest).

    Args:
        client: LangSmith client used for all lookups.
        prefix: Experiment-name prefix selecting the experiment family.

    Returns:
        ``None`` when the dataset cannot be read or no experiment matches,
        otherwise a ``{feedback_key: average}`` dict containing only numeric
        averages. The dict is empty when the experiment has no runs or no
        scored feedback.
    """
    try:
        dataset = client.read_dataset(dataset_name=DATASET_NAME)
    except Exception as e:
        print(f"Error reading dataset: {e}", file=sys.stderr)
        return None

    candidates = [
        exp
        for exp in client.list_projects(reference_dataset_id=dataset.id)
        if exp.name.startswith(prefix)
    ]
    if not candidates:
        print(f"No experiments found with prefix {prefix}", file=sys.stderr)
        return None

    # The leading boolean keeps None start_times from being ordered against
    # datetimes (or each other), which would raise TypeError.
    latest_exp = max(
        candidates,
        key=lambda exp: (exp.start_time is not None, exp.start_time),
    )
    print(f"Latest {prefix} experiment: {latest_exp.name}", file=sys.stderr)

    # First try getting pre-aggregated stats.
    stats = getattr(latest_exp, "feedback_stats", None)
    if stats:
        averages = {}
        for key, value in stats.items():
            avg = value.get("avg") if isinstance(value, dict) else value
            if isinstance(avg, (int, float)):
                averages[key] = avg
            else:
                # Never hand a dict/None/str downstream where a number is expected.
                print(
                    f"Ignoring feedback stat {key!r} without a numeric average",
                    file=sys.stderr,
                )
        return averages

    # If missing, aggregate manually from individual runs.
    print(
        "Pre-aggregated feedback_stats missing or empty, calculating from runs...",
        file=sys.stderr,
    )
    run_ids = [
        run.id
        for run in client.list_runs(project_name=latest_exp.name, is_root=True)
    ]
    if not run_ids:
        return {}

    scores_by_key = {}
    for feedback in client.list_feedback(run_ids=run_ids):
        if feedback.score is not None:
            scores_by_key.setdefault(feedback.key, []).append(feedback.score)

    # Every list holds at least one score, so the division is always safe.
    return {key: sum(scores) / len(scores) for key, scores in scores_by_key.items()}


def _signals_from_stats(component, stats):
    """Build signal records for one component from a ``{metric: average}`` mapping.

    ``None``, non-numeric, and non-finite values are skipped (the latter two
    with a note on stderr) so that one bad metric cannot abort the run or
    produce invalid JSON.
    """
    import math  # local import keeps this block self-contained

    records = []
    for metric, avg in (stats or {}).items():
        if avg is None:
            continue
        try:
            value = float(avg)
        except (TypeError, ValueError):
            print(
                f"Skipping non-numeric value for {component}/{metric}: {avg!r}",
                file=sys.stderr,
            )
            continue
        if not math.isfinite(value):
            # json.dumps would emit NaN/Infinity, which is not valid JSON.
            print(
                f"Skipping non-finite value for {component}/{metric}: {value}",
                file=sys.stderr,
            )
            continue
        records.append({
            "system": "text2sql",
            "component": component,
            "metric": metric,
            "value": value,
        })
    return records


def main():
    """Print a signals JSON document built from the latest LangSmith experiments.

    Collects averaged feedback metrics for the SQL and end-to-end experiment
    families and writes one JSON object (``name``, ``version``, ``signals``) to
    stdout. Diagnostics go to stderr. Exits with status 1 when no signal at all
    could be produced.
    """
    client = Client()
    signals = []

    # (component label used in the signals, experiment-name prefix in LangSmith)
    sources = (
        ("sql_eval", "text2sql-agent-sql"),
        ("e2e_eval", "text2sql-agent-e2e"),
    )
    for component, prefix in sources:
        stats = get_latest_experiment_stats(client, prefix)
        component_signals = _signals_from_stats(component, stats)
        if not component_signals:
            # Make the gap visible: rules keyed on this component get no input.
            print(
                f"Warning: no signals produced for {component} (prefix {prefix})",
                file=sys.stderr,
            )
        signals.extend(component_signals)

    if not signals:
        print("No signals found from LangSmith experiments.", file=sys.stderr)
        sys.exit(1)

    output = {
        "name": "langsmith-evals",
        "version": "1.0.0",
        "signals": signals,
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
Loading
Loading