Skip to content

Commit aab3abe

Browse files
committed
Add scorer for selected ContextBench prediction
1 parent 66a522b commit aab3abe

1 file changed

Lines changed: 176 additions & 0 deletions

File tree

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2+
import { join } from 'node:path';
3+
import { spawnSync } from 'node:child_process';
4+
5+
/**
 * Reads an environment variable that this script cannot run without.
 *
 * @param {string} name - Name of the environment variable.
 * @returns {string} The variable's value.
 * @throws {Error} If the variable is not set at all.
 */
function requireEnv(name) {
  const value = process.env[name];
  // Only `undefined` is rejected: every case that throws here already threw
  // before (as an opaque TypeError from `join` / `readFileSync`).
  if (value === undefined) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
}

// Output root; per-run artifacts go under `runDir`.
const root = requireEnv('ROOT');
// Optional: used as the evaluator's working directory further down.
const officialContextBench = process.env.OFFICIAL_CONTEXTBENCH;
const payloads = JSON.parse(readFileSync(requireEnv('TASK_PAYLOADS'), 'utf8'));
// The script scores exactly one task: the third entry of the payload file.
const task = payloads?.tasks?.[2];
if (!task) {
  throw new Error('TASK_PAYLOADS does not contain a task at index 2');
}
const runDir = join(root, 'selected-codebase-memory-mcp');
mkdirSync(runDir, { recursive: true });
11+
12+
// Hard-coded selection that this script turns into a prediction and scores.
// `files` lists the selected paths; `spans` lists inclusive line ranges within
// those files; `rationale` is free text saved alongside them in selection.json.
const selection = {
  files: ['core/metrics/prometheus.go', 'cmd/root.go', 'server/subsonic/helpers.go'],
  spans: [
    { file: 'core/metrics/prometheus.go', start: 15, end: 17 },
    { file: 'core/metrics/prometheus.go', start: 19, end: 25 },
    { file: 'core/metrics/prometheus.go', start: 33, end: 38 },
    { file: 'core/metrics/prometheus.go', start: 40, end: 49 },
    { file: 'core/metrics/prometheus.go', start: 51, end: 100 },
    { file: 'core/metrics/prometheus.go', start: 102, end: 123 },
    { file: 'cmd/root.go', start: 194, end: 201 },
    { file: 'cmd/root.go', start: 204, end: 220 },
    { file: 'server/subsonic/helpers.go', start: 129, end: 187 },
    { file: 'server/subsonic/helpers.go', start: 189, end: 201 },
    { file: 'server/subsonic/helpers.go', start: 256, end: 268 },
  ],
  rationale:
    'These spans cover the startup metrics writer path and the Subsonic auth-header/Bearer-token parsing path where the two reported behaviors most likely live.',
};
30+
31+
/**
 * Runs a command synchronously and returns a JSON-serialisable record of the
 * invocation, suitable for writing to a `*-command.json` log file.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} [args=[]] - Arguments passed to the executable.
 * @param {object} [opts]
 * @param {string} [opts.cwd] - Working directory; falls back to `process.cwd()`.
 * @param {object} [opts.env] - Environment; falls back to `process.env`.
 * @param {number} [opts.timeoutMs] - Timeout in ms; falls back to 600000.
 * @returns {{command: string, cwd: string, status: (number|null),
 *   signal: (string|null), error: (string|null), durationMs: number,
 *   stdout: string, stderr: string}} `error` is the message of the spawn
 *   error when `spawnSync` reports one, otherwise `null`; `stdout`/`stderr`
 *   are empty strings when no output was captured.
 */
function run(cmd, args = [], opts = {}) {
  // Resolved once so the cwd recorded below is exactly the one the child used.
  const cwd = opts.cwd || process.cwd();
  const started = Date.now();
  const r = spawnSync(cmd, args, {
    cwd,
    env: opts.env || process.env,
    encoding: 'utf8',
    timeout: opts.timeoutMs || 600000,
    // Output is captured in memory; allow up to 128 MiB per stream.
    maxBuffer: 128 * 1024 * 1024,
  });
  return {
    command: [cmd, ...args].join(' '),
    cwd,
    status: r.status,
    signal: r.signal,
    error: r.error?.message || null,
    durationMs: Date.now() - started,
    stdout: r.stdout || '',
    stderr: r.stderr || '',
  };
}
51+
52+
/**
 * Coerces a raw span bound to a whole, finite, 1-based line number.
 *
 * @param {*} value - Raw bound (number, numeric string, or anything else).
 * @param {number} fallback - Returned when `value` is not a usable line number.
 * @returns {number} `value` rounded down to an integer >= 1, or `fallback`.
 */
function normalizeLine(value, fallback) {
  const line = Math.floor(Number(value));
  // Rejects NaN, +/-Infinity and anything below line 1 in one check; Infinity
  // in particular would otherwise be written to JSON as `null`.
  return Number.isFinite(line) && line >= 1 ? line : fallback;
}

/**
 * Appends a normalised `{ start, end }` span to the list stored for `file`.
 *
 * `start` falls back to 1 when unusable; `end` falls back to, and is never
 * smaller than, the normalised start.
 *
 * @param {Map<string, Array<{start: number, end: number}>>} map - Spans per
 *   file; mutated in place.
 * @param {string} file - Key under which the span is stored.
 * @param {*} start - Raw first line of the span.
 * @param {*} end - Raw last line of the span.
 * @returns {void}
 */
function addSpan(map, file, start, end) {
  const s = normalizeLine(start, 1);
  const e = Math.max(s, normalizeLine(end, s));
  const list = map.get(file) || [];
  list.push({ start: s, end: e });
  map.set(file, list);
}
59+
60+
// Group the selected spans per file, normalising each one through addSpan.
const spanMap = new Map();
for (const { file, start, end } of selection.spans) {
  addSpan(spanMap, file, start, end);
}

// Union of the explicitly listed files and every file that owns a span,
// de-duplicated, with the explicit list first.
const predFiles = [...new Set([...selection.files, ...selection.spans.map((span) => span.file)])];
const predSpans = Object.fromEntries(spanMap);

// Prediction record handed to the evaluator. The same files/spans appear both
// as a single step and as the overall prediction; no patch is submitted.
const prediction = {
  instance_id: task.instance_id,
  repo_url: task.repo_checkout_path,
  commit: task.base_commit,
  traj_data: {
    pred_steps: [{ files: predFiles, spans: predSpans }],
    pred_files: predFiles,
    pred_spans: predSpans,
  },
  model_patch: '',
};

writeFileSync(join(runDir, 'selection.json'), JSON.stringify(selection, null, 2));
writeFileSync(join(runDir, 'prediction.json'), JSON.stringify(prediction, null, 2));
78+
79+
// Materialise the gold context for the task by invoking the slice-selection
// script, then persist the full command record next to the other artifacts.
const goldPath = join(runDir, 'gold.json');
const gold = run(
  'node',
  [
    'scripts/contextbench-select-slice.mjs',
    '--write-gold',
    '--task-id',
    task.instance_id,
    '--out',
    goldPath,
    '--payloads',
    process.env.TASK_PAYLOADS,
  ],
  { timeoutMs: 600000 },
);
writeFileSync(join(runDir, 'gold-command.json'), JSON.stringify(gold, null, 2));
if (gold.status !== 0) {
  // When the process never started or was killed there is no stderr/stdout,
  // so fall back to the spawn error, the signal, or the raw exit status
  // instead of throwing an empty message.
  const detail =
    gold.stderr ||
    gold.stdout ||
    gold.error ||
    (gold.signal ? `terminated by signal ${gold.signal}` : `exit status ${gold.status}`);
  throw new Error(`gold materialization failed: ${detail}`);
}
98+
99+
// Run the evaluator module against the gold and prediction files, with
// `officialContextBench` as working directory and a 20-minute timeout, then
// persist the full command record.
const scorePath = join(runDir, 'official-score.jsonl');
const evaluator = run(
  'python',
  [
    '-m',
    'contextbench.evaluate',
    '--gold',
    goldPath,
    '--pred',
    join(runDir, 'prediction.json'),
    '--cache',
    join(runDir, 'repo-cache'),
    '--out',
    scorePath,
  ],
  { cwd: officialContextBench, timeoutMs: 1200000 },
);
writeFileSync(join(runDir, 'evaluator-command.json'), JSON.stringify(evaluator, null, 2));

// The score is the last non-empty line of the JSONL output, if there is one.
let score = null;
if (existsSync(scorePath)) {
  const lines = readFileSync(scorePath, 'utf8').trim().split(/\n+/).filter(Boolean);
  if (lines.length > 0) {
    try {
      score = JSON.parse(lines.at(-1));
    } catch (err) {
      // A truncated or malformed score line must not abort the script: leaving
      // `score` null lets the failure be reported like any other unscoreable run.
      console.error(`could not parse last line of ${scorePath}: ${err.message}`);
    }
  }
}
123+
124+
// A run counts as scoreable only when the evaluator exited cleanly AND a score
// record was read back from its output file.
const scoreable = evaluator.status === 0 && Boolean(score);
const final = score?.final || {};

// Per-task result row. Apart from the prediction counts, `scoreable` and
// `score`, every value below is a literal recorded as-is by this script.
const row = {
  lane_id: 'codebase-memory-mcp',
  task_id: task.instance_id,
  model: 'gpt-5.4-mini-high',
  predictionSource: 'gpt-5.4-mini-high subagent over real codebase-memory-mcp candidate pack',
  status: scoreable ? 'completed' : 'judge_failed',
  setupStatus: 'completed',
  indexStatus: 'completed',
  toolCallable: true,
  candidateCount: 100,
  nonEmptyPrediction: predFiles.length > 0 && selection.spans.length > 0,
  predFiles: predFiles.length,
  predSpans: selection.spans.length,
  officialEvaluatorScoreable: scoreable,
  setupIndex: { setupDurationMs: 4, indexDurationMs: 2038, queryDurationMs: 365 },
  score,
};

// One flattened table entry per scoreable row; missing metrics become null.
const resultsTable = [];
if (scoreable) {
  const entry = { lane: row.lane_id, task: row.task_id };
  // Key order is file, symbol, span, line, each as Coverage then Precision.
  for (const level of ['file', 'symbol', 'span', 'line']) {
    entry[`${level}Coverage`] = final[level]?.coverage ?? null;
    entry[`${level}Precision`] = final[level]?.precision ?? null;
  }
  // Edit-location metrics are read from the score root, not from `final`.
  entry.editlocRecall = score.editloc?.recall ?? null;
  entry.editlocPrecision = score.editloc?.precision ?? null;
  resultsTable.push(entry);
}

const summary = {
  createdAt: new Date().toISOString(),
  attemptedRows: 1,
  scoreableRows: scoreable ? 1 : 0,
  setupIndexCostReportedSeparately: true,
  model: row.model,
  caveat: 'Prediction was selected by a Codex gpt-5.4-mini-high subagent because GitHub Actions has no OpenAI API key for direct CI inference.',
  resultsTable,
  rows: [row],
};

writeFileSync(join(runDir, 'row.json'), JSON.stringify(row, null, 2));
writeFileSync(join(root, 'summary.json'), JSON.stringify(summary, null, 2));
// The summary is also printed between fixed marker lines on stdout.
console.log('CONTEXTBENCH_SELECTED_SCORE_JSON_START');
console.log(JSON.stringify(summary, null, 2));
console.log('CONTEXTBENCH_SELECTED_SCORE_JSON_END');
// Signal failure through the exit code without cutting off pending output.
if (!scoreable) process.exitCode = 1;

0 commit comments

Comments
 (0)