From 48976fae1418d9e256f0f5b7beecf54583d988dd Mon Sep 17 00:00:00 2001 From: Noah Gift Date: Tue, 12 May 2026 13:34:30 +0200 Subject: [PATCH] feat(apr-cli): extend APR_EVAL_DEBUG diagnostic to MBPP harness (PMAT-CODE-MBPP-DIAG-001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The §69 diagnostic surface (PR #1634) and §70 RC3 fix (PR #1635) closed the harness-bug class for HumanEval. MBPP's path (run_mbpp_inference + run_mbpp_inference_cuda) was not yet instrumented. This PR extends APR_EVAL_DEBUG to MBPP so future investigation of MBPP failures has ground-truth diagnostics on the same surface. What changes: - run_mbpp_inference (CPU path) now calls execute_python_test_with_diagnostics and emits /tmp/apr_eval_debug_MBPP_<N>.json when APR_EVAL_DEBUG is set (e.g. APR_EVAL_DEBUG=1; any value enables it). - run_mbpp_inference_cuda (CUDA path) gets the same treatment. What does NOT change: - run_mbpp_inference still uses the legacy AprTransformer::forward_with_cache + AprKVCache path. PMAT-CODE-SHIP-005-FIX (PR #1616) replaced this for HumanEval with realizar::run_inference + OwnedQuantizedModel::from_apr. MBPP needs the same routing fix — but that's a separate multi-PR cascade scope (also includes H4 ChatML wrap + R1+R2 extraction equivalents for MBPP). Out of scope for this PR. - MBPP prompts are natural language (not Python signatures), so the §70 RC3 import-stripping bug does NOT apply to MBPP. 
Why ship this now: - Pure diagnostic — zero behaviour change for non-APR_EVAL_DEBUG callers - Lets us run a 1-problem MBPP smoke under APR_EVAL_DEBUG=1 to verify the legacy path's failure mode (currently undiagnosed) - Mirrors the pattern that successfully diagnosed §69 RC3 in 5 minutes on gx10 Test plan: - [x] cargo check -p apr-cli --features inference → clean - [x] cargo check -p apr-cli --features "inference,cuda,training" → clean - [x] cargo fmt --all → clean - [ ] gx10 single-MBPP-problem APR_EVAL_DEBUG=1 smoke (next slice; will document MBPP failure mode in a §72-class amendment) Refs: - crates/apr-cli/src/commands/eval/inference.rs::write_apr_eval_debug - contracts/apr-eval-humaneval-harness-invariant-v1.yaml v1.1.0 - PR #1634 (HumanEval diagnostic surface) - PR #1635 (HumanEval RC3 fix; cascade base for this branch) Closes task #53 (MBPP harness diagnostic extension; renamed from "RC3 prompt-preamble fix" since RC3 does not apply to MBPP's NL prompts — that decision recorded in commit body). 
Co-Authored-By: Claude Opus 4.7 --- crates/apr-cli/src/commands/eval/inference.rs | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/crates/apr-cli/src/commands/eval/inference.rs b/crates/apr-cli/src/commands/eval/inference.rs index a69960931..4154dbd08 100644 --- a/crates/apr-cli/src/commands/eval/inference.rs +++ b/crates/apr-cli/src/commands/eval/inference.rs @@ -1535,7 +1535,19 @@ fn run_mbpp_inference( format!("{completion}\n{setup}\n{tests}\n") }; - let ok = execute_python_test(&full_program, 10); + let exec_result = execute_python_test_with_diagnostics(&full_program, 10); + let ok = exec_result.success; + + if std::env::var("APR_EVAL_DEBUG").is_ok() { + write_apr_eval_debug( + &task_id, + &prompt, + &tokenizer.decode(&tokens), + completion, + &full_program, + &exec_result, + ); + } if ok { passed += 1; @@ -1657,7 +1669,19 @@ fn run_mbpp_inference_cuda( format!("{completion}\n{setup}\n{tests}\n") }; - let ok = execute_python_test(&full_program, 10); + let exec_result = execute_python_test_with_diagnostics(&full_program, 10); + let ok = exec_result.success; + + if std::env::var("APR_EVAL_DEBUG").is_ok() { + write_apr_eval_debug( + &task_id, + &prompt, + &tokenizer.decode(&tokens), + completion, + &full_program, + &exec_result, + ); + } if ok { passed += 1;