diff --git a/crates/apr-cli/src/commands/eval/inference.rs b/crates/apr-cli/src/commands/eval/inference.rs index 71475f04a..f459e80e1 100644 --- a/crates/apr-cli/src/commands/eval/inference.rs +++ b/crates/apr-cli/src/commands/eval/inference.rs @@ -274,6 +274,19 @@ fn load_humaneval_tokenizer( } /// ALB-084: Run HumanEval with actual model inference + Python test execution. +/// +/// PMAT-CODE-SHIP-005-FIX (2026-05-11): routed through `realizar::run_inference` +/// + `OwnedQuantizedModel::from_apr` (the same working path that SHIP-002 + +/// SHIP-006 + SHIP-008 LIVE-discharged) instead of the legacy +/// `AprTransformer::forward_with_cache + AprKVCache` path. The legacy path +/// produced 0/3 pass@1 on canonical 7B teacher smoke test (every problem +/// FAIL); the run_inference path produces the canonical pairwise-comparison +/// solution for HumanEval/0 (verified manually 2026-05-11 via `apr run`). +/// +/// HumanEval prompts are raw Python code (with docstrings); we tokenize via +/// embedded BPE and pass via `InferenceConfig::with_input_tokens` to bypass +/// `prepare_tokens_apr`'s ChatML auto-wrap (which would wrap raw Python in +/// `<|im_start|>user...` causing degenerate output). #[cfg(feature = "inference")] fn run_humaneval_inference( model_path: &Path, @@ -281,27 +294,19 @@ fn run_humaneval_inference( _k_values: &[usize], json_output: bool, ) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> { - use realizar::apr_transformer::AprKVCache; + use realizar::{run_inference, InferenceConfig}; if !json_output { println!(" {} Loading model for inference...", "→".dimmed()); } - let transformer = load_humaneval_model(model_path)?; let tokenizer = load_humaneval_tokenizer(model_path, json_output)?; if !json_output { - println!( - " {} Model loaded ({} layers, vocab={})", - "✓".green(), - transformer.config.num_layers, - transformer.config.vocab_size - ); + println!(" {} Tokenizer loaded", "✓".green()); } let mut passed = 0usize; let mut results = Vec::new(); - let temperature = 0.0f32; - let mut rng_state: u64 = 42; for (i, problem) in problems.iter().enumerate() { let entry = problem @@ -316,42 +321,54 @@ fn run_humaneval_inference( continue; } - // Generate completion (greedy, max 256 tokens) - let mut cache = AprKVCache::new(&transformer.config); - let mut tokens = prompt_tokens.clone(); - - for (pos, &tok) in prompt_tokens.iter().enumerate() { - let _ = transformer.forward_with_cache(tok, &mut cache, pos); - } - - let max_new = 256; - for step in 0..max_new { - let pos = prompt_tokens.len() + step; - let last_tok = *tokens.last().expect("last("); - let logits = transformer - .forward_with_cache(last_tok, &mut cache, pos) - .map_err(|e| format!("Generation failed: {e}"))?; - - let next = sample_token(&logits, temperature, &mut rng_state); - tokens.push(next); - - if next == 0 { - break; - } - if let Some(eos) = transformer.config.eos_token_id { - if next == eos { - break; + // Generate completion via run_inference (greedy, max 256 tokens). + // `with_input_tokens` bypasses `prepare_tokens_apr`'s ChatML auto-wrap + // — HumanEval prompts are raw Python and must NOT be wrapped. + let config = InferenceConfig::new(model_path) + .with_input_tokens(prompt_tokens.clone()) + .with_max_tokens(256) + .with_temperature(0.0) + .with_top_k(1); + + let result = match run_inference(&config) { + Ok(r) => r, + Err(e) => { + if !json_output { + eprintln!(" [FAIL] {} ({}): inference error: {e}", problem.task_id, entry); } - } - } + results.push((problem.task_id.clone(), entry.to_string(), false)); + continue; + }, + }; - let completion_tokens = &tokens[prompt_tokens.len()..]; - let completion = tokenizer.decode(completion_tokens); + // run_inference's `result.text` is the FULL decoded sequence + // (prompt + completion). Slicing by the prompt string preserves + // exact byte boundaries — slicing by tokens introduces a leading- + // whitespace artifact when the prompt ends with `\n` and the + // first generated token decodes as a leading-space-prefixed run. + let completion = if let Some(stripped) = result.text.strip_prefix(&problem.prompt) { + stripped.to_string() + } else { + // Fallback: token-level slicing if text doesn't begin with the + // prompt verbatim (e.g., tokenizer-specific whitespace handling). + let completion_tokens = if result.tokens.len() > result.input_token_count { + &result.tokens[result.input_token_count..] + } else { + &result.tokens[..] + }; + tokenizer.decode(completion_tokens) + }; let completion = truncate_at_function_boundary(&completion); + // PMAT-CODE-SHIP-005-WHITESPACE-RESIDUAL: BPE raw-continuation + // can emit a 1-space over-indent at the prompt-completion boundary. + // Align to the prompt's last non-empty line's indent before + // concatenation — invalid-Python IndentationError otherwise. + let aligned = align_continuation_indent(&problem.prompt, completion); + let full_program = format!( "{}{}\n\n{}\n\ncheck({})\n", - problem.prompt, completion, problem.test, entry + problem.prompt, aligned, problem.test, entry ); let ok = execute_python_test(&full_program, 10); @@ -556,6 +573,139 @@ pub(super) fn truncate_at_function_boundary(completion: &str) -> &str { completion } +/// PMAT-CODE-SHIP-005-WHITESPACE-RESIDUAL: normalise raw-continuation indent. +/// +/// HumanEval prompts end with ` """\n` (4-space-indented docstring close); +/// the function body should continue at 4-space indent. On `apr eval --task +/// humaneval` raw-continuation path, the model emits 5-space leading indent +/// (BPE tokenization artifact at the prompt-completion boundary). The +/// resulting concatenation ` """\n for i in...` is invalid Python +/// (IndentationError). +/// +/// Manual `apr run` on the same model with auto-wrap produces correct +/// 4-space; the bug is raw-continuation-specific. +/// +/// Fix: detect the prompt's expected continuation indent (last non-empty +/// line's leading-space count) vs the completion's first non-empty line +/// indent; if completion is over-indented, dedent every line by the +/// excess. Only over-indented completions are touched (no risk to +/// correctly-aligned outputs). +/// +/// Lines without sufficient leading whitespace (blank lines or top-level +/// code) are left untouched. +pub(super) fn align_continuation_indent(prompt: &str, completion: &str) -> String { + let expected_indent = prompt + .lines() + .rev() + .find(|l| !l.trim().is_empty()) + .map(|l| l.chars().take_while(|c| *c == ' ').count()) + .unwrap_or(0); + + let actual_indent = completion + .lines() + .find(|l| !l.trim().is_empty()) + .map(|l| l.chars().take_while(|c| *c == ' ').count()) + .unwrap_or(0); + + if actual_indent <= expected_indent { + return completion.to_string(); + } + + let excess = actual_indent - expected_indent; + let prefix = " ".repeat(excess); + + // Dedent only the function-body chunk — stop at the first non-empty + // line that drops to indent 0 (signaling we've exited the function + // scope; e.g., `if __name__ == "__main__":` post-amble). Top-level + // code at indent < `excess` must be preserved as-is. + let mut in_body = true; + completion + .split_inclusive('\n') + .map(|line| { + let trimmed = line.trim_start_matches(' ').trim_end_matches('\n'); + // Track scope transition: once we see a non-empty 0-indent line, + // we're past the function body — leave all subsequent lines alone. + if in_body && !trimmed.is_empty() { + let leading = line.chars().take_while(|c| *c == ' ').count(); + if leading == 0 { + in_body = false; + } + } + if in_body && line.starts_with(&prefix) { + line[excess..].to_string() + } else { + line.to_string() + } + }) + .collect() +} + +#[cfg(test)] +mod align_indent_tests { + use super::align_continuation_indent; + + /// Pre-fix HumanEval/0 reproduction: 5-space body indent should + /// dedent to 4-space, with relative inner nesting preserved. + #[test] + fn dedents_one_excess_space() { + let prompt = "def f(x: int) -> int:\n \"\"\" doc.\n \"\"\"\n"; + let completion = " for i in range(x):\n if i > 0:\n return i\n return 0\n"; + let got = align_continuation_indent(prompt, completion); + let want = " for i in range(x):\n if i > 0:\n return i\n return 0\n"; + assert_eq!(got, want); + } + + /// Correctly-aligned completion is left unchanged. + #[test] + fn passthrough_when_already_correct() { + let prompt = "def f():\n \"\"\"doc\"\"\"\n"; + let completion = " return 42\n"; + let got = align_continuation_indent(prompt, completion); + assert_eq!(got, completion); + } + + /// Top-level code after the function body (e.g., `if __name__`) has 0 + /// leading spaces and must NOT be dedented (would crash on slice). + #[test] + fn leaves_zero_indent_lines_untouched() { + let prompt = "def f():\n \"\"\"doc\"\"\"\n"; + let completion = " return 1\n\n\nif __name__ == \"__main__\":\n pass\n"; + let got = align_continuation_indent(prompt, completion); + let want = " return 1\n\n\nif __name__ == \"__main__\":\n pass\n"; + assert_eq!(got, want); + } + + /// Multi-space excess (2+) is dedented uniformly. + #[test] + fn dedents_multi_space_excess() { + let prompt = " pass\n"; + let completion = " x = 1\n nested = 2\n"; + let got = align_continuation_indent(prompt, completion); + // expected = 4 (' pass' last line), actual = 8 → excess = 4 + let want = " x = 1\n nested = 2\n"; + assert_eq!(got, want); + } + + /// Empty completion is passthrough. + #[test] + fn empty_completion() { + let prompt = "def f():\n pass\n"; + let completion = ""; + let got = align_continuation_indent(prompt, completion); + assert_eq!(got, ""); + } + + /// Mutation-survey section: invariant under no-indent prompt + no-indent + /// completion (early-return guard). + #[test] + fn no_indent_anywhere() { + let prompt = "x = 1\n"; + let completion = "y = 2\n"; + let got = align_continuation_indent(prompt, completion); + assert_eq!(got, completion); + } +} + /// Execute a Python program and check if all assertions pass. /// Returns true if exit code is 0, false otherwise. /// Enforces a timeout to catch infinite loops (FALSIFY-EVAL-003).