paiml · noahgift · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/crates/apr-cli/src/commands/eval/inference.rs b/crates/apr-cli/src/commands/eval/inference.rs
@@ -274,34 +274,39 @@ fn load_humaneval_tokenizer(
 }
 
 /// ALB-084: Run HumanEval with actual model inference + Python test execution.
+///
+/// PMAT-CODE-SHIP-005-FIX (2026-05-11): routed through `realizar::run_inference`
+/// + `OwnedQuantizedModel::from_apr` (the same working path that SHIP-002 +
+/// SHIP-006 + SHIP-008 LIVE-discharged) instead of the legacy
+/// `AprTransformer::forward_with_cache + AprKVCache` path. The legacy path
+/// produced 0/3 pass@1 on canonical 7B teacher smoke test (every problem
+/// FAIL); the run_inference path produces the canonical pairwise-comparison
+/// solution for HumanEval/0 (verified manually 2026-05-11 via `apr run`).
+///
+/// HumanEval prompts are raw Python code (with docstrings); we tokenize via
+/// embedded BPE and pass via `InferenceConfig::with_input_tokens` to bypass
+/// `prepare_tokens_apr`'s ChatML auto-wrap (which would wrap raw Python in
+/// `<|im_start|>user...` causing degenerate output).
 #[cfg(feature = "inference")]
 fn run_humaneval_inference(
     model_path: &Path,
     problems: &[HumanEvalProblem],
     _k_values: &[usize],
     json_output: bool,
 ) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
-    use realizar::apr_transformer::AprKVCache;
+    use realizar::{run_inference, InferenceConfig};
 
     if !json_output {
         println!("  {} Loading model for inference...", "→".dimmed());
     }
-    let transformer = load_humaneval_model(model_path)?;
     let tokenizer = load_humaneval_tokenizer(model_path, json_output)?;
 
     if !json_output {
-        println!(
-            "  {} Model loaded ({} layers, vocab={})",
-            "✓".green(),
-            transformer.config.num_layers,
-            transformer.config.vocab_size
-        );
+        println!("  {} Tokenizer loaded", "✓".green());
     }
 
     let mut passed = 0usize;
     let mut results = Vec::new();
-    let temperature = 0.0f32;
-    let mut rng_state: u64 = 42;
 
     for (i, problem) in problems.iter().enumerate() {
         let entry = problem
@@ -316,42 +321,54 @@ fn run_humaneval_inference(
             continue;
         }
 
-        // Generate completion (greedy, max 256 tokens)
-        let mut cache = AprKVCache::new(&transformer.config);
-        let mut tokens = prompt_tokens.clone();
-
-        for (pos, &tok) in prompt_tokens.iter().enumerate() {
-            let _ = transformer.forward_with_cache(tok, &mut cache, pos);
-        }
-
-        let max_new = 256;
-        for step in 0..max_new {
-            let pos = prompt_tokens.len() + step;
-            let last_tok = *tokens.last().expect("last(");
-            let logits = transformer
-                .forward_with_cache(last_tok, &mut cache, pos)
-                .map_err(|e| format!("Generation failed: {e}"))?;
-
-            let next = sample_token(&logits, temperature, &mut rng_state);
-            tokens.push(next);
-
-            if next == 0 {
-                break;
-            }
-            if let Some(eos) = transformer.config.eos_token_id {
-                if next == eos {
-                    break;
+        // Generate completion via run_inference (greedy, max 256 tokens).
+        // `with_input_tokens` bypasses `prepare_tokens_apr`'s ChatML auto-wrap
+        // — HumanEval prompts are raw Python and must NOT be wrapped.
+        let config = InferenceConfig::new(model_path)
+            .with_input_tokens(prompt_tokens.clone())
+            .with_max_tokens(256)
+            .with_temperature(0.0)
+            .with_top_k(1);
+
+        let result = match run_inference(&config) {
+            Ok(r) => r,
+            Err(e) => {
+                if !json_output {
+                    eprintln!("  [FAIL] {} ({}): inference error: {e}", problem.task_id, entry);
                 }
-            }
-        }
+                results.push((problem.task_id.clone(), entry.to_string(), false));
+                continue;
+            },
+        };
 
-        let completion_tokens = &tokens[prompt_tokens.len()..];
-        let completion = tokenizer.decode(completion_tokens);
+        // run_inference's `result.text` is the FULL decoded sequence
+        // (prompt + completion). Slicing by the prompt string preserves
+        // exact byte boundaries — slicing by tokens introduces a leading-
+        // whitespace artifact when the prompt ends with `\n` and the
+        // first generated token decodes as a leading-space-prefixed run.
+        let completion = if let Some(stripped) = result.text.strip_prefix(&problem.prompt) {
+            stripped.to_string()
+        } else {
+            // Fallback: token-level slicing if text doesn't begin with the
+            // prompt verbatim (e.g., tokenizer-specific whitespace handling).
+            let completion_tokens = if result.tokens.len() > result.input_token_count {
+                &result.tokens[result.input_token_count..]
+            } else {
+                &result.tokens[..]
+            };
+            tokenizer.decode(completion_tokens)
+        };
         let completion = truncate_at_function_boundary(&completion);
 
+        // PMAT-CODE-SHIP-005-WHITESPACE-RESIDUAL: BPE raw-continuation
+        // can emit a 1-space over-indent at the prompt-completion boundary.
+        // Align to the prompt's last non-empty line's indent before
+        // concatenation — invalid-Python IndentationError otherwise.
+        let aligned = align_continuation_indent(&problem.prompt, completion);
+
         let full_program = format!(
             "{}{}\n\n{}\n\ncheck({})\n",
-            problem.prompt, completion, problem.test, entry
+            problem.prompt, aligned, problem.test, entry
         );
 
         let ok = execute_python_test(&full_program, 10);
@@ -556,6 +573,139 @@ pub(super) fn truncate_at_function_boundary(completion: &str) -> &str {
     completion
 }
 
+/// PMAT-CODE-SHIP-005-WHITESPACE-RESIDUAL: normalise raw-continuation indent.
+///
+/// HumanEval prompts end with `    """\n` (4-space-indented docstring close);
+/// the function body should continue at 4-space indent. On the `apr eval
+/// --task humaneval` raw-continuation path, the model emits a 5-space leading
+/// indent (BPE tokenization artifact at the prompt-completion boundary). The
+/// resulting concatenation `    """\n     for i in...` is invalid Python
+/// (IndentationError).
+///
+/// Manual `apr run` on the same model with auto-wrap produces correct
+/// 4-space; the bug is raw-continuation-specific.
+///
+/// Fix: compare the prompt's expected continuation indent (leading-space
+/// count of its last non-blank line) with the leading-space count of the
+/// completion's first non-blank line. If the completion is over-indented,
+/// strip the excess from each line until the first non-empty line at
+/// indent 0; that line and everything after it are returned unchanged.
+///
+/// Only spaces are counted (tabs are not). Lines that do not start with
+/// the full excess (blank lines, shallower code) are left untouched.
+pub(super) fn align_continuation_indent(prompt: &str, completion: &str) -> String {
+    let expected_indent = prompt
+        .lines()
+        .rev()
+        .find(|l| !l.trim().is_empty())
+        .map(|l| l.chars().take_while(|c| *c == ' ').count())
+        .unwrap_or(0);
+
+    let actual_indent = completion
+        .lines()
+        .find(|l| !l.trim().is_empty())
+        .map(|l| l.chars().take_while(|c| *c == ' ').count())
+        .unwrap_or(0);
+
+    if actual_indent <= expected_indent {
+        return completion.to_string();
+    }
+
+    let excess = actual_indent - expected_indent;
+    let prefix = " ".repeat(excess);
+
+    // Dedent only the function-body chunk. `in_body` is a one-way latch: it
+    // turns false at the first non-empty line at indent 0 (we have left the
+    // function scope; e.g., an `if __name__ == "__main__":` post-amble) and
+    // never turns back, so all later lines are preserved as-is.
+    let mut in_body = true;
+    completion
+        .split_inclusive('\n')
+        .map(|line| {
+            let trimmed = line.trim_start_matches(' ').trim_end_matches('\n');
+            // Scope transition: a line with content (anything besides spaces and
+            // the trailing `\n`) at column 0 ends the body for all later lines.
+            if in_body && !trimmed.is_empty() {
+                let leading = line.chars().take_while(|c| *c == ' ').count();
+                if leading == 0 {
+                    in_body = false;
+                }
+            }
+            if in_body && line.starts_with(&prefix) {
+                line[excess..].to_string() // in bounds: line starts with `excess` spaces
+            } else {
+                line.to_string()
+            }
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod align_indent_tests {
+    use super::align_continuation_indent;
+
+    /// Pre-fix HumanEval/0 reproduction: a 5-space body indent is dedented
+    /// to 4 spaces, and the relative nesting of inner lines is preserved.
+    #[test]
+    fn dedents_one_excess_space() {
+        let prompt = "def f(x: int) -> int:\n    \"\"\" doc.\n    \"\"\"\n";
+        let completion = "     for i in range(x):\n         if i > 0:\n             return i\n     return 0\n";
+        let got = align_continuation_indent(prompt, completion);
+        let want = "    for i in range(x):\n        if i > 0:\n            return i\n    return 0\n";
+        assert_eq!(got, want);
+    }
+
+    /// A completion already at the prompt's indent is returned unchanged.
+    #[test]
+    fn passthrough_when_already_correct() {
+        let prompt = "def f():\n    \"\"\"doc\"\"\"\n";
+        let completion = "    return 42\n";
+        let got = align_continuation_indent(prompt, completion);
+        assert_eq!(got, completion);
+    }
+
+    /// Dedenting stops at the first 0-indent line (`if __name__ ...`): that
+    /// line and the indented `    pass` after it must come back unchanged.
+    #[test]
+    fn leaves_zero_indent_lines_untouched() {
+        let prompt = "def f():\n    \"\"\"doc\"\"\"\n";
+        let completion = "     return 1\n\n\nif __name__ == \"__main__\":\n    pass\n";
+        let got = align_continuation_indent(prompt, completion);
+        let want = "    return 1\n\n\nif __name__ == \"__main__\":\n    pass\n";
+        assert_eq!(got, want);
+    }
+
+    /// An excess of more than one space is removed uniformly from each line.
+    #[test]
+    fn dedents_multi_space_excess() {
+        let prompt = "    pass\n";
+        let completion = "        x = 1\n            nested = 2\n";
+        let got = align_continuation_indent(prompt, completion);
+        // expected = 4 (last prompt line '    pass'), actual = 8 → excess = 4
+        let want = "    x = 1\n        nested = 2\n";
+        assert_eq!(got, want);
+    }
+
+    /// Empty completion: no non-blank line → indent 0 → returned unchanged.
+    #[test]
+    fn empty_completion() {
+        let prompt = "def f():\n    pass\n";
+        let completion = "";
+        let got = align_continuation_indent(prompt, completion);
+        assert_eq!(got, "");
+    }
+
+    /// Mutation-survey case: a zero-indent prompt with a zero-indent completion
+    /// takes the early-return guard, so the completion is returned unchanged.
+    #[test]
+    fn no_indent_anywhere() {
+        let prompt = "x = 1\n";
+        let completion = "y = 2\n";
+        let got = align_continuation_indent(prompt, completion);
+        assert_eq!(got, completion);
+    }
+}
+
 /// Execute a Python program and check if all assertions pass.
 /// Returns true if exit code is 0, false otherwise.
 /// Enforces a timeout to catch infinite loops (FALSIFY-EVAL-003).