Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 190 additions & 40 deletions crates/apr-cli/src/commands/eval/inference.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,34 +274,39 @@ fn load_humaneval_tokenizer(
}

/// ALB-084: Run HumanEval with actual model inference + Python test execution.
///
/// PMAT-CODE-SHIP-005-FIX (2026-05-11): routed through `realizar::run_inference`
/// + `OwnedQuantizedModel::from_apr` (the same working path that SHIP-002 +
/// SHIP-006 + SHIP-008 LIVE-discharged) instead of the legacy
/// `AprTransformer::forward_with_cache + AprKVCache` path. The legacy path
/// produced 0/3 pass@1 on canonical 7B teacher smoke test (every problem
/// FAIL); the run_inference path produces the canonical pairwise-comparison
/// solution for HumanEval/0 (verified manually 2026-05-11 via `apr run`).
///
/// HumanEval prompts are raw Python code (with docstrings); we tokenize via
/// embedded BPE and pass via `InferenceConfig::with_input_tokens` to bypass
/// `prepare_tokens_apr`'s ChatML auto-wrap (which would wrap raw Python in
/// `<|im_start|>user...` causing degenerate output).
#[cfg(feature = "inference")]
fn run_humaneval_inference(
model_path: &Path,
problems: &[HumanEvalProblem],
_k_values: &[usize],
json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
use realizar::apr_transformer::AprKVCache;
use realizar::{run_inference, InferenceConfig};

if !json_output {
println!(" {} Loading model for inference...", "→".dimmed());
}
let transformer = load_humaneval_model(model_path)?;
let tokenizer = load_humaneval_tokenizer(model_path, json_output)?;

if !json_output {
println!(
" {} Model loaded ({} layers, vocab={})",
"✓".green(),
transformer.config.num_layers,
transformer.config.vocab_size
);
println!(" {} Tokenizer loaded", "✓".green());
}

let mut passed = 0usize;
let mut results = Vec::new();
let temperature = 0.0f32;
let mut rng_state: u64 = 42;

for (i, problem) in problems.iter().enumerate() {
let entry = problem
Expand All @@ -316,42 +321,54 @@ fn run_humaneval_inference(
continue;
}

// Generate completion (greedy, max 256 tokens)
let mut cache = AprKVCache::new(&transformer.config);
let mut tokens = prompt_tokens.clone();

for (pos, &tok) in prompt_tokens.iter().enumerate() {
let _ = transformer.forward_with_cache(tok, &mut cache, pos);
}

let max_new = 256;
for step in 0..max_new {
let pos = prompt_tokens.len() + step;
let last_tok = *tokens.last().expect("last(");
let logits = transformer
.forward_with_cache(last_tok, &mut cache, pos)
.map_err(|e| format!("Generation failed: {e}"))?;

let next = sample_token(&logits, temperature, &mut rng_state);
tokens.push(next);

if next == 0 {
break;
}
if let Some(eos) = transformer.config.eos_token_id {
if next == eos {
break;
// Generate completion via run_inference (greedy, max 256 tokens).
// `with_input_tokens` bypasses `prepare_tokens_apr`'s ChatML auto-wrap
// — HumanEval prompts are raw Python and must NOT be wrapped.
let config = InferenceConfig::new(model_path)
.with_input_tokens(prompt_tokens.clone())
.with_max_tokens(256)
.with_temperature(0.0)
.with_top_k(1);

let result = match run_inference(&config) {
Ok(r) => r,
Err(e) => {
if !json_output {
eprintln!(" [FAIL] {} ({}): inference error: {e}", problem.task_id, entry);
}
}
}
results.push((problem.task_id.clone(), entry.to_string(), false));
continue;
},
};

let completion_tokens = &tokens[prompt_tokens.len()..];
let completion = tokenizer.decode(completion_tokens);
// run_inference's `result.text` is the FULL decoded sequence
// (prompt + completion). Slicing by the prompt string preserves
// exact byte boundaries — slicing by tokens introduces a leading-
// whitespace artifact when the prompt ends with `\n` and the
// first generated token decodes as a leading-space-prefixed run.
let completion = if let Some(stripped) = result.text.strip_prefix(&problem.prompt) {
stripped.to_string()
} else {
// Fallback: token-level slicing if text doesn't begin with the
// prompt verbatim (e.g., tokenizer-specific whitespace handling).
let completion_tokens = if result.tokens.len() > result.input_token_count {
&result.tokens[result.input_token_count..]
} else {
&result.tokens[..]
};
tokenizer.decode(completion_tokens)
};
let completion = truncate_at_function_boundary(&completion);

// PMAT-CODE-SHIP-005-WHITESPACE-RESIDUAL: BPE raw-continuation
// can emit a 1-space over-indent at the prompt-completion boundary.
// Align to the prompt's last non-empty line's indent before
// concatenation — invalid-Python IndentationError otherwise.
let aligned = align_continuation_indent(&problem.prompt, completion);

let full_program = format!(
"{}{}\n\n{}\n\ncheck({})\n",
problem.prompt, completion, problem.test, entry
problem.prompt, aligned, problem.test, entry
);

let ok = execute_python_test(&full_program, 10);
Expand Down Expand Up @@ -556,6 +573,139 @@ pub(super) fn truncate_at_function_boundary(completion: &str) -> &str {
completion
}

/// PMAT-CODE-SHIP-005-WHITESPACE-RESIDUAL: normalise raw-continuation indent.
///
/// HumanEval prompts end with `    """\n` (4-space-indented docstring close),
/// so the function body should continue at 4-space indent. On the
/// `apr eval --task humaneval` raw-continuation path the model can emit a
/// 5-space leading indent (BPE tokenization artifact at the
/// prompt-completion boundary). The resulting concatenation
/// `    """\n     for i in...` is invalid Python (IndentationError).
///
/// The expected continuation indent is the leading-space count of the
/// prompt's last non-blank line; the actual indent is the leading-space
/// count of the completion's first non-blank line. When the completion is
/// over-indented, every body line is dedented by the excess. Completions
/// that are already aligned (or under-indented) are returned unchanged.
///
/// Dedenting stops at the first non-blank line with zero leading spaces
/// (e.g. an `if __name__ == "__main__":` post-amble): that line and
/// everything after it are preserved verbatim. Lines with fewer leading
/// spaces than the excess are never modified.
///
/// A line counts as blank when it contains only whitespace. This includes
/// `"\r\n"` and tab-only lines, so CRLF line endings or stray tabs on an
/// otherwise empty line do not end the function body early.
///
/// Only ASCII spaces are counted as indentation; tabs are not expanded.
pub(super) fn align_continuation_indent(prompt: &str, completion: &str) -> String {
    // Number of leading ASCII spaces on `line`.
    fn leading_spaces(line: &str) -> usize {
        line.bytes().take_while(|&b| b == b' ').count()
    }

    let expected_indent = prompt
        .lines()
        .rev()
        .find(|l| !l.trim().is_empty())
        .map_or(0, leading_spaces);

    let actual_indent = completion
        .lines()
        .find(|l| !l.trim().is_empty())
        .map_or(0, leading_spaces);

    if actual_indent <= expected_indent {
        return completion.to_string();
    }

    let excess = actual_indent - expected_indent;

    // Dedent only the function-body chunk. `split_inclusive` keeps each
    // line's terminator attached, so the output is byte-identical apart
    // from the removed leading spaces.
    let mut in_body = true;
    let mut aligned = String::with_capacity(completion.len());
    for line in completion.split_inclusive('\n') {
        let indent = leading_spaces(line);
        // A non-blank line at indent 0 means we have left the function
        // scope. Blank detection uses `trim()` so whitespace-only lines
        // ("\r\n", "\t\n") do not trigger the transition.
        if in_body && indent == 0 && !line.trim().is_empty() {
            in_body = false;
        }
        // The first `excess` bytes are ASCII spaces whenever
        // `indent >= excess`, so slicing at `excess` is always on a char
        // boundary.
        let drop = if in_body && indent >= excess { excess } else { 0 };
        aligned.push_str(&line[drop..]);
    }
    aligned
}

#[cfg(test)]
mod align_indent_tests {
    //! Unit tests for `align_continuation_indent`.
    //!
    //! Indentation inside the string literals is significant: each case
    //! spells out the exact number of leading spaces so that the dedent
    //! path (not just the early-return guard) is exercised.

    use super::align_continuation_indent;

    /// Pre-fix HumanEval/0 reproduction: 5-space body indent should
    /// dedent to 4-space, with relative inner nesting preserved.
    #[test]
    fn dedents_one_excess_space() {
        let prompt = "def f(x: int) -> int:\n    \"\"\" doc.\n    \"\"\"\n";
        let completion =
            "     for i in range(x):\n         if i > 0:\n             return i\n     return 0\n";
        let got = align_continuation_indent(prompt, completion);
        let want =
            "    for i in range(x):\n        if i > 0:\n            return i\n    return 0\n";
        assert_eq!(got, want);
    }

    /// Correctly-aligned completion is left unchanged.
    #[test]
    fn passthrough_when_already_correct() {
        let prompt = "def f():\n    \"\"\"doc\"\"\"\n";
        let completion = "    return 42\n";
        let got = align_continuation_indent(prompt, completion);
        assert_eq!(got, completion);
    }

    /// Top-level code after the function body (e.g., `if __name__`) has 0
    /// leading spaces and must NOT be dedented (would crash on slice).
    /// Lines following it are outside the function body and stay as-is.
    #[test]
    fn leaves_zero_indent_lines_untouched() {
        let prompt = "def f():\n    \"\"\"doc\"\"\"\n";
        let completion = "     return 1\n\n\nif __name__ == \"__main__\":\n    pass\n";
        let got = align_continuation_indent(prompt, completion);
        let want = "    return 1\n\n\nif __name__ == \"__main__\":\n    pass\n";
        assert_eq!(got, want);
    }

    /// Multi-space excess (2+) is dedented uniformly.
    #[test]
    fn dedents_multi_space_excess() {
        let prompt = "    pass\n";
        let completion = "        x = 1\n            nested = 2\n";
        let got = align_continuation_indent(prompt, completion);
        // expected = 4 ('    pass' last line), actual = 8 → excess = 4
        let want = "    x = 1\n        nested = 2\n";
        assert_eq!(got, want);
    }

    /// Empty completion is passthrough.
    #[test]
    fn empty_completion() {
        let prompt = "def f():\n    pass\n";
        let completion = "";
        let got = align_continuation_indent(prompt, completion);
        assert_eq!(got, "");
    }

    /// Mutation-survey section: invariant under no-indent prompt + no-indent
    /// completion (early-return guard).
    #[test]
    fn no_indent_anywhere() {
        let prompt = "x = 1\n";
        let completion = "y = 2\n";
        let got = align_continuation_indent(prompt, completion);
        assert_eq!(got, completion);
    }
}

/// Execute a Python program and check if all assertions pass.
/// Returns true if exit code is 0, false otherwise.
/// Enforces a timeout to catch infinite loops (FALSIFY-EVAL-003).
Expand Down
Loading