Commit 851ccc9
Parent: 6d9cbdd

add llama4 evals

7 files changed: 49 additions, 2 deletions
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V3 -b 32 -l 250 -f 8
+model_name: "deepseek-ai/DeepSeek-V3"
+backend: "vllm"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.893
+  - name: "exact_match,flexible-extract"
+    value: 0.893
+limit: 50
+num_fewshot: 8
+trust_remote_code: True
+# TODO(zhewenl): we should increase batch_size and seq_len when we have MI300X or other large GPUs.
+max_model_len: 1024
+batch_size: 1
+gpu_memory_utilization: 0.98
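
For context, a minimal sketch (not part of this commit) of how a config like the one above is typically consumed: load the YAML, run the eval through launch_lm_eval (the helper updated in test_lm_eval_correctness.py further down), and check each measured metric against its expected value within a tolerance. The config filename, tp_size, and RTOL values here are assumptions, not values from this commit.

import numpy as np
import yaml

from test_lm_eval_correctness import launch_lm_eval  # helper shown below

RTOL = 0.05  # assumed relative tolerance for metric comparison

with open("DeepSeek-V3.yaml") as f:  # hypothetical config filename
    eval_config = yaml.safe_load(f)

results = launch_lm_eval(eval_config, tp_size=8)  # tp_size assumed

# lm_eval.simple_evaluate returns {"results": {task: {metric: value, ...}}}
for task in eval_config["tasks"]:
    for metric in task["metrics"]:
        measured = results["results"][task["name"]][metric["name"]]
        print(f"{task['name']} {metric['name']}: "
              f"expected={metric['value']:.3f} measured={measured:.3f}")
        assert np.isclose(metric["value"], measured, rtol=RTOL)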
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# For vllm-vlm script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -f 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.853
+limit: 100
+num_fewshot: 0
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -f 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.94
+  - name: "exact_match,flexible-extract"
+    value: 0.94
+limit: 250
+num_fewshot: 8
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh

File mode changed: 100644 → 100755

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 5 additions & 2 deletions
@@ -19,25 +19,28 @@
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    gpu_memory_utilization = eval_config.get("gpu_memory_utilization", 1.0)
+    batch_size = eval_config.get("batch_size", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
+        f"gpu_memory_utilization={gpu_memory_utilization}"
     )
     results = lm_eval.simple_evaluate(
         model=eval_config["backend"],
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
         # text models. however, this is regressing measured strict-match for
         # existing text models in CI, so only apply it for mm.
         apply_chat_template=eval_config["backend"] == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
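As a usage note, a hedged sketch (not part of the commit) of how the two new config keys flow through the updated helper: with the DeepSeek-V3 config above, batch_size=1 and gpu_memory_utilization=0.98 reach the vLLM engine via the model_args string, while configs that omit them fall back to "auto" and 1.0. The tp_size value is an assumption.

# Values mirror the DeepSeek-V3 config added in this commit.
eval_config = {
    "model_name": "deepseek-ai/DeepSeek-V3",
    "backend": "vllm",
    "tasks": [{"name": "gsm8k",
               "metrics": [{"name": "exact_match,strict-match", "value": 0.893},
                           {"name": "exact_match,flexible-extract", "value": 0.893}]}],
    "limit": 50,
    "num_fewshot": 8,
    "trust_remote_code": True,
    "max_model_len": 1024,
    "batch_size": 1,
    "gpu_memory_utilization": 0.98,
}
results = launch_lm_eval(eval_config, tp_size=8)  # tp_size assumed
print(results["results"]["gsm8k"]["exact_match,strict-match"])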
