Skip to content

Commit 6506d63

Browse files
JennyLiu-nvJenny Liu
andauthored
[None][test] Add DGX-Spark VLM gemma3-12b bf16/fp4/fp8 accuracy and perf cases (#11096)
Signed-off-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com> Co-authored-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
1 parent 29a203a commit 6506d63

File tree

6 files changed

+118
-37
lines changed

6 files changed

+118
-37
lines changed

tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@ google/gemma-3-27b-it:
66
- quant_algo: NVFP4
77
kv_cache_quant_algo: FP8
88
accuracy: 48.0
9+
google/gemma-3-12b-it:
10+
- accuracy: 50.44
11+
- quant_algo: FP8
12+
kv_cache_quant_algo: FP8
13+
accuracy: 49.0
14+
- quant_algo: NVFP4
15+
kv_cache_quant_algo: FP8
16+
accuracy: 50.11
917
Qwen/Qwen2-VL-7B-Instruct:
1018
- accuracy: 48.44
1119
Qwen/Qwen2.5-VL-7B-Instruct:

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,60 @@ def test_nvfp4_prequantized(self):
286286
task.evaluate(llm, sampling_params=self.sampling_params)
287287

288288

289+
@skip_pre_hopper
290+
class TestGemma3_12BInstruct(LlmapiAccuracyTestHarness):
291+
MODEL_NAME = "google/gemma-3-12b-it"
292+
MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-12b-it"
293+
MAX_NUM_TOKENS = 12800
294+
295+
sampling_params = SamplingParams(
296+
max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<end_of_turn>"
297+
)
298+
299+
# Gemma3 VLM needs KV cache reuse disabled for custom mask support.
300+
kv_cache_config = KvCacheConfig(
301+
enable_block_reuse=False,
302+
enable_partial_reuse=False,
303+
free_gpu_memory_fraction=0.6,
304+
)
305+
306+
kv_cache_config_fp8 = kv_cache_config.model_copy(update={"dtype": "fp8"})
307+
308+
def _make_llm(self, model_path: str, kv_cache_config: KvCacheConfig = None):
309+
# Gemma3 VLM needs FlashInfer attention backend for custom mask support.
310+
if kv_cache_config is None:
311+
kv_cache_config = self.kv_cache_config
312+
return LLM(
313+
model_path,
314+
max_batch_size=16,
315+
max_num_tokens=self.MAX_NUM_TOKENS,
316+
max_seq_len=8704, # 8192 + 512.
317+
kv_cache_config=kv_cache_config,
318+
attn_backend="FLASHINFER",
319+
enable_chunked_prefill=False,
320+
)
321+
322+
def test_auto_dtype(self):
323+
with self._make_llm(self.MODEL_PATH) as llm:
324+
task = MMMU(self.MODEL_NAME)
325+
task.evaluate(llm, sampling_params=self.sampling_params)
326+
327+
def test_fp8_prequantized(self):
328+
model_path = f"{llm_models_root()}/gemma/gemma-3-12b-it-fp8"
329+
with self._make_llm(model_path, self.kv_cache_config_fp8) as llm:
330+
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
331+
task = MMMU(self.MODEL_NAME)
332+
task.evaluate(llm, sampling_params=self.sampling_params)
333+
334+
@skip_pre_blackwell
335+
def test_nvfp4_prequantized(self):
336+
model_path = f"{llm_models_root()}/gemma/gemma-3-12b-it-fp4"
337+
with self._make_llm(model_path, self.kv_cache_config_fp8) as llm:
338+
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
339+
task = MMMU(self.MODEL_NAME)
340+
task.evaluate(llm, sampling_params=self.sampling_params)
341+
342+
289343
class TestQwen3VL_MOE(LlmapiAccuracyTestHarness):
290344
MODEL_NAME = "Qwen/Qwen3-VL-30B-A3B-Instruct"
291345
MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-30B-A3B-Instruct"

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,13 @@ def get_model_yaml_config(model_label: str,
355355
},
356356
'guided_decoding_backend': 'xgrammar'
357357
}
358+
},
359+
# Gemma3 models require FlashInfer backend due to sliding window attention
360+
{
361+
'patterns': ['gemma_3', 'gemma3'],
362+
'config': {
363+
'attn_backend': 'FLASHINFER',
364+
}
358365
}
359366
]
360367

tests/integration/defs/perf/test_perf.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@
102102
"gemma_3_27b_it": "gemma/gemma-3-27b-it",
103103
"gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8",
104104
"gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4",
105+
"gemma_3_12b_it": "gemma/gemma-3-12b-it",
106+
"gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8",
107+
"gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4",
105108
"deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
106109
"deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
107110
"deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
@@ -125,7 +128,7 @@
125128
"qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
126129
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
127130
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
128-
"qwen2_5_vl_7b_instruct": "multimodals/Qwen2.5-VL-7B-Instruct",
131+
"qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
129132
"qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
130133
"qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
131134
"starcoder2_3b": "starcoder2-3b",

tests/integration/test_lists/qa/llm_spark_func.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_fp8
4141
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_nvfp4
4242
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
4343
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_nvfp4_prequantized
44+
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_12BInstruct::test_auto_dtype
45+
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_12BInstruct::test_fp8_prequantized
46+
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_12BInstruct::test_nvfp4_prequantized
4447
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
4548
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
4649
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
Lines changed: 42 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,50 @@
11
version: 0.0.1
22
llm_spark_perf:
3+
# ===============================================================================
4+
# 1: Single GPU Spark perf cases
5+
# ===============================================================================
36
- condition:
47
ranges:
58
system_gpu_count:
69
gte: 1
710
lte: 1
811
tests:
9-
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
10-
# GPT-OSS 120B normal case (no spec dec)
11-
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
12-
# GPT-OSS 120B spec dec case (Eagle3)
13-
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1]
14-
- perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
15-
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
16-
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
17-
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
18-
- perf/test_perf.py::test_perf[qwen3_8b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
19-
- perf/test_perf.py::test_perf[qwen3_8b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
20-
- perf/test_perf.py::test_perf[qwen3_8b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
21-
- perf/test_perf.py::test_perf[qwen3_14b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
22-
- perf/test_perf.py::test_perf[qwen3_14b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
23-
- perf/test_perf.py::test_perf[qwen3_14b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
24-
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
25-
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
26-
- perf/test_perf.py::test_perf[qwen3_30b_a3b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
27-
- perf/test_perf.py::test_perf[qwen3_30b_a3b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
28-
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
29-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_v1.5_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
30-
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
31-
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
32-
- perf/test_perf.py::test_perf[phi_4_reasoning_plus-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
33-
- perf/test_perf.py::test_perf[qwen3_32b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
34-
- perf/test_perf.py::test_perf[qwen3_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
35-
- perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
36-
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
37-
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp4-bench-pytorch-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
38-
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
39-
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
40-
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
41-
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
42-
- perf/test_perf.py::test_perf[gemma_3_27b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
43-
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
44-
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
12+
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
13+
# GPT-OSS 120B normal case (no spec dec)
14+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
15+
# GPT-OSS 120B spec dec case (Eagle3)
16+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1]
17+
- perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
18+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
19+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
20+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
21+
- perf/test_perf.py::test_perf[qwen3_8b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
22+
- perf/test_perf.py::test_perf[qwen3_8b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
23+
- perf/test_perf.py::test_perf[qwen3_8b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
24+
- perf/test_perf.py::test_perf[qwen3_14b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
25+
- perf/test_perf.py::test_perf[qwen3_14b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
26+
- perf/test_perf.py::test_perf[qwen3_14b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
27+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
28+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
29+
- perf/test_perf.py::test_perf[qwen3_30b_a3b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
30+
- perf/test_perf.py::test_perf[qwen3_30b_a3b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
31+
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
32+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_v1.5_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
33+
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
34+
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
35+
- perf/test_perf.py::test_perf[phi_4_reasoning_plus-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
36+
- perf/test_perf.py::test_perf[qwen3_32b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
37+
- perf/test_perf.py::test_perf[qwen3_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
38+
- perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
39+
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
40+
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp4-bench-pytorch-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
41+
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
42+
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
43+
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
44+
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
45+
- perf/test_perf.py::test_perf[gemma_3_27b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
46+
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
47+
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
48+
- perf/test_perf.py::test_perf[gemma_3_12b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
49+
- perf/test_perf.py::test_perf[gemma_3_12b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
50+
- perf/test_perf.py::test_perf[gemma_3_12b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]

0 commit comments

Comments
 (0)