Skip to content

Commit 6506d63

Browse files
JennyLiu-nvJenny Liu
andauthored
[None][test] Add DGX-Spark VLM gemma3-12b bf16/fp4/fp8 accuracy and perf cases (#11096)
Signed-off-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com> Co-authored-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
1 parent 29a203a commit 6506d63

File tree

6 files changed

+118
-37
lines changed

6 files changed

+118
-37
lines changed

tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@ google/gemma-3-27b-it:
66
- quant_algo: NVFP4
77
kv_cache_quant_algo: FP8
88
accuracy: 48.0
9+
google/gemma-3-12b-it:
10+
- accuracy: 50.44
11+
- quant_algo: FP8
12+
kv_cache_quant_algo: FP8
13+
accuracy: 49.0
14+
- quant_algo: NVFP4
15+
kv_cache_quant_algo: FP8
16+
accuracy: 50.11
917
Qwen/Qwen2-VL-7B-Instruct:
1018
- accuracy: 48.44
1119
Qwen/Qwen2.5-VL-7B-Instruct:

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,60 @@ def test_nvfp4_prequantized(self):
286286
task.evaluate(llm, sampling_params=self.sampling_params)
287287

288288

289+
@skip_pre_hopper
290+
class TestGemma3_12BInstruct(LlmapiAccuracyTestHarness):
291+
MODEL_NAME = "google/gemma-3-12b-it"
292+
MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-12b-it"
293+
MAX_NUM_TOKENS = 12800
294+
295+
sampling_params = SamplingParams(
296+
max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<end_of_turn>"
297+
)
298+
299+
# Gemma3 VLM needs KV cache reuse disabled for custom mask support.
300+
kv_cache_config = KvCacheConfig(
301+
enable_block_reuse=False,
302+
enable_partial_reuse=False,
303+
free_gpu_memory_fraction=0.6,
304+
)
305+
306+
kv_cache_config_fp8 = kv_cache_config.model_copy(update={"dtype": "fp8"})
307+
308+
def _make_llm(self, model_path: str, kv_cache_config: KvCacheConfig = None):
309+
# Gemma3 VLM needs FlashInfer attention backend for custom mask support.
310+
if kv_cache_config is None:
311+
kv_cache_config = self.kv_cache_config
312+
return LLM(
313+
model_path,
314+
max_batch_size=16,
315+
max_num_tokens=self.MAX_NUM_TOKENS,
316+
max_seq_len=8704, # 8192 + 512.
317+
kv_cache_config=kv_cache_config,
318+
attn_backend="FLASHINFER",
319+
enable_chunked_prefill=False,
320+
)
321+
322+
def test_auto_dtype(self):
323+
with self._make_llm(self.MODEL_PATH) as llm:
324+
task = MMMU(self.MODEL_NAME)
325+
task.evaluate(llm, sampling_params=self.sampling_params)
326+
327+
def test_fp8_prequantized(self):
328+
model_path = f"{llm_models_root()}/gemma/gemma-3-12b-it-fp8"
329+
with self._make_llm(model_path, self.kv_cache_config_fp8) as llm:
330+
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
331+
task = MMMU(self.MODEL_NAME)
332+
task.evaluate(llm, sampling_params=self.sampling_params)
333+
334+
@skip_pre_blackwell
335+
def test_nvfp4_prequantized(self):
336+
model_path = f"{llm_models_root()}/gemma/gemma-3-12b-it-fp4"
337+
with self._make_llm(model_path, self.kv_cache_config_fp8) as llm:
338+
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
339+
task = MMMU(self.MODEL_NAME)
340+
task.evaluate(llm, sampling_params=self.sampling_params)
341+
342+
289343
class TestQwen3VL_MOE(LlmapiAccuracyTestHarness):
290344
MODEL_NAME = "Qwen/Qwen3-VL-30B-A3B-Instruct"
291345
MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-30B-A3B-Instruct"

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,13 @@ def get_model_yaml_config(model_label: str,
355355
},
356356
'guided_decoding_backend': 'xgrammar'
357357
}
358+
},
359+
# Gemma3 models require FlashInfer backend due to sliding window attention
360+
{
361+
'patterns': ['gemma_3', 'gemma3'],
362+
'config': {
363+
'attn_backend': 'FLASHINFER',
364+
}
358365
}
359366
]
360367

tests/integration/defs/perf/test_perf.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@
102102
"gemma_3_27b_it": "gemma/gemma-3-27b-it",
103103
"gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8",
104104
"gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4",
105+
"gemma_3_12b_it": "gemma/gemma-3-12b-it",
106+
"gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8",
107+
"gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4",
105108
"deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
106109
"deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
107110
"deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
@@ -125,7 +128,7 @@
125128
"qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
126129
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
127130
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
128-
"qwen2_5_vl_7b_instruct": "multimodals/Qwen2.5-VL-7B-Instruct",
131+
"qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
129132
"qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
130133
"qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
131134
"starcoder2_3b": "starcoder2-3b",

tests/integration/test_lists/qa/llm_spark_func.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_fp8
4141
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_nvfp4
4242
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
4343
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_nvfp4_prequantized
44+
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_12BInstruct::test_auto_dtype
45+
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_12BInstruct::test_fp8_prequantized
46+
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_12BInstruct::test_nvfp4_prequantized
4447
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
4548
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
4649
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
Lines changed: 42 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,50 @@
11
version: 0.0.1
22
llm_spark_perf:
3+
# ===============================================================================
4+
# 1: Single GPU Spark perf cases
5+
# ===============================================================================
36
- condition:
47
ranges:
58
system_gpu_count:
69
gte: 1
710
lte: 1
811
tests:
9-
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
10-
# GPT-OSS 120B normal case (no spec dec)
11-
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
12-
# GPT-OSS 120B spec dec case (Eagle3)
13-
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1]
14-
- perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
15-
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
16-
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
17-
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
18-
- perf/test_perf.py::test_perf[qwen3_8b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
19-
- perf/test_perf.py::test_perf[qwen3_8b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
20-
- perf/test_perf.py::test_perf[qwen3_8b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
21-
- perf/test_perf.py::test_perf[qwen3_14b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
22-
- perf/test_perf.py::test_perf[qwen3_14b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
23-
- perf/test_perf.py::test_perf[qwen3_14b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
24-
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
25-
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
26-
- perf/test_perf.py::test_perf[qwen3_30b_a3b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
27-
- perf/test_perf.py::test_perf[qwen3_30b_a3b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
28-
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
29-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_v1.5_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
30-
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
31-
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
32-
- perf/test_perf.py::test_perf[phi_4_reasoning_plus-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
33-
- perf/test_perf.py::test_perf[qwen3_32b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
34-
- perf/test_perf.py::test_perf[qwen3_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
35-
- perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
36-
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
37-
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp4-bench-pytorch-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
38-
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
39-
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
40-
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
41-
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
42-
- perf/test_perf.py::test_perf[gemma_3_27b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
43-
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
44-
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
12+
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
13+
# GPT-OSS 120B normal case (no spec dec)
14+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
15+
# GPT-OSS 120B spec dec case (Eagle3)
16+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1]
17+
- perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
18+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
19+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
20+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
21+
- perf/test_perf.py::test_perf[qwen3_8b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
22+
- perf/test_perf.py::test_perf[qwen3_8b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
23+
- perf/test_perf.py::test_perf[qwen3_8b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
24+
- perf/test_perf.py::test_perf[qwen3_14b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
25+
- perf/test_perf.py::test_perf[qwen3_14b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
26+
- perf/test_perf.py::test_perf[qwen3_14b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
27+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
28+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
29+
- perf/test_perf.py::test_perf[qwen3_30b_a3b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
30+
- perf/test_perf.py::test_perf[qwen3_30b_a3b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
31+
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
32+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_v1.5_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
33+
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
34+
- perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
35+
- perf/test_perf.py::test_perf[phi_4_reasoning_plus-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
36+
- perf/test_perf.py::test_perf[qwen3_32b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
37+
- perf/test_perf.py::test_perf[qwen3_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
38+
- perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
39+
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
40+
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp4-bench-pytorch-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
41+
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
42+
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
43+
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
44+
- perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
45+
- perf/test_perf.py::test_perf[gemma_3_27b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
46+
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
47+
- perf/test_perf.py::test_perf[gemma_3_27b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
48+
- perf/test_perf.py::test_perf[gemma_3_12b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
49+
- perf/test_perf.py::test_perf[gemma_3_12b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
50+
- perf/test_perf.py::test_perf[gemma_3_12b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]

0 commit comments

Comments
 (0)