3 files changed: +27 −7 lines changed

```diff
@@ -422,8 +422,15 @@ def _test_apply_chat_template(
         self.assertEqual(len(out_dict["input_ids"]), batch_size)
         self.assertEqual(len(out_dict["attention_mask"]), batch_size)
 
-        video_len = 2880 if batch_size == 1 else 5808  # qwen pixels don't scale with bs same way as other models
-        mm_len = batch_size * 1564 if modality == "image" else video_len
+        if modality == "video":
+            # qwen pixels don't scale with bs the same way as other models; calculate the expected video token count based on video_grid_thw
+            # TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5OmniProcessor
+            expected_video_token_count = 0
+            for thw in out_dict["video_grid_thw"]:
+                expected_video_token_count += thw[0] * thw[1] * thw[2]
+            mm_len = expected_video_token_count
+        else:
+            mm_len = batch_size * 1564
         self.assertEqual(len(out_dict[input_name]), mm_len)
 
         return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
```
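For reviewers wondering why summing `thw[0] * thw[1] * thw[2]` is the right expectation: the processor flattens each video into one row per cell of its (temporal, height, width) patch grid, so the length of `out_dict[input_name]` should equal the sum of the grid products across the batch. A minimal sketch of the calculation with hypothetical grid values (the real grids depend on input resolution and the processor's resizing):

```python
# Hypothetical (t, h, w) grids for a batch of two videos; the real values
# come from out_dict["video_grid_thw"] after the processor resizes the input.
video_grid_thw = [(2, 12, 12), (2, 11, 12)]

# One multimodal entry per grid cell, summed over the batch.
expected_video_token_count = sum(t * h * w for t, h, w in video_grid_thw)
print(expected_video_token_count)  # 2*12*12 + 2*11*12 == 552
```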
```diff
@@ -239,8 +239,15 @@ def _test_apply_chat_template(
         self.assertEqual(len(out_dict["input_ids"]), batch_size)
         self.assertEqual(len(out_dict["attention_mask"]), batch_size)
 
-        video_len = 180 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
-        mm_len = batch_size * 192 if modality == "image" else video_len
+        if modality == "video":
+            # qwen pixels don't scale with bs the same way as other models; calculate the expected video token count based on video_grid_thw
+            # TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5VLProcessor
+            expected_video_token_count = 0
+            for thw in out_dict["video_grid_thw"]:
+                expected_video_token_count += thw[0] * thw[1] * thw[2]
+            mm_len = expected_video_token_count
+        else:
+            mm_len = batch_size * 192
         self.assertEqual(len(out_dict[input_name]), mm_len)
 
         return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
```
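The same nine added lines now appear in all three test files. One way to cut the duplication (a sketch only, not part of this diff; the helper name and placement are made up) would be a small shared helper on the processor test mixin:

```python
# Hypothetical shared helper, not in the diff: computes the expected
# multimodal sequence length from the processor output.
def _expected_mm_len(out_dict, modality, batch_size, image_len):
    if modality == "video":
        # Sum t * h * w over every video grid in the batch.
        return sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"])
    return batch_size * image_len
```

Each test would then call it with its model-specific `image_len` (1564 for Qwen2.5-Omni, 192 for Qwen2-VL and Qwen2.5-VL).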
```diff
@@ -239,9 +239,15 @@ def _test_apply_chat_template(
         self.assertTrue(input_name in out_dict)
         self.assertEqual(len(out_dict["input_ids"]), batch_size)
         self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-
-        video_len = 180 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
-        mm_len = batch_size * 192 if modality == "image" else video_len
+        if modality == "video":
+            # qwen pixels don't scale with bs the same way as other models; calculate the expected video token count based on video_grid_thw
+            # TODO: update expected video token count calculation based on the internal processing logic of Qwen2VLProcessor
+            expected_video_token_count = 0
+            for thw in out_dict["video_grid_thw"]:
+                expected_video_token_count += thw[0] * thw[1] * thw[2]
+            mm_len = expected_video_token_count
+        else:
+            mm_len = batch_size * 192
         self.assertEqual(len(out_dict[input_name]), mm_len)
 
         return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
```
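Assuming `video_grid_thw` comes back as a `(num_videos, 3)` tensor when `return_tensors="pt"` is requested, the accumulation loop also has a one-line vectorized equivalent; a sketch with made-up grid values:

```python
import torch

# Made-up (t, h, w) grids for two videos; real values come from the processor.
video_grid_thw = torch.tensor([[2, 12, 12], [2, 11, 12]])

# Product over the last dim gives tokens per video; the sum gives the batch total.
mm_len = int(video_grid_thw.prod(dim=-1).sum())
print(mm_len)  # 552
```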