3 files changed: +27 −7 lines changed

```diff
@@ -422,8 +422,15 @@ def _test_apply_chat_template(
         self.assertEqual(len(out_dict["input_ids"]), batch_size)
         self.assertEqual(len(out_dict["attention_mask"]), batch_size)
 
-        video_len = 2880 if batch_size == 1 else 5808  # qwen pixels don't scale with bs same way as other models
-        mm_len = batch_size * 1564 if modality == "image" else video_len
+        if modality == "video":
+            # qwen pixels don't scale with bs the same way as other models; calculate the expected video token count based on video_grid_thw
+            # TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5OmniProcessor
+            expected_video_token_count = 0
+            for thw in out_dict["video_grid_thw"]:
+                expected_video_token_count += thw[0] * thw[1] * thw[2]
+            mm_len = expected_video_token_count
+        else:
+            mm_len = batch_size * 1564
         self.assertEqual(len(out_dict[input_name]), mm_len)
 
         return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
```
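For reviewers wondering why summing `thw[0] * thw[1] * thw[2]` is the right expectation: the processor flattens each video into one row per cell of its (temporal, height, width) patch grid, so the length of `out_dict[input_name]` should equal the sum of the grid products across the batch. A minimal sketch of the calculation with hypothetical grid values (the real grids depend on input resolution and the processor's resizing):

```python
# Hypothetical (t, h, w) grids for a batch of two videos; the real values
# come from out_dict["video_grid_thw"] after the processor resizes the input.
video_grid_thw = [(2, 12, 12), (2, 11, 12)]

# One multimodal entry per grid cell, summed over the batch.
expected_video_token_count = sum(t * h * w for t, h, w in video_grid_thw)
print(expected_video_token_count)  # 2*12*12 + 2*11*12 == 552
```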
```diff
@@ -239,8 +239,15 @@ def _test_apply_chat_template(
         self.assertEqual(len(out_dict["input_ids"]), batch_size)
         self.assertEqual(len(out_dict["attention_mask"]), batch_size)
 
-        video_len = 180 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
-        mm_len = batch_size * 192 if modality == "image" else video_len
+        if modality == "video":
+            # qwen pixels don't scale with bs the same way as other models; calculate the expected video token count based on video_grid_thw
+            # TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5VLProcessor
+            expected_video_token_count = 0
+            for thw in out_dict["video_grid_thw"]:
+                expected_video_token_count += thw[0] * thw[1] * thw[2]
+            mm_len = expected_video_token_count
+        else:
+            mm_len = batch_size * 192
         self.assertEqual(len(out_dict[input_name]), mm_len)
 
         return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
```
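The same nine added lines now appear in all three test files. One way to cut the duplication (a sketch only, not part of this diff; the helper name and placement are made up) would be a small shared helper on the processor test mixin:

```python
# Hypothetical shared helper, not in the diff: computes the expected
# multimodal sequence length from the processor output.
def _expected_mm_len(out_dict, modality, batch_size, image_len):
    if modality == "video":
        # Sum t * h * w over every video grid in the batch.
        return sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"])
    return batch_size * image_len
```

Each test would then call it with its model-specific `image_len` (1564 for Qwen2.5-Omni, 192 for Qwen2-VL and Qwen2.5-VL).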
```diff
@@ -239,9 +239,15 @@ def _test_apply_chat_template(
         self.assertTrue(input_name in out_dict)
         self.assertEqual(len(out_dict["input_ids"]), batch_size)
         self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-
-        video_len = 180 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
-        mm_len = batch_size * 192 if modality == "image" else video_len
+        if modality == "video":
+            # qwen pixels don't scale with bs the same way as other models; calculate the expected video token count based on video_grid_thw
+            # TODO: update expected video token count calculation based on the internal processing logic of Qwen2VLProcessor
+            expected_video_token_count = 0
+            for thw in out_dict["video_grid_thw"]:
+                expected_video_token_count += thw[0] * thw[1] * thw[2]
+            mm_len = expected_video_token_count
+        else:
+            mm_len = batch_size * 192
         self.assertEqual(len(out_dict[input_name]), mm_len)
 
         return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
```
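Assuming `video_grid_thw` comes back as a `(num_videos, 3)` tensor when `return_tensors="pt"` is requested, the accumulation loop also has a one-line vectorized equivalent; a sketch with made-up grid values:

```python
import torch

# Made-up (t, h, w) grids for two videos; real values come from the processor.
video_grid_thw = torch.tensor([[2, 12, 12], [2, 11, 12]])

# Product over the last dim gives tokens per video; the sum gives the batch total.
mm_len = int(video_grid_thw.prod(dim=-1).sum())
print(mm_len)  # 552
```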