@@ -126,31 +126,29 @@ class Glm4vVideoPixelInputs(TensorSchema):
         - np: Number of patches
         - ctpp: Number of channels * temporal_patch_size *
           patch_size * patch_size
-        - nv: Number of videos
         - f: Number of frames
         - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
     pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctpp")]
-    # video_metadata: Union[list[VideoMetadata], list[dict]]
-    video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", "f", 3)]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
 
 
 class Glm4vVideoEmbeddingInputs(TensorSchema):
     """
     Dimensions:
         - p: Number of video patches across all frames
         - h: Hidden size (must match language model backbone)
-        - n: Number of videos
+        - f: Number of frames
         - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
     type: Literal["video_embeds"] = "video_embeds"
 
     video_embeds: Annotated[torch.Tensor, TensorShape("p", "h")]
-    video_grid_thw: Annotated[torch.Tensor, TensorShape("n", 1, 3)]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
 
 
 Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs]
@@ -1348,7 +1346,6 @@ def _parse_and_validate_video_input(
 
             return Glm4vVideoPixelInputs(
                 type="pixel_values_videos",
-                # video_metadata=video_metadata,
                 pixel_values_videos=pixel_values_videos,
                 video_grid_thw=video_grid_thw,
             )
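
Note on the schema change (illustrative, not part of the diff): `video_grid_thw` now carries one `(t, h, w)` row per frame rather than one row per video, so the total patch count `np` in `pixel_values_videos` should equal the sum of `t * h * w` over all rows. Below is a minimal sketch of that consistency check, assuming this row convention; the helper `check_video_patch_count` and the concrete `ctpp`/grid sizes are made up for illustration only.

```python
import torch


def check_video_patch_count(pixel_values_videos: torch.Tensor,
                            video_grid_thw: torch.Tensor) -> None:
    """Sanity-check the flattened ("f", 3) grid against ("np", "ctpp") patches.

    Assumes each row of video_grid_thw is (grid_t, grid_h, grid_w) for one
    frame and that each frame contributes grid_t * grid_h * grid_w patches.
    """
    expected_np = int(video_grid_thw.prod(dim=-1).sum())
    actual_np = pixel_values_videos.shape[0]
    if actual_np != expected_np:
        raise ValueError(
            f"got {actual_np} patches, but grid implies {expected_np}")


# Illustrative sizes: two frames on a 12x16 patch grid, grid_t == 1 per frame.
ctpp = 3 * 2 * 14 * 14  # channels * temporal_patch_size * patch_size**2 (example values)
grid = torch.tensor([[1, 12, 16], [1, 12, 16]])             # TensorShape("f", 3)
patches = torch.randn(int(grid.prod(dim=-1).sum()), ctpp)   # TensorShape("np", "ctpp")
check_video_patch_count(patches, grid)
```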