Skip to content

Commit 0ae970e

Browse files
authored
[Bugfix] Fix glm4.1v video_grid_thw tensor shape scheme (#21744)
Signed-off-by: Isotr0py <[email protected]>
1 parent 65e8466 commit 0ae970e

File tree

1 file changed

+3
-6
lines changed

1 file changed

+3
-6
lines changed

vllm/model_executor/models/glm4_1v.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -126,31 +126,29 @@ class Glm4vVideoPixelInputs(TensorSchema):
126126
- np: Number of patches
127127
- ctpp: Number of channels * temporal_patch_size *
128128
patch_size * patch_size
129-
- nv: Number of videos
130129
- f: Number of frames
131130
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
132131
video, grid_h, grid_w)
133132
"""
134133
type: Literal["pixel_values_videos"] = "pixel_values_videos"
135134

136135
pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctpp")]
137-
# video_metadata: Union[list[VideoMetadata], list[dict]]
138-
video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", "f", 3)]
136+
video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
139137

140138

141139
class Glm4vVideoEmbeddingInputs(TensorSchema):
142140
"""
143141
Dimensions:
144142
- p: Number of video patches across all frames
145143
- h: Hidden size (must match language model backbone)
146-
- n: Number of videos
144+
- f: Number of frames
147145
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
148146
video, grid_h, grid_w)
149147
"""
150148
type: Literal["video_embeds"] = "video_embeds"
151149

152150
video_embeds: Annotated[torch.Tensor, TensorShape("p", "h")]
153-
video_grid_thw: Annotated[torch.Tensor, TensorShape("n", 1, 3)]
151+
video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
154152

155153

156154
Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs]
@@ -1348,7 +1346,6 @@ def _parse_and_validate_video_input(
13481346

13491347
return Glm4vVideoPixelInputs(
13501348
type="pixel_values_videos",
1351-
# video_metadata=video_metadata,
13521349
pixel_values_videos=pixel_values_videos,
13531350
video_grid_thw=video_grid_thw,
13541351
)

0 commit comments

Comments
 (0)