@@ -37,12 +37,56 @@
 
 from .utils import apply_chat_template_with_fallback, process_conversation_inputs, save_config_to_constant_methods
 
+def _patch_idefics3_vision_embeddings_for_export(vision_model):
+    """
+    Patch Idefics3VisionEmbeddings to make it export-friendly by removing data-dependent operations.
+    This assumes batch_size=1 and a full attention mask (all 1s).
+    """
+    import types
+
+    def export_friendly_forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        nb_patches_h = max_im_h // self.patch_size
+        nb_patches_w = max_im_w // self.patch_size
+        N = self.num_patches_per_side
+
+        # For export, we assume a full attention mask and compute position IDs statically,
+        # avoiding the data-dependent loop over the batch dimension.
+        h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=torch.long)
+        w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=torch.long)
+
+        # Integer replacement for bucketize(x, boundaries=[1/N, 2/N, ...], right=True),
+        # which is equivalent to floor(x * N) here; we don't have a kernel for bucketize at the moment.
+        bucket_coords_h = (h_indices * N) // nb_patches_h
+        bucket_coords_w = (w_indices * N) // nb_patches_w
+
+        bucket_coords_h = torch.clamp(bucket_coords_h, max=N - 1)
+        bucket_coords_w = torch.clamp(bucket_coords_w, max=N - 1)
+
+        pos_ids = (bucket_coords_h[:, None] * N + bucket_coords_w[None, :]).reshape(-1)
+        position_ids = pos_ids.unsqueeze(0).expand(batch_size, -1)
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+    # Patch the forward method.
+    vision_model.embeddings.forward = types.MethodType(export_friendly_forward, vision_model.embeddings)
+
 
 class VisionExportableModule(torch.nn.Module):
     def __init__(self, model: torch.nn.Module):
         super().__init__()
         self.model = model
 
+        # Patch Idefics3 vision embeddings if needed
+        if hasattr(model, 'model') and hasattr(model.model, 'vision_model'):
+            model_type = getattr(model.config, 'model_type', '')
+            if 'idefics3' in model_type.lower():
+                _patch_idefics3_vision_embeddings_for_export(model.model.vision_model)
+
     def prepare_export_inputs(self):
         # 1. Get export inputs
         model_id = self.model.config.name_or_path
@@ -83,7 +127,9 @@ def forward(
         self,
         input_features: torch.FloatTensor,
     ):
-        image_embeds = self.model.get_image_features(input_features)
+        # Pass pixel_attention_mask=None to avoid data-dependent operations during export.
+        # The model will create a mask full of 1s internally if None is passed.
+        image_embeds = self.model.get_image_features(input_features, pixel_attention_mask=None)
         if isinstance(image_embeds, list):
             image_embeds = torch.stack(image_embeds)
         return image_embeds
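
For context on the bucketize replacement above, here is a minimal sketch (not part of the diff) checking that the integer form (i * N) // n, clamped to N - 1, agrees with the fractional-coordinate torch.bucketize it replaces. The sizes N and n below are illustrative:

    import torch

    # Illustrative sizes: N = num_patches_per_side, n = patches along one image axis.
    N, n = 27, 19

    # Original data-dependent form: bucketize fractional coordinates against [1/N, ..., (N-1)/N].
    fractional = torch.arange(n, dtype=torch.float32) / n
    boundaries = torch.arange(1, N, dtype=torch.float32) / N
    expected = torch.bucketize(fractional, boundaries, right=True)

    # Export-friendly form from the patch: integer multiply/divide, clamped to N - 1.
    actual = torch.clamp((torch.arange(n) * N) // n, max=N - 1)

    assert torch.equal(expected, actual)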
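
And a hedged usage sketch of the wrapper itself, assuming an Idefics3-based checkpoint and a transformers version that exposes get_image_features on it; the model id, input shape, and resolution are placeholders, not taken from this diff:

    import torch
    from transformers import AutoModelForImageTextToText

    # Placeholder checkpoint; any config whose model_type contains "idefics3"
    # triggers _patch_idefics3_vision_embeddings_for_export in __init__.
    model = AutoModelForImageTextToText.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
    vision = VisionExportableModule(model)

    # Idefics3 pixel_values are 5D: (batch, num_images, channels, height, width).
    # batch_size=1, as the patched embeddings forward assumes.
    pixel_values = torch.randn(1, 1, 3, 364, 364)
    exported = torch.export.export(vision, (pixel_values,))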