Commit 62da12e

Clean up
1 parent eda53a4 commit 62da12e

File tree: 2 files changed (+69 -18 lines)

src/transformers/integrations/executorch.py (47 additions, 15 deletions)
```diff
@@ -16,6 +16,7 @@
 import torch
 
 from ..cache_utils import DynamicCache, EncoderDecoderCache, HybridCache, StaticCache
+from ..configuration_utils import PretrainedConfig
 from ..generation.configuration_utils import GenerationConfig
 from ..masking_utils import (
     ALL_MASK_ATTENTION_FUNCTIONS,
```
```diff
@@ -47,7 +48,7 @@ def __init__(
 
         Args:
             model (`PreTrainedModel`): The pretrained model to wrap.
-            config (`PreTrainedConfig`): The pretrained text config for the decoder model.
+            config (`PretrainedConfig`): The pretrained text config for the decoder model.
             generation_config (`GenerationConfig`): The generation config for the model.
             max_batch_size (int): Maximum batch size for the cache.
             max_cache_len (int): Maximum sequence length for the cache.
```
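
The docstring fix above matches the real class name: the class importable from `transformers.configuration_utils` is spelled `PretrainedConfig` (unlike `PreTrainedModel`, which capitalizes the T). A quick sanity check, assuming the transformers version this commit targets:

```python
# Sanity check (not part of the commit): the name added to the imports at
# the top of executorch.py resolves to the actual config base class.
from transformers.configuration_utils import PretrainedConfig

print(PretrainedConfig.__name__)  # -> "PretrainedConfig"
```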
```diff
@@ -82,7 +83,7 @@ def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
-        cache_position: torch.Tensor,
+        cache_position: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Forward pass of the module, which is compatible with the ExecuTorch llm runner.
```
````diff
@@ -114,16 +115,50 @@ def export(
 
         Args:
             input_ids (`Optional[torch.Tensor]`):
-                Tensor representing current input token id to the module. If this and inputs_embeds are not provided, a default tensor will be used.
+                Tensor representing current input token id to the module. Must specify either this or inputs_embeds.
             inputs_embeds (`Optional[torch.Tensor]`):
-                Tensor representing current input embeddings to the module.
+                Tensor representing current input embeddings to the module. Must specify either this or input_ids.
             cache_position (`Optional[torch.Tensor]`):
                 Tensor representing current input position in the cache. If not provided, a default tensor will be used.
             dynamic_shapes (`Optional[dict]`):
                 Dynamic shapes to use for export if specified.
             strict(`Optional[bool]`):
                 Flag to instruct `torch.export` to use `torchdynamo`.
+
+        Returns:
+            torch.export.ExportedProgram: The exported program that can be used for inference.
+
+        Examples:
+            Export with input_ids:
+            ```python
+            # Prepare inputs
+            input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long, device=model.device)
+            cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long, device=model.device)
+
+            # Export
+            exported = exportable_module.export(
+                input_ids=input_ids,
+                cache_position=cache_position
+            )
+            ```
+
+            Export with inputs_embeds:
+            ```python
+            # Prepare embeddings
+            inputs_embeds = torch.randn(1, 3, 768, device=model.device)  # batch_size=1, seq_len=3, hidden_size=768
+            cache_position = torch.arange(inputs_embeds.shape[1], dtype=torch.long, device=model.device)
+
+            # Export
+            exported = exportable_module.export(
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position
+            )
+            ```
         """
+        # Validate inputs early for fail-fast behavior
+        if not input_ids ^ inputs_embeds:
+            raise ValueError("Need to specify either input_ids or inputs_embeds.")
+
         if hasattr(self.model, "base_model_prefix"):
             base = getattr(self.model, self.model.base_model_prefix, self.model)
             model_device = base.device
@@ -135,9 +170,6 @@ def export(
                 "TorchExportableModuleForDecoderOnlyLM.export Can't infer device from the model. Set to CPU by default."
             )
 
-        if not input_ids ^ inputs_embeds:
-            raise ValueError("Need to specify either input_ids or inputs_embeds.")
-
         example_cache_position = (
             cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long, device=model_device)
         )
```` 
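
One note on the guard that moved to the top of `export`: it applies Python's `^` operator directly to the tensor arguments. An identity comparison against `None` expresses the same exclusive-or intent without invoking tensor bitwise operators. A minimal sketch of that alternative, with a hypothetical helper name that is not part of this commit:

```python
import torch
from typing import Optional


def check_exactly_one(
    input_ids: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> None:
    # Hypothetical helper, not from this commit: exactly one of the two
    # inputs must be provided. Comparing identities against None keeps
    # the check independent of tensor dtypes and shapes.
    if (input_ids is None) == (inputs_embeds is None):
        raise ValueError("Need to specify either input_ids or inputs_embeds.")


check_exactly_one(input_ids=torch.tensor([[1, 2, 3]]))  # passes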
```diff
@@ -293,7 +325,7 @@ def __init__(
         Args:
             model (`PreTrainedModel`): The pretrained model to wrap. The model must have caching
                 enabled and use a 'static' caching implementation.
-            config (`PreTrainedConfig`): The pretrained text config for the model.
+            config (`PretrainedConfig`): The pretrained text config for the model.
             generation_config (`GenerationConfig`): The generation config for the model.
 
         Raises:
```
```diff
@@ -340,8 +372,8 @@ def __init__(
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        cache_position: torch.Tensor = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
     ):
         """
         Forward pass of the module, which is compatible with the ExecuTorch runtime.
```
```diff
@@ -448,7 +480,7 @@ def __init__(
 
         Args:
             model (`PreTrainedModel`): The pretrained model to wrap.
-            config (`PreTrainedConfig`): The pretrained text config for the model.
+            config (`PretrainedConfig`): The pretrained text config for the model.
             generation_config (`GenerationConfig`): The generation config for the model.
             max_batch_size (int): Maximum batch size for the cache.
             max_cache_len (int): Maximum sequence length for the cache.
```
```diff
@@ -482,8 +514,8 @@ def __init__(
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        cache_position: torch.Tensor = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Forward pass of the module, which is compatible with the ExecuTorch llm runner.
```
```diff
@@ -523,7 +555,7 @@ def forward(
 
 def convert_and_export_with_cache(
     model: PreTrainedModel,
-    config: PreTrainedConfig,
+    config: PretrainedConfig,
     generation_config: GenerationConfig,
     example_input_ids: Optional[torch.Tensor] = None,
     example_cache_position: Optional[torch.Tensor] = None,
@@ -536,7 +568,7 @@ def convert_and_export_with_cache(
 
     Args:
         model (`PreTrainedModel`): The pretrained model to be exported.
-        config (`PreTrainedConfig`): The pretrained text config for the decoder model.
+        config (`PretrainedConfig`): The pretrained text config for the decoder model.
        generation_config (`GenerationConfig`): The generation config for the model.
        example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
        example_cache_position (`Optional[torch.Tensor]`): Example current cache position used by `torch.export`.
```
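
For orientation, a call site for `convert_and_export_with_cache` with the signature shown above might look like the following sketch. The model checkpoint and generation settings are illustrative assumptions, not part of this commit:

```python
# Illustrative sketch: exporting a small decoder-only model with a
# static cache. Checkpoint choice and cache sizes are assumptions.
import torch
from transformers import AutoModelForCausalLM, GenerationConfig
from transformers.integrations.executorch import convert_and_export_with_cache

model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
model.eval()

generation_config = GenerationConfig(
    use_cache=True,
    cache_implementation="static",
    cache_config={"batch_size": 1, "max_cache_len": 32, "device": "cpu"},
)

exported_program = convert_and_export_with_cache(
    model,
    config=model.config,
    generation_config=generation_config,
    example_input_ids=torch.tensor([[1]], dtype=torch.long),
    example_cache_position=torch.tensor([0], dtype=torch.long),
)
```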

tests/utils/test_cache_utils.py (22 additions, 3 deletions)
```diff
@@ -813,7 +813,9 @@ def test_static_cache_exportability(self):
 
         from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
 
-        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model, config=model.config, generation_config=model.generation_config
+        )
         exported_program = exportable_module.export(
             input_ids=input_ids,
             cache_position=cache_position,
```
```diff
@@ -841,8 +843,25 @@ def test_hybrid_cache_exportability(self):
         model.eval()
         max_batch_size = 1
         max_cache_len = 23
-        exportable_module = TorchExportableModuleForDecoderOnlyLM(model, max_batch_size, max_cache_len)
-        exported_program = exportable_module.export()
+        # Create generation config for the hybrid cache model
+        from transformers.generation.configuration_utils import GenerationConfig
+        generation_config = GenerationConfig(
+            use_cache=True,
+            cache_implementation="hybrid",
+            max_length=max_cache_len,
+            cache_config={
+                "batch_size": max_batch_size,
+                "max_cache_len": max_cache_len,
+                "device": model.device,
+            },
+        )
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model, config=model.config, generation_config=generation_config
+        )
+        exported_program = exportable_module.export(
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device)
+        )
         n_g_key_caches = n_g_value_caches = 0
         for buffer_name, buffer in exported_program.named_buffers():
             if buffer_name.startswith("key_cache"):
```
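
Beyond inspecting named buffers as the test does, an `ExportedProgram` can also be exercised eagerly through its `.module()` view. A minimal sketch, assuming `exportable_module` was constructed as in the tests above:

```python
# Minimal sketch (assumptions: `exportable_module` built as in the tests,
# and forward returning logits). torch.export.ExportedProgram exposes a
# callable nn.Module via .module().
import torch

input_ids = torch.tensor([[1]], dtype=torch.long)
cache_position = torch.tensor([0], dtype=torch.long)

exported_program = exportable_module.export(
    input_ids=input_ids,
    cache_position=cache_position,
)

# Run one decode step through the exported graph.
logits = exported_program.module()(
    input_ids=input_ids,
    cache_position=cache_position,
)
print(logits.shape)  # typically (batch_size, seq_len, vocab_size)
```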
