@@ -38,43 +38,33 @@ class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
     def __init__(
         self,
         model: PreTrainedModel,
-        config: Optional[PretrainedConfig] = None,
-        generation_config: Optional[GenerationConfig] = None,
     ):
         """
         Initializes the exportable module with `HybridCache`.

         Args:
             model (`PreTrainedModel`): The pretrained model to wrap.
-            config (`PretrainedConfig`): The pretrained text config for the decoder model.
-                If not specified will try to resolve with the model's config.
-            generation_config (`GenerationConfig`): The generation config for the model.
-                If not specified will try to resolve with the model's generation config.

         Raises:
             ValueError: If the model is configured with an unsupported cache implementation.
5650 """
5751 super ().__init__ ()
5852
59- if not config :
60- config = model .config
61- if not generation_config :
62- generation_config = model .generation_config
53+ config = model .config .get_text_config ()
54+ generation_config = model .generation_config
6355
6456 if not hasattr (config , "use_cache" ) or config .use_cache is False :
6557 raise ValueError ("The model must have caching enabled to be performant." )
6658
6759 if hasattr (config , "layer_types" ) and getattr (config , "sliding_window" , None ) is not None :
68- self .model = TorchExportableModuleWithHybridCache (
69- model , config = config , generation_config = generation_config
70- )
60+ self .model = TorchExportableModuleWithHybridCache (model )
7161 else :
7262 # If `layer_types` is not specified explicitly in the config or `sliding_window` is null,
7363 # there is only 1 type of layers, so export will use `StaticCache` by default.
7464 logging .info (
7565 "Using `StaticCache` for export as `layer_types` is not specified or `sliding_window` is `null` in the config."
7666 )
77- self .model = TorchExportableModuleWithStaticCache (model , config , generation_config )
67+ self .model = TorchExportableModuleWithStaticCache (model )
7868 # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
7969 ALL_MASK_ATTENTION_FUNCTIONS .register ("sdpa_without_vmap" , sdpa_mask_without_vmap )
8070 ALL_ATTENTION_FUNCTIONS .register ("sdpa_without_vmap" , ALL_ATTENTION_FUNCTIONS ["sdpa" ])
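
With this change the wrapper takes only the model; the text config and generation config are resolved internally from `model.config.get_text_config()` and `model.generation_config`. A minimal usage sketch under assumed settings (the checkpoint name and the cache_config values are illustrative, not taken from this diff):

    import torch
    from transformers import AutoModelForCausalLM, GenerationConfig
    from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

    # Assumption: any decoder-only checkpoint with caching enabled works; the name is illustrative.
    model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b")
    model.generation_config = GenerationConfig(
        cache_implementation="static",
        cache_config={"batch_size": 1, "max_cache_len": 128, "device": "cpu"},  # assumed sizes
    )
    # config/generation_config arguments are gone; both are read from `model` itself.
    exportable = TorchExportableModuleForDecoderOnlyLM(model)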
@@ -171,25 +161,23 @@ def export(
             )

         if input_ids is not None:
-            if cache_position is None:
-                cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long, device=model_device)
-            exported_program = torch.export.export(
-                self.model,
-                args=(),
-                kwargs={"input_ids": input_ids, "cache_position": cache_position},
-                dynamic_shapes=dynamic_shapes,
-                strict=strict if strict is not None else True,
-            )
+            input_kwargs = {
+                "input_ids": input_ids,
+                "cache_position": cache_position if cache_position is not None else torch.arange(input_ids.shape[-1], dtype=torch.long, device=model_device),
+            }
         else:  # inputs_embeds
-            if cache_position is None:
-                cache_position = torch.arange(inputs_embeds.shape[1], dtype=torch.long, device=model_device)
-            exported_program = torch.export.export(
-                self.model,
-                args=(),
-                kwargs={"inputs_embeds": inputs_embeds, "cache_position": cache_position},
-                dynamic_shapes=dynamic_shapes,
-                strict=strict if strict is not None else True,
-            )
+            input_kwargs = {
+                "inputs_embeds": inputs_embeds,
+                "cache_position": cache_position if cache_position is not None else torch.arange(inputs_embeds.shape[1], dtype=torch.long, device=model_device),
+            }
+
+        exported_program = torch.export.export(
+            self.model,
+            args=(),
+            kwargs=input_kwargs,
+            dynamic_shapes=dynamic_shapes,
+            strict=strict if strict is not None else True,
+        )

         return exported_program

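
Both input branches now build `input_kwargs` and fall through to a single `torch.export.export` call, and `cache_position` defaults to `torch.arange(seq_len)` on the model's device when omitted. A sketch of a call, reusing the `exportable` wrapper from the sketch above (assumes a recent torch with `torch.export`):

    example_input_ids = torch.tensor([[1]], dtype=torch.long)
    # cache_position is left out here, so the default arange(seq_len) path is exercised.
    exported = exportable.export(input_ids=example_input_ids)
    print(type(exported))  # expected: torch.export.ExportedProgram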
@@ -316,24 +304,23 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
     def __init__(
         self,
         model: PreTrainedModel,
-        config: PretrainedConfig,
-        generation_config: GenerationConfig,
     ):
         """
         Initializes the wrapper module with the pretrained model.

         Args:
             model (`PreTrainedModel`): The pretrained model to wrap. The model must have caching
-                enabled and use a 'static' caching implementation.
-            config (`PretrainedConfig`): The pretrained text config for the model.
-            generation_config (`GenerationConfig`): The generation config for the model.
+                enabled and use a 'static' caching implementation.

         Raises:
             AssertionError: If the pretrained model does not have caching enabled or if it does
                 not use a 'static' caching implementation in `model.generation_config`.
         """
         super().__init__()

+        config = model.config.get_text_config()
+        generation_config = model.generation_config
+
         # Sanity checks
         if generation_config is None:
             raise AssertionError(
@@ -354,13 +341,11 @@ def __init__(
             )

         self.model = model
-        self.config = config
-        self.generation_config = generation_config
         self.static_cache = StaticCache(
             config=config,
-            max_batch_size=self.generation_config.cache_config.get("batch_size"),
-            max_cache_len=self.generation_config.cache_config.get("max_cache_len"),
-            device=self.generation_config.cache_config.get("device"),
+            max_batch_size=generation_config.cache_config.get("batch_size"),
+            max_cache_len=generation_config.cache_config.get("max_cache_len"),
+            device=generation_config.cache_config.get("device"),
             dtype=self.model.dtype,
         )

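
The `StaticCache` held by the wrapper is now sized entirely from `model.generation_config.cache_config`. Roughly, it is equivalent to the standalone construction below (the cache_config values are whatever was set on the model, e.g. the assumed ones from the first sketch):

    from transformers import StaticCache

    cache_cfg = model.generation_config.cache_config  # e.g. {"batch_size": 1, "max_cache_len": 128, "device": "cpu"}
    static_cache = StaticCache(
        config=model.config.get_text_config(),
        max_batch_size=cache_cfg.get("batch_size"),
        max_cache_len=cache_cfg.get("max_cache_len"),
        device=cache_cfg.get("device"),
        dtype=model.dtype,
    )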
@@ -471,26 +456,20 @@ class TorchExportableModuleWithHybridCache(torch.nn.Module):
     def __init__(
         self,
         model: PreTrainedModel,
-        config: PretrainedConfig,
-        generation_config: GenerationConfig,
     ):
         """
         Initializes the exportable module with `HybridCache`.

         Args:
             model (`PreTrainedModel`): The pretrained model to wrap.
-            config (`PretrainedConfig`): The pretrained text config for the model.
-            generation_config (`GenerationConfig`): The generation config for the model.
-            max_batch_size (int): Maximum batch size for the cache.
-            max_cache_len (int): Maximum sequence length for the cache.

         Raises:
             AssertionError: If the model doesn't have the expected configuration for HybridCache.
         """
         super().__init__()
         self.model = model
-        self.config = config
-        self.generation_config = generation_config
+        config = model.config.get_text_config()
+        generation_config = model.generation_config

         # Verify the model is configured for HybridCache
         if not config.use_cache:
@@ -499,9 +478,9 @@ def __init__(
         # Initialize the HybridCache
         self.cache = HybridCache(
             config=config,
-            max_batch_size=self.generation_config.cache_config.get("batch_size"),
-            max_cache_len=self.generation_config.cache_config.get("max_cache_len"),
-            device=self.generation_config.cache_config.get("device"),
+            max_batch_size=generation_config.cache_config.get("batch_size"),
+            max_cache_len=generation_config.cache_config.get("max_cache_len"),
+            device=generation_config.cache_config.get("device"),
             dtype=self.model.dtype,
         )

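
`TorchExportableModuleForDecoderOnlyLM` routes to this hybrid wrapper only when the text config declares `layer_types` and a non-null `sliding_window`. A small check mirroring that dispatch, reusing `model` from the earlier sketches:

    text_config = model.config.get_text_config()
    uses_hybrid = hasattr(text_config, "layer_types") and getattr(text_config, "sliding_window", None) is not None
    print("HybridCache" if uses_hybrid else "StaticCache")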
@@ -543,8 +522,6 @@ def forward(

 def convert_and_export_with_cache(
     model: PreTrainedModel,
-    config: PretrainedConfig,
-    generation_config: GenerationConfig,
     example_input_ids: Optional[torch.Tensor] = None,
     example_cache_position: Optional[torch.Tensor] = None,
     dynamic_shapes: Optional[dict] = None,
@@ -556,8 +533,6 @@ def convert_and_export_with_cache(

     Args:
         model (`PreTrainedModel`): The pretrained model to be exported.
-        config (`PretrainedConfig`): The pretrained text config for the decoder model.
-        generation_config (`GenerationConfig`): The generation config for the model.
         example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
         example_cache_position (`Optional[torch.Tensor]`): Example current cache position used by `torch.export`.
         dynamic_shapes (`Optional[dict]`): Dynamic shapes used by `torch.export`.
@@ -591,7 +566,7 @@ def convert_and_export_with_cache(

     if is_torch_greater_or_equal("2.6.0"):
         exported_program = torch.export.export(
-            TorchExportableModuleWithStaticCache(model=model, config=config, generation_config=generation_config),
+            TorchExportableModuleWithStaticCache(model),
             args=(),
             kwargs={"input_ids": example_input_ids, "cache_position": example_cache_position},
             dynamic_shapes=dynamic_shapes,
@@ -609,11 +584,7 @@ def convert_and_export_with_cache(
         # Due to issue https://github.com/pytorch/pytorch/issues/128394, we need to switch to use an internal
         # export API and pre_dispatch=False. Switch to use the public API once the issue is included in 2.5 release.
         exported_program = torch.export._trace._export(
-            TorchExportableModuleWithStaticCache(
-                model=model,
-                config=config,
-                generation_config=generation_config,
-            ),
+            TorchExportableModuleWithStaticCache(model),
             args=(),
             kwargs={"input_ids": example_input_ids, "cache_position": example_cache_position},
             pre_dispatch=False,
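
With `config` and `generation_config` dropped from its signature, a call to `convert_and_export_with_cache` now looks roughly like this (the example tensors are illustrative; the model is assumed to carry a static `cache_config` on its generation config, as in the first sketch):

    from transformers.integrations.executorch import convert_and_export_with_cache

    exported_program = convert_and_export_with_cache(
        model,
        example_input_ids=torch.tensor([[1]], dtype=torch.long),
        example_cache_position=torch.tensor([0], dtype=torch.long),
    )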