
Commit b844dbd

Bump version for float8 dynamic quant and weight only quant configs
Summary:

This PR changes the default VERSION for Float8DynamicActivationFloat8WeightConfig and Float8WeightOnlyConfig from 1 to 2 and deprecates the VERSION 1 configs and VERSION 1 quantized models; more details in #2649. It also extends the current config serialization to work with multiple config versions.

Deprecation Note:

```
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="bfloat16",
    device_map="cuda",
)

/data/users/jerryzh/ao/torchao/core/config.py:249: UserWarning: Stored version is not the same as current default version of the config: stored_version=1, current_version=2, please check the deprecation warning
  warnings.warn(
/data/users/jerryzh/ao/torchao/dtypes/floatx/float8_layout.py:113: UserWarning: Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in a future release, please upgrade torchao and quantize again, or download a newer torchao checkpoint, see #2649 for more details
  warnings.warn(
```

Suggestion: upgrade torchao to 0.13 or later and generate the checkpoint again:

```
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
```

Or download the checkpoint again (please let us know if the checkpoint is not updated).

Test Plan:

Tested by serializing a model with a VERSION 1 config, loading it, and checking that the warnings are properly printed:

```
python test/integration/test_loading_deprecated_checkpoint.py
```

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2650, branch: jerryzh168/stack/14
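As a worked version of the suggestion above, regenerating a VERSION 2 checkpoint might look like the following sketch. The base model name and output path are placeholders, and `safe_serialization=False` is an assumption about what tensor-subclass checkpoints currently need, not something this PR states:

```
import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, quantize_
from torchao.quantization.granularity import PerRow

# Load the original high-precision model (placeholder model name).
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m", torch_dtype=torch.bfloat16, device_map="cuda"
)

# Quantize in place with the current default (VERSION 2) config.
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

# Save a fresh checkpoint (placeholder path); torchao tensor subclasses
# generally require non-safetensors serialization.
model.save_pretrained("opt-125m-float8dq-row-v2", safe_serialization=False)
```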
1 parent 3b4bc98 commit b844dbd

File tree

8 files changed: +190 −78 lines changed

test/core/test_config.py

Lines changed: 8 additions & 6 deletions

```
@@ -7,6 +7,7 @@
 import json
 import os
 import tempfile
+import warnings
 from dataclasses import dataclass
 from unittest import mock

@@ -15,7 +16,6 @@

 from torchao.core.config import (
     AOBaseConfig,
-    VersionMismatchError,
     config_from_dict,
     config_to_dict,
 )
@@ -176,7 +176,7 @@ def test_disallowed_modules():


 def test_version_mismatch():
-    """Test that version mismatch raises an error during reconstruction."""
+    """Test that version mismatch prints a warning during reconstruction."""
     # Create a config
     dummy_config = DummyNonAllowedConfig()
     reconstructable = config_to_dict(dummy_config)
@@ -186,11 +186,13 @@ def test_version_mismatch():

     # Patch to allow the module but should still fail due to version mismatch
     with mock.patch("torchao.core.config.ALLOWED_AO_MODULES", {__name__}):
-        with pytest.raises(
-            VersionMismatchError,
-            match="Version mismatch for DummyNonAllowedConfig: stored version 1 != current version 2",
-        ):
+        with warnings.catch_warnings(record=True) as caught_warnings:
             config_from_dict(reconstructable)
+        assert any(
+            "Stored version is not the same as current default version of the config"
+            in str(w.message)
+            for w in caught_warnings
+        ), "Didn't get expected warning message for version mismatch"


 def test_default_version():
```
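For intuition, the warning this test asserts can come from a version field stored alongside the config payload. Below is a minimal sketch of that idea under an assumed dict layout; torchao's actual `config_to_dict`/`config_from_dict` internals are not shown in this diff, only the class-level `VERSION` attribute and the warning text are:

```
import warnings


def config_to_dict_sketch(config) -> dict:
    # Persist the class-level VERSION so loaders can detect stale configs.
    return {
        "_type": type(config).__name__,
        "_version": type(config).VERSION,
        "_data": dict(config.__dict__),
    }


def config_from_dict_sketch(data: dict, config_cls):
    stored_version = data["_version"]
    if stored_version != config_cls.VERSION:
        # Warn instead of raising, matching the behavior change in this PR.
        warnings.warn(
            "Stored version is not the same as current default version of the "
            f"config: stored_version={stored_version}, "
            f"current_version={config_cls.VERSION}, "
            "please check the deprecation warning"
        )
    return config_cls(**data["_data"])
```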

test/dtypes/test_affine_quantized_float.py

Lines changed: 76 additions & 23 deletions

```
@@ -30,17 +30,14 @@
 from torchao.float8.float8_utils import compute_error
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
-    float8_dynamic_activation_float8_weight,
-    float8_weight_only,
+    Float8StaticActivationFloat8WeightConfig,
+    Float8WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.granularity import (
     PerRow,
     PerTensor,
 )
-from torchao.quantization.quant_api import (
-    float8_static_activation_float8_weight,
-)
 from torchao.quantization.quant_primitives import (
     MappingType,
     _choose_scale_float8,
@@ -117,17 +114,24 @@ def test_fp8_linear_variants(
             torch.float8_e4m3fn,
             scale_dtype=torch.float32,
         )
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        fp8wo_cur_version = Float8WeightOnlyConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
+        Float8WeightOnlyConfig.VERSION = 1
        mode_map = {
            "dynamic": partial(
-                float8_dynamic_activation_float8_weight, granularity=granularity
+                Float8DynamicActivationFloat8WeightConfig,
+                granularity=granularity,
            ),
-            "weight-only": float8_weight_only,
+            "weight-only": Float8WeightOnlyConfig,
            "static": partial(
-                float8_static_activation_float8_weight,
+                Float8StaticActivationFloat8WeightConfig,
                scale=scale,
                granularity=granularity,
            ),
        }
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version
+        Float8WeightOnlyConfig.VERSION = fp8wo_cur_version

        # Create a linear layer with bfloat16 dtype
        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
@@ -152,7 +156,7 @@
     )
     def test_invalid_granularity(self):
         with pytest.raises(ValueError, match="Invalid granularity specification"):
-            float8_dynamic_activation_float8_weight(granularity="invalid")
+            Float8DynamicActivationFloat8WeightConfig(granularity="invalid")

     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -162,7 +166,9 @@ def test_mismatched_granularity(self):
             ValueError,
             match="Different granularities for activation and weight are not supported",
         ):
-            float8_dynamic_activation_float8_weight(granularity=(PerTensor(), PerRow()))
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=(PerTensor(), PerRow())
+            )

     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -172,8 +178,8 @@ class UnsupportedGranularity:
             pass

         with pytest.raises(ValueError, match="Invalid granularity types"):
-            float8_dynamic_activation_float8_weight(
-                granularity=(UnsupportedGranularity(), UnsupportedGranularity())
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=(UnsupportedGranularity(), UnsupportedGranularity()),
             )

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -187,7 +193,8 @@ def test_per_row_with_float32(self):
         ):
             model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
-                model, float8_dynamic_activation_float8_weight(granularity=PerRow())
+                model,
+                Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
             )

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -199,19 +206,26 @@ def test_serialization(self, mode: str):
         # Create and quantize the model
         model = ToyLinearModel(16, 32).to(device="cuda")

+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        fp8wo_cur_version = Float8WeightOnlyConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
+        Float8WeightOnlyConfig.VERSION = 1
        mode_map = {
            "dynamic": partial(
-                float8_dynamic_activation_float8_weight, granularity=PerTensor()
+                Float8DynamicActivationFloat8WeightConfig,
+                granularity=PerTensor(),
            ),
-            "weight-only": float8_weight_only,
+            "weight-only": Float8WeightOnlyConfig,
            "static": partial(
-                float8_static_activation_float8_weight,
+                Float8StaticActivationFloat8WeightConfig,
                scale=torch.tensor(1.0, dtype=torch.float32, device="cuda"),
                granularity=PerTensor(),
            ),
        }
+
        factory = mode_map[mode]()
        quantize_(model, factory)
+        print("model:", model)

        # Save the state dict to an in-memory buffer
        buffer = io.BytesIO()
@@ -262,6 +276,10 @@ def test_serialization(self, mode: str):
            original_layer.weight.scale, new_layer.weight.scale
        ), f"Scales do not match for {layer_name}"

+        # restore in the end
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version
+        Float8WeightOnlyConfig.VERSION = fp8wo_cur_version
+
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    @unittest.skipIf(
        not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -274,9 +292,13 @@ def test_fp8_weight_dimension_warning(self):
        with self.assertLogs(
            "torchao.quantization.quant_api", level="INFO"
        ) as log_context:
+            fp8wo_cur_version = Float8WeightOnlyConfig.VERSION
+            Float8DynamicActivationFloat8WeightConfig.VERSION = 1
            quantize_(
-                model, float8_dynamic_activation_float8_weight(granularity=PerTensor())
+                model,
+                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
            )
+            Float8DynamicActivationFloat8WeightConfig.VERSION = fp8wo_cur_version
        print(model)

        # Verify warning messages for both layers
@@ -319,9 +341,13 @@ def test_mm_float8dq_per_row(
            torch.nn.Linear(in_features, out_features, bias=bias).to(device).to(dtype)
        )
        test_linear = copy.deepcopy(ref_linear)
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
        quantize_(
-            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            test_linear,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version

        quant_weight = test_linear.weight

@@ -471,9 +497,13 @@ def test_float8_tensor_slicing_basic(self, granularity):

        # Create and quantize a model
        model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
        quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version

        weight_impl = model.weight.original_weight_tensor.tensor_impl

@@ -505,9 +535,13 @@ def test_float8_tensor_slicing_per_tensor(self):

        # Create and quantize with per-tensor granularity
        model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
        quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version

        original_weight = model.weight
        original_impl = original_weight.original_weight_tensor.tensor_impl
@@ -536,9 +570,13 @@ def test_float8_tensor_slicing_per_row(self):

        # Create and quantize with per-row granularity
        model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
        quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version

        original_weight = model.weight  # Shape: (32, 64)
        original_impl = original_weight.original_weight_tensor.tensor_impl
@@ -574,9 +612,13 @@ def test_float8_tensor_slicing_edge_cases(self):

        # Create and quantize a model
        model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
        quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version

        original_weight = model.weight

@@ -611,10 +653,13 @@ def test_float8_tensor_slicing_functional_correctness(self, granularity):
            torch.nn.Linear(64, 48, bias=False).to(device).to(dtype)
        )  # 48 is divisible by 16
        quant_model = copy.deepcopy(ref_model)
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
        quantize_(
            quant_model,
            Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version

        # Create input with batch size that works well with slicing
        input_tensor = torch.randn(8, 64, device=device, dtype=dtype)
@@ -720,6 +765,7 @@ def test_preprocess_scale_3d_reshape(self):
        self.assertEqual(result.shape, expected_shape)

    @torch.no_grad()
+    @unittest.skip("test is flaky in CI, will turn on a bit later")
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    @unittest.skipIf(
        not is_sm_at_least_90(), "Requires GPU with compute capability >= 9.0"
@@ -743,7 +789,14 @@ def test_expected_kernels_on_gpu(self, granularity, torch_compile_mode):
        m = torch.nn.Sequential(
            torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
        )
-        quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=granularity))
+        fp8_dq_cur_version = Float8DynamicActivationFloat8WeightConfig.VERSION
+        Float8DynamicActivationFloat8WeightConfig.VERSION = 1
+        quantize_(
+            m,
+            Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
+        )
+        Float8DynamicActivationFloat8WeightConfig.VERSION = fp8_dq_cur_version
+
        m = torch.compile(m, mode=torch_compile_mode)
        x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
```
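The save/set/restore pattern around `VERSION` repeats in nearly every test above, and as written it skips the restore if the body raises. A small context manager would factor it out and make it exception-safe; `_config_version` below is a hypothetical helper sketched for illustration, not part of this PR:

```
from contextlib import contextmanager


@contextmanager
def _config_version(config_cls, version: int):
    """Temporarily pin config_cls.VERSION; restore it even if the body raises."""
    saved = config_cls.VERSION
    config_cls.VERSION = version
    try:
        yield
    finally:
        config_cls.VERSION = saved


# Example use inside a test:
# with _config_version(Float8DynamicActivationFloat8WeightConfig, 1):
#     quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
```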

test/float8/test_base.py

Lines changed: 3 additions & 3 deletions

```
@@ -473,10 +473,10 @@ def test_quantize(self):
         m = nn.Sequential(nn.Linear(32, 32)).cuda()
         m = convert_to_float8_training(m)
         assert isinstance(m[0], Float8Linear), "Module is not a Float8Linear"
-        from torchao.quantization.quant_api import float8_weight_only, quantize_
+        from torchao.quantization import Float8WeightOnlyConfig, quantize_

-        quantize_(m, float8_weight_only())
-        assert m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn, (
+        quantize_(m, Float8WeightOnlyConfig())
+        assert m[0].weight.qdata.dtype == torch.float8_e4m3fn, (
             "Post quantization dtype should be torch.float8_e4m3fn"
         )
         with torch.no_grad():
```
test/integration/test_loading_deprecated_checkpoint.py

Lines changed: 65 additions & 0 deletions

```
@@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+import unittest
+import warnings
+
+import torch
+from torch.testing._internal import common_utils
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from torchao.utils import is_sm_at_least_89
+
+_MODEL_NAMES = [
+    "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
+]
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
+class TestLoadingDeprecatedCheckpoint(TestCase):
+    @common_utils.parametrize("model_name", _MODEL_NAMES)
+    def test_load_model_and_run(self, model_name):
+        """Test that we print the correct warning message when loading a deprecated checkpoint"""
+        # Load and quantize model
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            quantized_model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype="bfloat16",
+                device_map="cuda",
+            )
+        assert any(
+            "Stored version is not the same as current default version of the config"
+            in str(w.message)
+            for w in caught_warnings
+        ), "Didn't get expected warning message for version mismatch"
+
+        assert any(
+            "Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated"
+            in str(w.message)
+            for w in caught_warnings
+        ), "Didn't get expected warning message for deprecation"
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        prompt = ("Hello, my name is",)
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+        ).to("cuda")
+        generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+        # make sure it runs
+        _ = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+
+common_utils.instantiate_parametrized_tests(TestLoadingDeprecatedCheckpoint)
+
+if __name__ == "__main__":
+    run_tests()
```
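As a side note, the warning assertions in this file (and in test_config.py above) could also be written with `pytest.warns`, which fails the test automatically when no matching warning fires. A sketch, not part of this PR:

```
import pytest
from transformers import AutoModelForCausalLM


def test_load_deprecated_checkpoint_warns():
    # pytest.warns fails the test if no matching UserWarning is raised.
    with pytest.warns(
        UserWarning,
        match="Stored version is not the same as current default version",
    ):
        AutoModelForCausalLM.from_pretrained(
            "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
            torch_dtype="bfloat16",
            device_map="cuda",
        )
```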
