
Commit 897ec7e

Add Float8Tensor
Summary:

Added Float8Tensor that works for:

* fbgemm: per-row activation + per-row weight, calling the torch.ops.fbgemm.f8f8bf16_rowwise kernel
* aten: per-row/per-tensor activation + per-row/per-tensor weight, calling torch._scaled_mm, or weight-only quantization (fallback path)

Float8DynamicActivationFloat8WeightConfig is reused for the above, with its kernel option controlling which kernel users will use.

Test Plan:

python test/dtypes/test_affine_quantized_float.py
python test/quantization/quantize_/test_float8_rowwise_tensor.py

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2463, branch: jerryzh168/stack/9
1 parent cc359e6 commit 897ec7e
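
For orientation, a minimal usage sketch assembled from the summary above and the test changes below. It assumes PerRow is importable from torchao.quantization.granularity and that the payload/scale attribute names match the tests; the kernel option mentioned in the summary is omitted here because its exact signature is not shown in this commit.

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    Float8Tensor,
    quantize_,
)
from torchao.quantization.granularity import PerRow

# Toy layer; shapes chosen to match the tests below.
model = torch.nn.Linear(64, 32, dtype=torch.bfloat16, device="cuda")

# Per-row activation + per-row weight; the same config drives both the
# fbgemm (torch.ops.fbgemm.f8f8bf16_rowwise) and aten (torch._scaled_mm) paths.
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

# The weight is now a Float8Tensor exposing its payload and scale as plain attributes.
assert isinstance(model.weight, Float8Tensor)
assert model.weight.float8_data.shape == (32, 64)
assert model.weight.scale.shape == (32, 1)  # one scale per output row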

7 files changed: +763, −137 lines


test/dtypes/test_affine_quantized_float.py

Lines changed: 38 additions & 83 deletions
@@ -25,10 +25,11 @@
 from torch._inductor.test_case import TestCase as InductorTestCase
 from torch.testing._internal import common_utils
 
-from torchao.dtypes.floatx.float8_layout import Float8AQTTensorImpl, preprocess_scale
+from torchao.dtypes.floatx.float8_layout import preprocess_scale
 from torchao.float8.float8_utils import compute_error
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
+    Float8Tensor,
     float8_dynamic_activation_float8_weight,
     float8_weight_only,
     quantize_,
@@ -236,12 +237,8 @@ def test_serialization(self, mode: str):
             new_layer = getattr(new_model, layer_name)
 
             # Compare weights
-            if mode == "weight-only":
-                original_weight = original_layer.weight.tensor_impl.float8_data.to(
-                    torch.float32
-                )
-                new_weight = new_layer.weight.tensor_impl.float8_data.to(torch.float32)
-            else:
+            if mode == "static":
+                # note: we haven't migrated static quant to the new API
                 original_weight = original_layer.weight.original_weight_tensor.tensor_impl.float8_data.to(
                     torch.float32
                 )
@@ -250,6 +247,9 @@ def test_serialization(self, mode: str):
                         torch.float32
                     )
                 )
+            else:
+                original_weight = original_layer.weight.float8_data.to(torch.float32)
+                new_weight = new_layer.weight.float8_data.to(torch.float32)
 
             assert torch.allclose(original_weight, new_weight), (
                 f"Weights do not match for {layer_name}"
@@ -324,19 +324,15 @@ def test_mm_float8dq_per_row(
 
         quant_weight = test_linear.weight
 
-        self.assertTrue(hasattr(quant_weight, "original_weight_tensor"))
-        weight_impl = quant_weight.original_weight_tensor.tensor_impl
-
-        self.assertTrue(hasattr(weight_impl, "float8_data"))
-        self.assertTrue(hasattr(weight_impl, "scale"))
-        self.assertFalse(weight_impl.transposed)
+        self.assertTrue(hasattr(quant_weight, "float8_data"))
+        self.assertTrue(hasattr(quant_weight, "scale"))
 
         # Verify scale shape for row-wise quantization
         expected_scale_shape = (out_features, 1)
-        actual_scale_shape = weight_impl.scale.shape
+        actual_scale_shape = quant_weight.scale.shape
         self.assertEqual(actual_scale_shape, expected_scale_shape)
 
-        self.assertEqual(weight_impl.float8_data.shape, (out_features, in_features))
+        self.assertEqual(quant_weight.float8_data.shape, (out_features, in_features))
 
         input_tensor = torch.randn(*input_shape, device=device, dtype=dtype)
 
@@ -357,7 +353,7 @@ def test_mm_float8dq_per_row(
     @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
     @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16])
     @common_utils.parametrize("block_size", [None, (1, 32), (2, 16), (4, 8)])
-    def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
+    def test__dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
         """Test _dequantize_affine_float8 with various configurations"""
 
         device = "cuda"
@@ -387,7 +383,7 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
-    def test_dequantize_affine_float8_scale_broadcasting(self):
+    def test__dequantize_affine_float8_scale_broadcasting(self):
        """Test that scale broadcasting works correctly for block-wise quantization"""
         device = "cuda"
         # Create input tensor with known block structure
@@ -431,24 +427,24 @@ def test_float8_tensor_slicing_basic(self, granularity):
             model, Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         )
 
-        weight_impl = model.weight.original_weight_tensor.tensor_impl
+        weight = model.weight
 
         # Test dimension 0 slicing (rows)
-        sliced_0 = weight_impl[10:20]
+        sliced_0 = weight[10:20]
         self.assertEqual(sliced_0.shape, (10, 64))
 
         # Test dimension 1 slicing (columns)
-        sliced_1 = weight_impl[:, 20:40]
+        sliced_1 = weight[:, 20:40]
         self.assertEqual(sliced_1.shape, (32, 20))
 
         # Test combined slicing
-        sliced_both = weight_impl[5:15, 10:30]
+        sliced_both = weight[5:15, 10:30]
         self.assertEqual(sliced_both.shape, (10, 20))
 
         # Verify the sliced tensors are still Float8 tensors
-        self.assertTrue(isinstance(sliced_0, Float8AQTTensorImpl))
-        self.assertTrue(isinstance(sliced_1, Float8AQTTensorImpl))
-        self.assertTrue(isinstance(sliced_both, Float8AQTTensorImpl))
+        self.assertTrue(isinstance(sliced_0, Float8Tensor))
+        self.assertTrue(isinstance(sliced_1, Float8Tensor))
+        self.assertTrue(isinstance(sliced_both, Float8Tensor))
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(
@@ -466,16 +462,15 @@ def test_float8_tensor_slicing_per_tensor(self):
         )
 
         original_weight = model.weight
-        original_impl = original_weight.original_weight_tensor.tensor_impl
-        original_scale = original_impl.scale
+        original_scale = original_weight.scale
 
         # Test slicing
         sliced_weight = original_weight[10:20, 20:40]
-        sliced_impl = sliced_weight.original_weight_tensor.tensor_impl
+        sliced_scale = sliced_weight.scale
 
         # For per-tensor quantization, scale should be identical
-        self.assertTrue(torch.equal(original_scale, sliced_impl.scale))
-        self.assertEqual(sliced_impl.scale.numel(), 1)
+        self.assertTrue(torch.equal(original_scale, sliced_scale))
+        self.assertEqual(sliced_scale.numel(), 1)
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(
@@ -497,27 +492,26 @@ def test_float8_tensor_slicing_per_row(self):
         )
 
         original_weight = model.weight  # Shape: (32, 64)
-        original_impl = original_weight.original_weight_tensor.tensor_impl
-        original_scale = original_impl.scale  # Shape: (32, 1)
+        original_scale = model.weight.scale  # Shape: (32, 1)
 
         # Test row slicing (dimension 0)
         sliced_rows = original_weight[10:20]  # Shape: (10, 64)
-        sliced_impl = sliced_rows.original_weight_tensor.tensor_impl
+        sliced_scale = sliced_rows.scale
 
         # Scale should be sliced to match the rows
         expected_scale_shape = (10, 1)
-        self.assertEqual(sliced_impl.scale.shape, expected_scale_shape)
+        self.assertEqual(sliced_scale.shape, expected_scale_shape)
 
         # Verify the scale values are correct (should be subset of original)
-        self.assertTrue(torch.equal(sliced_impl.scale, original_scale[10:20]))
+        self.assertTrue(torch.equal(sliced_scale, original_scale[10:20]))
 
         # Test column slicing (dimension 1) - scale should not change for per-row
         sliced_cols = original_weight[:, 20:40]  # Shape: (32, 20)
-        sliced_cols_impl = sliced_cols.original_weight_tensor.tensor_impl
+        sliced_cols_scale = sliced_cols.scale
 
         # Scale shape should remain the same since we're not changing rows
-        self.assertEqual(sliced_cols_impl.scale.shape, (32, 1))
-        self.assertTrue(torch.equal(sliced_cols_impl.scale, original_scale))
+        self.assertEqual(sliced_cols_scale.shape, (32, 1))
+        self.assertTrue(torch.equal(sliced_cols_scale, original_scale))
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(
@@ -552,11 +546,11 @@ def test_float8_tensor_slicing_edge_cases(self):
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
-    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     @unittest.skipIf(
         is_sm_version(8, 9),
         "TODO: AssertionError: tensor(-2.1562, device='cuda:0', dtype=torch.bfloat16) not greater than 15",
     )
+    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     def test_float8_tensor_slicing_functional_correctness(self, granularity):
         """Test that sliced tensors produce correct results in computations"""
         device = "cuda"
@@ -579,15 +573,16 @@ def test_float8_tensor_slicing_functional_correctness(self, granularity):
         quant_weight_slice = quant_model.weight[0:16, 0:32]
 
         # Verify that the sliced weights maintain Float8 properties
-        self.assertTrue(hasattr(quant_weight_slice, "original_weight_tensor"))
-        sliced_impl = quant_weight_slice.original_weight_tensor.tensor_impl
-        self.assertTrue(isinstance(sliced_impl, Float8AQTTensorImpl))
+        self.assertTrue(hasattr(quant_weight_slice, "float8_data"))
+        self.assertTrue(hasattr(quant_weight_slice, "scale"))
+        sliced_impl = quant_weight_slice
+        self.assertTrue(isinstance(sliced_impl, Float8Tensor))
 
         # Verify sliced weight shapes
         self.assertEqual(sliced_impl.float8_data.shape, (16, 32))
 
         # Get original quantized weight implementation for scale comparison
-        original_quant_impl = quant_model.weight.original_weight_tensor.tensor_impl
+        original_quant_impl = quant_model.weight
 
         # Verify scale properties based on granularity
         if isinstance(granularity, PerTensor):
@@ -604,7 +599,7 @@ def test_float8_tensor_slicing_functional_correctness(self, granularity):
             )
 
         # Verify that sliced quantized data matches the correct slice from original
-        original_float8_data_slice = original_quant_impl.float8_data[0:16, 0:32]
+        original_float8_data_slice = quant_model.weight.float8_data[0:16, 0:32]
         self.assertTrue(
             torch.equal(sliced_impl.float8_data, original_float8_data_slice)
         )
@@ -675,46 +670,6 @@ def test_preprocess_scale_3d_reshape(self):
         expected_shape = (8, 1)  # Flattened (2*2*2, 1)
         self.assertEqual(result.shape, expected_shape)
 
-    @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
-    @common_utils.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
-    def test_quantize_dequantize_fp8_inductor(self, float8_dtype, hp_dtype):
-        quantize_affine_float8 = torch.ops.torchao.quantize_affine_float8
-        dequantize_affine_float8 = torch.ops.torchao.dequantize_affine_float8
-        input = torch.randn(10, 10)
-        with torch.no_grad():
-            torch._dynamo.reset()
-            expected_scale = torch.tensor(2.0)
-            expected_quantized = quantize_affine_float8(
-                input,
-                expected_scale,
-                float8_dtype=float8_dtype,
-            )
-            expected_dequantized = dequantize_affine_float8(
-                expected_quantized,
-                expected_scale,
-                output_dtype=hp_dtype,
-            )
-            test_q, (code_q,) = torch._inductor.utils.run_and_get_code(
-                torch.compile(quantize_affine_float8),
-                input,
-                expected_scale,
-                float8_dtype=float8_dtype,
-            )
-            torch.testing.FileCheck().check(
-                "torch.ops.torchao.quantize_affine_float8.default"
-            ).run(code_q)
-            test_dq, (code_dq,) = torch._inductor.utils.run_and_get_code(
-                torch.compile(dequantize_affine_float8),
-                test_q,
-                expected_scale,
-                hp_dtype,
-            )
-            torch.testing.FileCheck().check(
-                "torch.ops.torchao.dequantize_affine_float8.default"
-            ).run(code_dq)
-            torch.testing.assert_close(expected_quantized, test_q)
-            torch.testing.assert_close(expected_dequantized, test_dq)
-
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
 
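The recurring change across these tests is the accessor path: what previously required unwrapping through weight.original_weight_tensor.tensor_impl is now read directly off the Float8Tensor, and slicing carries the per-row scale along with the rows. A small before/after sketch mirroring the assertions above (same assumptions as the sketch near the top; illustrative, not an API reference):

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    Float8Tensor,
    quantize_,
)
from torchao.quantization.granularity import PerRow

model = torch.nn.Linear(64, 32, dtype=torch.bfloat16, device="cuda")
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
weight = model.weight

# Old (AffineQuantizedTensor) path, removed by this commit:
#   weight.original_weight_tensor.tensor_impl.float8_data
#   weight.original_weight_tensor.tensor_impl.scale
# New Float8Tensor path: payload and scale are direct attributes.
assert isinstance(weight, Float8Tensor)

# Row slicing keeps the subclass and slices the per-row scale with the rows;
# column slicing leaves the (out_features, 1) scale untouched.
sliced_rows = weight[10:20]
assert isinstance(sliced_rows, Float8Tensor)
assert torch.equal(sliced_rows.scale, weight.scale[10:20])

sliced_cols = weight[:, 20:40]
assert sliced_cols.scale.shape == (32, 1)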