From d19a6ace4c1ac09ae715dc82086d00bbe6abab52 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 29 Jan 2025 02:23:46 +0000 Subject: [PATCH 1/2] Relax dim constraint in MXFP8 tests Dims are multiples of 32 instead of 128. Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 18 +++---- tests/pytorch/distributed/test_fusible_ops.py | 8 +-- tests/pytorch/test_fusible_ops.py | 50 ++++++++----------- 3 files changed, 33 insertions(+), 43 deletions(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index 39fbd265e7..492ff15f67 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -79,6 +79,8 @@ def main(argv=None, namespace=None): parser.add_argument("--quantization", type=str, default=None) args = parser.parse_args(argv, namespace) + QUANTIZATION = args.quantization + test_dict = [ test_linear, test_layernorm, @@ -87,14 +89,6 @@ def main(argv=None, namespace=None): test_transformer_layer, ] - # Quantization scheme - QUANTIZATION = args.quantization - if QUANTIZATION == "mxfp8": - global SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE - SEQ_LEN = 64 - BATCH_SIZE = 64 - HIDDEN_SIZE = 256 - for test in test_dict: test() dist.destroy_process_group() @@ -575,7 +569,9 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg """ # Set parameter data type params_dtype = kwargs.get("params_dtype", torch.float32) - FFN_HIDDEN_SIZE = {None: 32, "fp8": 64, "mxfp8": 256}[QUANTIZATION] + FFN_HIDDEN_SIZE = ( + 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 + ) # larger tensors lead to numerical failures with tight atol and rtol # Create models model_single_node = te.LayerNormMLP(HIDDEN_SIZE, FFN_HIDDEN_SIZE, **kwargs) @@ -665,7 +661,9 @@ def test_layernorm_mlp(): @run_distributed_test() def _test_transformer_layer_parallel(sequence_parallel=False, **kwargs): params_dtype = kwargs.get("params_dtype", torch.float32) - FFN_HIDDEN_SIZE = {None: 32, "fp8": 64, "mxfp8": 256}[QUANTIZATION] + FFN_HIDDEN_SIZE = ( + 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 + ) # larger tensors lead to numerical failures with tight atol and rtol model_single_node = te.TransformerLayer( HIDDEN_SIZE, FFN_HIDDEN_SIZE, NR_HEADS, attention_dropout=0, hidden_dropout=0, **kwargs diff --git a/tests/pytorch/distributed/test_fusible_ops.py b/tests/pytorch/distributed/test_fusible_ops.py index 11a7df5852..d5be2a969e 100644 --- a/tests/pytorch/distributed/test_fusible_ops.py +++ b/tests/pytorch/distributed/test_fusible_ops.py @@ -315,8 +315,8 @@ def _test_reduce_scatter( def _test_basic_linear( *, - local_weight_shape: tuple[int, int] = (128, 128), - local_batch_size: int = 128, + local_weight_shape: tuple[int, int] = (16, 16), + local_batch_size: int = 16, dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str] = None, @@ -459,8 +459,8 @@ def _test_basic_linear( def _test_linear( *, bias: bool = True, - local_weight_shape: tuple[int, int] = (128, 128), - local_batch_size: int = 128, + local_weight_shape: tuple[int, int] = (16, 16), + local_batch_size: int = 16, dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str] = None, diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index b2bd623ad8..c35ba71b15 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -64,8 +64,8 @@ def maybe_skip_quantization( if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: 
pytest.skip("FP8 GEMMs require dims that are divisible by 16") elif quantization == "mxfp8": - if math.prod(dims[:-1]) % 128 != 0 or dims[-1] % 128 != 0: - pytest.skip("FP8 GEMMs require dims that are divisible by 128") + if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: + pytest.skip("MXFP8 GEMMs require dims that are divisible by 16") # Check if device is supported if device is not None and torch.device(device).type != "cuda": @@ -368,6 +368,7 @@ def test_fp8_scale_update( def test_dtype_cast( self, *, + size: int = 16, init_dtype: torch.dtype, final_dtype: torch.dtype, device: torch.device = "cuda", @@ -379,11 +380,6 @@ def test_dtype_cast( maybe_skip_quantization(quantization, device=device) with_quantization = quantization is not None - # Data dimensions - size = 16 - if quantization == "mxfp8": - size = 128 - # Random data dtype = torch.float32 if torch.float16 in (init_dtype, final_dtype): @@ -437,6 +433,7 @@ def test_dtype_cast( def test_pyt_autocast( self, *, + size: int = 16, model_dtype: torch.dtype, autocast_dtype: torch.dtype, device: torch.device = "cuda", @@ -450,11 +447,6 @@ def test_pyt_autocast( quantized_compute = quantization is not None maybe_skip_quantization(quantization) - # Data dimensions - size = 16 - if quantization == "mxfp8": - size = 128 - # Construct operation recipe = make_recipe(quantization) with te.fp8_model_init(enabled=quantized_weights, recipe=recipe): @@ -692,7 +684,7 @@ def test_bias( def test_quantize( self, *, - in_shape: Iterable[int] = (128, 128), + in_shape: Iterable[int] = (16, 16), dtype: torch.dtype = torch.bfloat16, device: torch.device = "cuda", quantization: str, @@ -859,8 +851,8 @@ def _test_basic_linear( ) torch.testing.assert_close(dw_test, w_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((128, 128), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (4, 4, 8, -1))) + @pytest.mark.parametrize("weight_shape", ((48, 16), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (2, 2, 4, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("accumulate_into_main_grad", (False, True)) @@ -921,8 +913,8 @@ def test_linear( self, *, bias: bool, - weight_shape: tuple[int, int] = (128, 128), - in_shape: Iterable[int] = (128, -1), + weight_shape: tuple[int, int] = (16, 16), + in_shape: Iterable[int] = (16, -1), dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str], @@ -1012,8 +1004,8 @@ def test_linear( db_test = op.bias.grad.to(dtype=torch.float64, device="cpu") torch.testing.assert_close(db_test, b_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((7, 2), (128,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 64, -1))) + @pytest.mark.parametrize("weight_shape", ((7, 2), (16,))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1182,8 +1174,8 @@ def test_layer_norm_autocast( torch.testing.assert_close(dw_test, w_ref.grad, **dtype_tols(dtype)) torch.testing.assert_close(db_test, b_ref.grad, **dtype_tols(dtype)) - @pytest.mark.parametrize("weight_shape", ((19,), (128,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 64, -1))) + @pytest.mark.parametrize("weight_shape", ((19,), (64,))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) 
@pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1395,7 +1387,7 @@ def test_make_extra_output( torch.testing.assert_close(dx_test, x_ref.grad, **tols) @pytest.mark.parametrize("activation", ("relu", "gelu", "geglu", "reglu", "swiglu")) - @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (128, 1, 128))) + @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (4, 1, 16))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) def test_activation( @@ -1491,7 +1483,7 @@ def test_activation( def test_swiglu( self, *, - out_shape: Iterable[int] = (128, 128), + out_shape: Iterable[int] = (16, 16), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1560,8 +1552,8 @@ def setup_class(cls) -> None: torch.manual_seed(seed) torch.cuda.manual_seed(seed) - @pytest.mark.parametrize("weight_shape", ((128, 128), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (128, -1))) + @pytest.mark.parametrize("weight_shape", ((32, 48), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (4, 2, 10, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("quantized_weight", (False, True)) @@ -1678,8 +1670,8 @@ def test_forward_linear_bias_add( self, *, bias: bool, - weight_shape: tuple[int, int] = (128, 128), - in_shape: Iterable[int] = (128, -1), + weight_shape: tuple[int, int] = (16, 16), + in_shape: Iterable[int] = (16, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1791,8 +1783,8 @@ def test_forward_linear_bias_add( def test_backward_linear_add( self, *, - weight_shape: tuple[int, int] = (128, 128), - in_shape: Iterable[int] = (128, -1), + weight_shape: tuple[int, int] = (16, 16), + in_shape: Iterable[int] = (16, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], From 4129b3765cab6b8add03346a9df434bd0faf1eda Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 31 Jan 2025 01:37:09 +0000 Subject: [PATCH 2/2] Make tensor dims multiples of 32 Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 12 +++---- tests/pytorch/test_fusible_ops.py | 40 +++++++++++------------ 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index 492ff15f67..7e3a9bb39b 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -23,8 +23,8 @@ ) from run_layer_with_overlap import _compare_tensors -SEQ_LEN, BATCH_SIZE = 16, 16 -HIDDEN_SIZE = 64 +SEQ_LEN, BATCH_SIZE = 32, 32 +HIDDEN_SIZE = 128 NR_HEADS = 4 WORLD_RANK, WORLD_SIZE = None, None NCCL_WORLD = None @@ -569,9 +569,7 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg """ # Set parameter data type params_dtype = kwargs.get("params_dtype", torch.float32) - FFN_HIDDEN_SIZE = ( - 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 - ) # larger tensors lead to numerical failures with tight atol and rtol + FFN_HIDDEN_SIZE = 128 # Create models model_single_node = te.LayerNormMLP(HIDDEN_SIZE, FFN_HIDDEN_SIZE, **kwargs) @@ -661,9 +659,7 @@ def test_layernorm_mlp(): @run_distributed_test() def _test_transformer_layer_parallel(sequence_parallel=False, **kwargs): params_dtype = kwargs.get("params_dtype", 
torch.float32) - FFN_HIDDEN_SIZE = ( - 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 - ) # larger tensors lead to numerical failures with tight atol and rtol + FFN_HIDDEN_SIZE = 128 model_single_node = te.TransformerLayer( HIDDEN_SIZE, FFN_HIDDEN_SIZE, NR_HEADS, attention_dropout=0, hidden_dropout=0, **kwargs diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index c35ba71b15..4506fb628d 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -64,8 +64,8 @@ def maybe_skip_quantization( if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: pytest.skip("FP8 GEMMs require dims that are divisible by 16") elif quantization == "mxfp8": - if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: - pytest.skip("MXFP8 GEMMs require dims that are divisible by 16") + if math.prod(dims[:-1]) % 32 != 0 or dims[-1] % 32 != 0: + pytest.skip("MXFP8 GEMMs require dims that are divisible by 32") # Check if device is supported if device is not None and torch.device(device).type != "cuda": @@ -368,7 +368,7 @@ def test_fp8_scale_update( def test_dtype_cast( self, *, - size: int = 16, + size: int = 32, init_dtype: torch.dtype, final_dtype: torch.dtype, device: torch.device = "cuda", @@ -433,7 +433,7 @@ def test_dtype_cast( def test_pyt_autocast( self, *, - size: int = 16, + size: int = 32, model_dtype: torch.dtype, autocast_dtype: torch.dtype, device: torch.device = "cuda", @@ -684,7 +684,7 @@ def test_bias( def test_quantize( self, *, - in_shape: Iterable[int] = (16, 16), + in_shape: Iterable[int] = (32, 32), dtype: torch.dtype = torch.bfloat16, device: torch.device = "cuda", quantization: str, @@ -851,8 +851,8 @@ def _test_basic_linear( ) torch.testing.assert_close(dw_test, w_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((48, 16), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (2, 2, 4, -1))) + @pytest.mark.parametrize("weight_shape", ((64, 32), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (4, 2, 4, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("accumulate_into_main_grad", (False, True)) @@ -913,8 +913,8 @@ def test_linear( self, *, bias: bool, - weight_shape: tuple[int, int] = (16, 16), - in_shape: Iterable[int] = (16, -1), + weight_shape: tuple[int, int] = (32, 32), + in_shape: Iterable[int] = (32, -1), dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str], @@ -1004,8 +1004,8 @@ def test_linear( db_test = op.bias.grad.to(dtype=torch.float64, device="cpu") torch.testing.assert_close(db_test, b_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((7, 2), (16,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) + @pytest.mark.parametrize("weight_shape", ((7, 2), (32,))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 16, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1175,7 +1175,7 @@ def test_layer_norm_autocast( torch.testing.assert_close(db_test, b_ref.grad, **dtype_tols(dtype)) @pytest.mark.parametrize("weight_shape", ((19,), (64,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 16, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) 
@pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1387,7 +1387,7 @@ def test_make_extra_output( torch.testing.assert_close(dx_test, x_ref.grad, **tols) @pytest.mark.parametrize("activation", ("relu", "gelu", "geglu", "reglu", "swiglu")) - @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (4, 1, 16))) + @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (32, 1, 32))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) def test_activation( @@ -1483,7 +1483,7 @@ def test_activation( def test_swiglu( self, *, - out_shape: Iterable[int] = (16, 16), + out_shape: Iterable[int] = (32, 32), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1552,8 +1552,8 @@ def setup_class(cls) -> None: torch.manual_seed(seed) torch.cuda.manual_seed(seed) - @pytest.mark.parametrize("weight_shape", ((32, 48), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (4, 2, 10, -1))) + @pytest.mark.parametrize("weight_shape", ((32, 64), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (8, 2, 10, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("quantized_weight", (False, True)) @@ -1670,8 +1670,8 @@ def test_forward_linear_bias_add( self, *, bias: bool, - weight_shape: tuple[int, int] = (16, 16), - in_shape: Iterable[int] = (16, -1), + weight_shape: tuple[int, int] = (32, 32), + in_shape: Iterable[int] = (32, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1783,8 +1783,8 @@ def test_forward_linear_bias_add( def test_backward_linear_add( self, *, - weight_shape: tuple[int, int] = (16, 16), - in_shape: Iterable[int] = (16, -1), + weight_shape: tuple[int, int] = (32, 32), + in_shape: Iterable[int] = (32, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str],