
Commit 0163f2f

add split mxfp8 quantized tensor and enable mxfp8 input for grouped linear

Signed-off-by: Xin Yao <[email protected]>

1 parent f1b18ed
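
For orientation before the diff: the new GroupedLinear path lets the gated-activation output be quantized to MXFP8 once in the forward pass and passed straight into the grouped GEMM. Below is a minimal sketch of that usage, based on the test added in this commit; the layer sizes, token splits, and the `te`/`recipe` import aliases are illustrative assumptions, not part of the commit.

# Minimal sketch of the MXFP8-input path exercised by test_grouped_linear_fp8_input below.
# Sizes and splits are illustrative; requires an MXFP8-capable GPU.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

num_gemms, hidden = 4, 256
m_splits = [128] * num_gemms  # tokens per GEMM, each divisible by 32 for MXFP8

grouped_linear = te.GroupedLinear(
    num_gemms, hidden, 4 * hidden, bias=False, params_dtype=torch.bfloat16, device="cuda"
)

# Gated activation whose output is quantized to MXFP8 in the forward pass only,
# so GroupedLinear receives a QuantizedTensor instead of a bf16 tensor.
activation = te.ops.Sequential(
    te.ops.SwiGLU(),
    te.ops.Quantize(forward=True, backward=False),
)

# SwiGLU halves the feature dimension, so the raw input is 2 * hidden wide.
inp = torch.randn((sum(m_splits), 2 * hidden), dtype=torch.bfloat16, device="cuda")

with te.fp8_autocast(enabled=True, fp8_recipe=recipe.MXFP8BlockScaling()):
    out = grouped_linear(activation(inp), m_splits)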

File tree: 8 files changed (+316, -59 lines)

tests/pytorch/test_numerics.py

Lines changed: 140 additions & 3 deletions
@@ -12,11 +12,13 @@
 import torch.nn as nn
 from torch.nn import Parameter
 
+import transformer_engine.pytorch as te
 from transformer_engine.pytorch.fp8 import (
     FP8GlobalStateManager,
     fp8_autocast,
     fp8_model_init,
 )
+from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Quantizer
 from transformer_engine.pytorch.utils import (
     init_method_normal,
     scaled_init_method_normal,
@@ -1697,13 +1699,16 @@ def _test_grouped_linear_accuracy(
     fp8,
     fuse_wgrad_accumulation,
     delay_wgrad_compute=False,
+    activation_func=None,  # assume gated activation function
 ):
     reset_rng_states()
     if fp8:
         FP8GlobalStateManager.reset()
 
+    # assume gated activation function
+    hidden_size = config.hidden_size if activation_func is None else 2 * config.hidden_size
     inp_hidden_states = torch.randn(
-        (config.max_seqlen_q, bs, config.hidden_size),
+        (config.max_seqlen_q, bs, hidden_size),
         dtype=dtype,
         device="cuda",
         requires_grad=True,
@@ -1728,11 +1733,11 @@ def _test_grouped_linear_accuracy(
     with fp8_autocast(enabled=fp8, fp8_recipe=recipe):
         if isinstance(block, GroupedLinear):
             m_splits = m_splits * bs
-            out = block(inp_hidden_states, m_splits.tolist())
+            out = block(activation_func(inp_hidden_states), m_splits.tolist())
         else:
             out = torch.cat(
                 [
-                    block[i](inp)
+                    block[i](activation_func(inp))
                     for i, inp in enumerate(torch.split(inp_hidden_states, m_splits.tolist()))
                 ]
             )
@@ -1967,6 +1972,92 @@ def test_grouped_linear_accuracy_single_gemm(recipe):
 )


+@pytest.mark.skipif(not mxfp8_available, reason="MXFP8 is not available")
+@pytest.mark.parametrize("dtype", param_types, ids=str)
+@pytest.mark.parametrize("num_gemms", [3, 6])
+@pytest.mark.parametrize("bs", batch_sizes)
+@pytest.mark.parametrize("model", ["126m"])
+@pytest.mark.parametrize("recipe", [recipe.MXFP8BlockScaling()])
+@pytest.mark.parametrize("fp8_model_params", all_boolean)
+def test_grouped_linear_fp8_input(
+    dtype,
+    num_gemms,
+    bs,
+    model,
+    recipe,
+    fp8_model_params,
+):
+    config = model_configs[model]
+    if config.max_seqlen_q % 32 != 0:
+        pytest.skip("MXFP8 requires sequence length to be divisible by 32.")
+
+    with fp8_model_init(enabled=fp8_model_params, recipe=recipe):
+        grouped_linear_bf16_input = GroupedLinear(
+            num_gemms,
+            config.hidden_size,
+            4 * config.hidden_size,
+            bias=False,
+            params_dtype=dtype,
+            device="cuda",
+            fuse_wgrad_accumulation=True,
+        ).eval()
+
+        grouped_linear_fp8_input = GroupedLinear(
+            num_gemms,
+            config.hidden_size,
+            4 * config.hidden_size,
+            bias=False,
+            params_dtype=dtype,
+            device="cuda",
+            fuse_wgrad_accumulation=True,
+        ).eval()
+
+    # Share params
+    with torch.no_grad():
+        for i in range(num_gemms):
+            setattr(
+                grouped_linear_fp8_input,
+                f"weight{i}",
+                Parameter(getattr(grouped_linear_bf16_input, f"weight{i}").clone()),
+            )
+            weight_i = getattr(grouped_linear_bf16_input, f"weight{i}")
+            weight_i.main_grad = torch.rand_like(weight_i, dtype=torch.float32)
+            weight_i_copy = getattr(grouped_linear_fp8_input, f"weight{i}")
+            weight_i_copy.main_grad = weight_i.main_grad.clone()
+
+    bf16_activation = te.ops.SwiGLU()
+    fp8_activation = te.ops.Sequential(
+        te.ops.SwiGLU(),
+        te.ops.Quantize(forward=True, backward=False),  # Output QuantizedTensor in forward
+    )
+
+    outputs_ref = _test_grouped_linear_accuracy(
+        grouped_linear_bf16_input,
+        num_gemms,
+        bs,
+        dtype,
+        config,
+        recipe,
+        fp8=True,
+        fuse_wgrad_accumulation=True,
+        activation_func=bf16_activation,
+    )
+    outputs = _test_grouped_linear_accuracy(
+        grouped_linear_fp8_input,
+        num_gemms,
+        bs,
+        dtype,
+        config,
+        recipe,
+        fp8=True,
+        fuse_wgrad_accumulation=True,
+        activation_func=fp8_activation,
+    )
+    # Should be a bit-wise match
+    for i, (o, o_ref) in enumerate(zip(outputs, outputs_ref)):
+        torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
+
+
 def _test_padding_grouped_linear_accuracy(block, num_gemms, bs, dtype, config, recipe, fp8=False):

     def _pad_tensor_for_fp8(hidden_states, tokens_per_expert):
@@ -2706,3 +2797,49 @@ def _run_module(m, inp):
     out = _run_module(g2, b)

     assert_allclose(out, outT, 1e-7)
+
+
+@pytest.mark.skipif(not mxfp8_available, reason="MXFP8 is not available")
+@pytest.mark.parametrize("dtype", param_types, ids=str)
+@pytest.mark.parametrize("num_experts", [8])
+@pytest.mark.parametrize("m", [64, 128, 256])
+@pytest.mark.parametrize("k", [64, 128, 256])
+def test_split_quantized_tensor(dtype, num_experts, m, k):
+
+    tensor = torch.randn((m * num_experts, k), dtype=dtype, device="cuda")
+    m_splits = [m] * num_experts
+
+    quantizer = MXFP8Quantizer(
+        fp8_dtype=tex.DType.kFloat8E4M3,
+        rowwise=True,
+        columnwise=True,
+    )
+
+    # Split and quantize one by one
+    ref_mxfp8 = tex.split_quantize(tensor, m_splits, [quantizer] * num_experts)
+
+    # Quantize as a whole and then split
+    out_mxfp8 = tex.split_quantized_tensor(quantizer.quantize(tensor), m_splits)
+
+    for ref, out in zip(ref_mxfp8, out_mxfp8):
+        assert ref._quantizer.rowwise_usage == out._quantizer.rowwise_usage
+        assert ref._quantizer.columnwise_usage == out._quantizer.columnwise_usage
+        assert ref._quantizer.dtype == out._quantizer.dtype
+        assert ref._quantizer.internal == out._quantizer.internal
+
+        torch.testing.assert_close(ref._rowwise_data, out._rowwise_data, rtol=0, atol=0)
+        # Padded areas are randomly filled.
+        torch.testing.assert_close(
+            ref._rowwise_scale_inv[:m, : k // 32],
+            out._rowwise_scale_inv[:m, : k // 32],
+            rtol=0,
+            atol=0,
+        )
+        torch.testing.assert_close(ref._columnwise_data, out._columnwise_data, rtol=0, atol=0)
+        # Padded areas are randomly filled.
+        torch.testing.assert_close(
+            ref._columnwise_scale_inv[: m // 32, :k],
+            out._columnwise_scale_inv[: m // 32, :k],
+            rtol=0,
+            atol=0,
+        )

transformer_engine/pytorch/csrc/common.cpp

Lines changed: 16 additions & 0 deletions
@@ -291,4 +291,20 @@ size_t roundup(const size_t value, const size_t multiple) {
   return ((value + multiple - 1) / multiple) * multiple;
 }
 
+at::Tensor make_torch_view(std::shared_ptr<at::Tensor>& buffer, const std::vector<size_t>& shape,
+                           size_t offset, at::ScalarType dtype) {
+  std::vector<int64_t> shape_int64(shape.begin(), shape.end());
+  // If the full buffer is empty because the local rank receives no tokens for any expert, the
+  // data_ptr is nullptr and we must return an empty tensor instead of calling from_blob. When
+  // some experts receive tokens and some do not, we still want to use from_blob as much as
+  // possible to avoid CPU overhead.
+  if (buffer->data_ptr<uint8_t>() == nullptr) {
+    return at::empty(shape_int64, at::device(at::kCUDA).dtype(dtype));
+  }
+  return at::from_blob(
+      buffer->data_ptr<uint8_t>() + offset, shape_int64,
+      [buffer](void*) {},  // deleter holds shared_ptr
+      at::device(at::kCUDA).dtype(dtype));
+}
+
 }  // namespace transformer_engine::pytorch

transformer_engine/pytorch/csrc/common.h

Lines changed: 8 additions & 0 deletions
@@ -420,6 +420,14 @@ std::vector<size_t> convertShape(const NVTEShape& shape);
 size_t roundup(const size_t value, const size_t multiple);
 
 NVTEShape convertTorchShape(const c10::IntArrayRef torch_shape);
+
+/*! @brief Helper function to construct tensor view
+ *
+ *  Note: Deleter holds a shared_ptr for the buffer, so the buffer
+ *  will survive until all views are deleted.
+ */
+at::Tensor make_torch_view(std::shared_ptr<at::Tensor>& buffer, const std::vector<size_t>& shape,
+                           size_t offset, at::ScalarType dtype);
 }  // namespace transformer_engine::pytorch
 
 namespace std {

transformer_engine/pytorch/csrc/extensions/cast.cpp

Lines changed: 0 additions & 38 deletions
@@ -199,25 +199,6 @@ std::tuple<std::vector<py::object>, std::vector<TensorWrapper>> bulk_allocate_fp
   constexpr size_t fp8_elem_size = 1;
   constexpr size_t scale_elem_size = 4;
 
-  // Helper function to construct tensor view
-  // Note: Deleter holds a shared_ptr for the buffer, so the buffer
-  // will survive until all views are deleted.
-  auto make_torch_view = [](std::shared_ptr<at::Tensor> &buffer, const std::vector<size_t> &shape,
-                            size_t offset, at::ScalarType dtype) -> at::Tensor {
-    std::vector<int64_t> shape_int64(shape.begin(), shape.end());
-    // in the case where full buffer is empty because local rank receives no tokens for all the experts
-    // then the data_ptr is nullptr, we need to return an empty tensor instead of calling from_blob
-    // but in the case where some experts receive tokens, some not, we want to leverage from_blob
-    // as much as possible to avoid CPU overhead
-    if (buffer->data_ptr<uint8_t>() == nullptr) {
-      return at::empty(shape_int64, at::device(at::kCUDA).dtype(dtype));
-    }
-    return at::from_blob(
-        buffer->data_ptr<uint8_t>() + offset, shape_int64,
-        [buffer](void *) {},  // deleter holds shared_ptr
-        at::device(at::kCUDA).dtype(dtype));
-  };
-
   // Allocate row-wise data
   std::vector<at::Tensor> rowwise_data_list, rowwise_scale_list;
   std::vector<std::vector<size_t>> rowwise_data_shapes, rowwise_scale_shapes;
@@ -353,25 +334,6 @@ std::tuple<std::vector<py::object>, std::vector<TensorWrapper>> bulk_allocate_mx
   constexpr size_t fp8_elem_size = 1;
   constexpr size_t scale_elem_size = 1;
 
-  // Helper function to construct tensor view
-  // Note: Deleter holds a shared_ptr for the buffer, so the buffer
-  // will survive until all views are deleted.
-  auto make_torch_view = [](std::shared_ptr<at::Tensor> &buffer, const std::vector<size_t> &shape,
-                            size_t offset, at::ScalarType dtype) -> at::Tensor {
-    std::vector<int64_t> shape_int64(shape.begin(), shape.end());
-    // in the case where full buffer is empty because local rank receives no tokens for all the experts
-    // then the data_ptr is nullptr, we need to return an empty tensor instead of calling from_blob
-    // but in the case where some experts receive tokens, some not, we want to leverage from_blob
-    // as much as possible to avoid CPU overhead
-    if (buffer->data_ptr<uint8_t>() == nullptr) {
-      return at::empty(shape_int64, at::device(at::kCUDA).dtype(dtype));
-    }
-    return at::from_blob(
-        buffer->data_ptr<uint8_t>() + offset, shape_int64,
-        [buffer](void *) {},  // deleter holds shared_ptr
-        at::device(at::kCUDA).dtype(dtype));
-  };
-
   // Allocate row-wise data
   std::vector<at::Tensor> rowwise_data_list, rowwise_scale_list;
   std::vector<std::vector<size_t>> rowwise_data_shapes, rowwise_scale_shapes;

transformer_engine/pytorch/csrc/extensions/pybind.cpp

Lines changed: 3 additions & 0 deletions
@@ -18,6 +18,7 @@
 
 #include "../common.h"
 #include "../extensions.h"
+#include "../util.h"
 #include "common.h"
 
 namespace transformer_engine::pytorch {
@@ -240,6 +241,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "Fused Multi-tensor padding", py::call_guard<py::gil_scoped_release>());
   m.def("fused_multi_row_unpadding", &transformer_engine::pytorch::fused_multi_row_unpadding,
         "Fused Multi-tensor unpadding", py::call_guard<py::gil_scoped_release>());
+  m.def("split_quantized_tensor", &transformer_engine::pytorch::split_quantized_tensor,
+        "Split quantized tensor");
 
   // attention kernels
   m.def("fa_prepare_fwd", &transformer_engine::pytorch::fa_prepare_fwd,
