@@ -123,28 +123,42 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
123123 " into D tensor. Beta has nothing to be applied to." );
124124 }
125125
126+ DType gemm_output_dtype = out_dtype ? *out_dtype : A_tensor.dtype ();
126127 // Output tensor
127128 TensorWrapper D_tensor;
128129 if (D.is_none ()) {
129- DType output_dtype = out_dtype ? *out_dtype : A_tensor.dtype ();
130- std::tie (D_tensor, D) = createOutputTensor (D_shape, output_dtype, quantizer);
130+ std::tie (D_tensor, D) = createOutputTensor (D_shape, gemm_output_dtype, quantizer);
131131 } else {
132132 D_tensor = makeTransformerEngineTensor (D, quantizer);
133133 NVTE_CHECK (detail::checkGemmShape (D_shape, D_tensor.shape ()),
134134 " GEMM output has invalid dims (expected " , std::to_string (D_shape), " , got " ,
135135 std::to_string (D_tensor.shape ()), " )" );
136- if (out_dtype) {
137- NVTE_CHECK (*out_dtype == D_tensor.dtype (), "GEMM output has invalid dtype (expected ",
138- static_cast<int>(*out_dtype), ", found ", static_cast<int>(D_tensor.dtype ()), ")");
139- }
136+ // TODO: Check the quantizer's output data type against out_dtype instead.
137+ // if (out_dtype) {
138+ // NVTE_CHECK(*out_dtype == D_tensor.dtype(), "GEMM output has invalid dtype (expected ",
139+ // static_cast<int>(*out_dtype), ", found ", static_cast<int>(D_tensor.dtype()), ")");
140+ // }
140141 }
141142
143+ // Maintain an unquantized tensor for the case where an op consuming high-precision data must produce an FP8-quantized output.
144+ TensorWrapper unquantized_D_tensor;
145+ py::object unquantized_out;
146+ std::unique_ptr<Quantizer> my_quantizer = convert_quantizer (quantizer);
147+ bool quantization_needed = !quantizer.is_none (); // TODO: Another use case:
148+ // if the output is already FP8, no separate quantization is needed, since cuBLAS takes care of it.
149+ if (quantization_needed) {
150+ NoneQuantizer q{none};
151+ std::tie (unquantized_D_tensor, unquantized_out) =
152+     q.create_tensor (D_shape, gemm_output_dtype);
153+ }
154+
155+ TensorWrapper &out_tensor = quantization_needed ? unquantized_D_tensor : D_tensor;
142156 // Bias tensor
143157 TensorWrapper bias_tensor;
144158 MaybeTensor bias_grad = std::nullopt ;
145159 if (bias.has_value ()) {
146160 if (grad) {
147- auto opts = torch::TensorOptions ().dtype (GetATenDType (D_tensor.dtype ())).device (torch::kCUDA);
161+ auto opts = torch::TensorOptions ().dtype (GetATenDType (out_tensor.dtype ())).device (torch::kCUDA);
148162 bias_grad = at::empty ({static_cast <int64_t >(B_shape.data [B_shape.ndim - 1 ])}, opts);
149163 bias_tensor = makeTransformerEngineTensor (*bias_grad);
150164 } else {
@@ -157,7 +171,7 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
157171
158172 // Activation input tensor
159173 MaybeTensor pre_gelu_out = std::nullopt ;
160- DType gelu_type = low_precision ? bias_type : D_tensor.dtype ();
174+ DType gelu_type = low_precision ? bias_type : out_tensor.dtype ();
161175 if (gelu) {
162176 if (!grad) {
163177 auto dtype = GetATenDType (gelu_type);
@@ -210,22 +224,22 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
210224 // Direct GEMM call to the correct overlap
211225 if (bulk_overlap) {
212226 NVTE_SCOPED_GIL_RELEASE ({
213- comm_overlap->bulk_overlap (A_tensor, transa, B_tensor, transb, D_tensor, bias_tensor,
227+ comm_overlap->bulk_overlap (A_tensor, transa, B_tensor, transb, out_tensor, bias_tensor,
214228 te_pre_gelu_out, te_workspace, grad, accumulate,
215229 use_split_accumulator, comm_type.value (), extra_output_tensor,
216230 main_stream);
217231 });
218232 } else if (comm_type.value () == CommOverlapType::AG) {
219233 if (comm_overlap->is_atomic_gemm ()) {
220234 NVTE_SCOPED_GIL_RELEASE ({
221- comm_overlap->atomic_gemm_overlap_ag (A_tensor, transa, B_tensor, transb, D_tensor,
235+ comm_overlap->atomic_gemm_overlap_ag (A_tensor, transa, B_tensor, transb, out_tensor,
222236 bias_tensor, te_pre_gelu_out, te_workspace, grad,
223237 accumulate, use_split_accumulator,
224238 extra_output_tensor, main_stream);
225239 });
226240 } else {
227241 NVTE_SCOPED_GIL_RELEASE ({
228- comm_overlap->split_overlap_ag (A_tensor, transa, B_tensor, transb, D_tensor,
242+ comm_overlap->split_overlap_ag (A_tensor, transa, B_tensor, transb, out_tensor,
229243 bias_tensor, te_pre_gelu_out, te_workspace, grad,
230244 accumulate, use_split_accumulator, extra_output_tensor,
231245 main_stream);
@@ -234,14 +248,14 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
234248 } else {
235249 if (comm_overlap->is_atomic_gemm ()) {
236250 NVTE_SCOPED_GIL_RELEASE ({
237- comm_overlap->atomic_gemm_overlap_rs (A_tensor, transa, B_tensor, transb, D_tensor,
251+ comm_overlap->atomic_gemm_overlap_rs (A_tensor, transa, B_tensor, transb, out_tensor,
238252 bias_tensor, te_pre_gelu_out, te_workspace, grad,
239253 accumulate, use_split_accumulator,
240254 extra_output_tensor, main_stream);
241255 });
242256 } else {
243257 NVTE_SCOPED_GIL_RELEASE ({
244- comm_overlap->split_overlap_rs (A_tensor, transa, B_tensor, transb, D_tensor,
258+ comm_overlap->split_overlap_rs (A_tensor, transa, B_tensor, transb, out_tensor,
245259 bias_tensor, te_pre_gelu_out, te_workspace, grad,
246260 accumulate, use_split_accumulator, extra_output_tensor,
247261 main_stream);
@@ -251,23 +265,24 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
251265 } else {
252266 // Launch GEMM
253267 NVTE_SCOPED_GIL_RELEASE ({
254- nvte_cublas_gemm_scaled (A_tensor.data (), B_tensor.data (), D_tensor.data (),
268+ nvte_cublas_gemm_scaled (A_tensor.data (), B_tensor.data (), out_tensor.data (),
255269 bias_tensor.data (), te_pre_gelu_out.data (), transa, transb, grad,
256270 te_workspace.data (), alpha, *beta, use_split_accumulator,
257271 num_math_sms, main_stream);
258272 });
259273 }
260274 } else {
261- if (D_tensor.numel () != 0 && !accumulate) {
262- D_tensor.zero_ (main_stream);
275+ if (out_tensor.numel () != 0 && !accumulate) {
276+ out_tensor.zero_ (main_stream);
263277 }
264278 if (bias.has_value ()) {
265279 if (bias->numel () != 0 && grad) {
266280 bias_grad->zero_ ();
267281 }
268282 }
269283 }
270-
284+ if (quantization_needed)
285+ my_quantizer->quantize (unquantized_D_tensor, D_tensor);
271286 // Pack outputs
272287 std::vector<py::object> out;
273288 out.emplace_back (std::move (D));
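
For context, here is a standalone sketch of the flow this change introduces. It is not the Transformer Engine API: SimpleQuantizer and gemm_then_quantize are hypothetical stand-ins for the quantizer and the cuBLAS GEMM call above. The point is the ordering: the GEMM accumulates into a high-precision buffer (the role of unquantized_D_tensor), and only afterwards is that buffer quantized into the low-precision output, as in my_quantizer->quantize (unquantized_D_tensor, D_tensor).

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical per-tensor-scale quantizer: float -> int8.
struct SimpleQuantizer {
  float scale;
  void quantize(const std::vector<float> &src, std::vector<int8_t> &dst) const {
    dst.resize(src.size());
    for (std::size_t i = 0; i < src.size(); ++i) {
      const float q = std::round(src[i] / scale);
      dst[i] = static_cast<int8_t>(std::clamp(q, -128.0f, 127.0f));
    }
  }
};

// The GEMM accumulates in float (the "unquantized_D_tensor" role); quantization
// into the final low-precision output happens only after the GEMM completes.
void gemm_then_quantize(const std::vector<float> &A, const std::vector<float> &B,
                        int M, int N, int K, const SimpleQuantizer &q,
                        std::vector<int8_t> &D) {
  std::vector<float> unquantized_D(static_cast<std::size_t>(M) * N, 0.0f);
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
      for (int k = 0; k < K; ++k)
        unquantized_D[m * N + n] += A[m * K + k] * B[k * N + n];
  q.quantize(unquantized_D, D);  // separate quantization step, as in the diff
}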