[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 9c3a6756607f · 2025-08-12T08:41:40.000Z
for more information, see https://pre-commit.ci
diff --git a/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp b/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp
@@ -861,7 +861,7 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
     // Chunk dims
     std::vector<size_t> input_b_chunk_shape =
         (transb ? std::vector<size_t>{2 * k_chunk, n} : std::vector<size_t>{2 * n_chunk, k});
-        // (transb ? std::vector<size_t>{k, 2 * n_chunk} : std::vector<size_t>{2 * n_chunk, k});
+    // (transb ? std::vector<size_t>{k, 2 * n_chunk} : std::vector<size_t>{2 * n_chunk, k});
     std::vector<size_t> output_chunk_shape = {(transb ? 1 : 2) * n_chunk, m};
     input_a_chunk_size *= transb ? 2 : 1;
     input_b_chunk_size *= 2;
@@ -894,22 +894,22 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
 
       // GEMM
       TensorWrapper input_a_chunk, input_b_chunk;
-      if (ag_on_B) { // AllGather is performed on input B tensor (default case).
-                     // Use case: AG->{FC2, PROJ}_Wgrad, AG->{FC1, QKV}_FPROP.
-        input_a_chunk = get_tensor_chunk(A, transb ? input_a_chunk_size * send_chunk_id / 2 : 0,
+      if (ag_on_B) {  // AllGather is performed on input B tensor (default case).
+                      // Use case: AG->{FC2, PROJ}_Wgrad, AG->{FC1, QKV}_FPROP.
+        input_a_chunk = get_tensor_chunk(
+            A, transb ? input_a_chunk_size * send_chunk_id / 2 : 0,
             transb ? std::vector<size_t>{k_chunk * 2, m} : shape_to_vector(A.shape()));
         input_b_chunk =
             get_buffer_chunk_like(B, input_b_chunk_size * send_chunk_id / 2, input_b_chunk_shape);
-      } else { // AllGather is performed on input A tensor. Use case: AG->{FC1, QKV}_Wgrad.
+      } else {  // AllGather is performed on input A tensor. Use case: AG->{FC1, QKV}_Wgrad.
         assert(transa == false && transb == true);
-        input_a_chunk = get_buffer_chunk_like(
-            A, input_a_chunk_size * send_chunk_id / 2, std::vector<size_t>{k_chunk * 2, m}
-        );
+        input_a_chunk = get_buffer_chunk_like(A, input_a_chunk_size * send_chunk_id / 2,
+                                              std::vector<size_t>{k_chunk * 2, m});
         input_b_chunk =
             get_tensor_chunk(B, input_b_chunk_size * send_chunk_id / 2, input_b_chunk_shape);
       }
-      auto output_chunk =
-          get_tensor_chunk(D, transb ? 0 : output_chunk_size * send_chunk_id / 2, output_chunk_shape);
+      auto output_chunk = get_tensor_chunk(D, transb ? 0 : output_chunk_size * send_chunk_id / 2,
+                                           output_chunk_shape);
       auto aux_chunk = (do_gelu)
                            ? get_tensor_chunk(pre_gelu_out, output_chunk_size * send_chunk_id / 2,
                                               {2 * n_chunk, k})
@@ -964,15 +964,17 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
 
       // GEMM
       TensorWrapper input_a_chunk, input_b_chunk;
-      if (ag_on_B) { // AllGather is performed on input B tensor (default case).
-                     // Use case: AG->{FC2, PROJ}_Wgrad, AG->{FC1, QKV}_FPROP.
-        input_a_chunk = get_tensor_chunk(A, transb ? input_a_chunk_size * send_chunk_id : 0,
-            transb ? std::vector<size_t>{k_chunk, m} : shape_to_vector(A.shape()));
+      if (ag_on_B) {  // AllGather is performed on input B tensor (default case).
+                      // Use case: AG->{FC2, PROJ}_Wgrad, AG->{FC1, QKV}_FPROP.
+        input_a_chunk =
+            get_tensor_chunk(A, transb ? input_a_chunk_size * send_chunk_id : 0,
+                             transb ? std::vector<size_t>{k_chunk, m} : shape_to_vector(A.shape()));
         input_b_chunk =
             get_buffer_chunk_like(B, input_b_chunk_size * send_chunk_id, input_b_chunk_shape);
-      } else { // AllGather is performed on input A tensor. Use case: AG->{FC1, QKV}_Wgrad.
+      } else {  // AllGather is performed on input A tensor. Use case: AG->{FC1, QKV}_Wgrad.
         assert(transa == false && transb == true);
-        input_a_chunk = get_buffer_chunk_like(A, input_a_chunk_size * send_chunk_id,
+        input_a_chunk = get_buffer_chunk_like(
+            A, input_a_chunk_size * send_chunk_id,
             transb ? std::vector<size_t>{k_chunk, m} : std::vector<size_t>{m, k});
         input_b_chunk =
             get_tensor_chunk(B, input_b_chunk_size * send_chunk_id, input_b_chunk_shape);
diff --git a/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h b/transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h
@@ -130,9 +130,8 @@ class CommOverlapCore {
   virtual void split_overlap_ag(const TensorWrapper &A, bool transa, const TensorWrapper &B,
                                 bool transb, TensorWrapper &D, TensorWrapper &bias,
                                 TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad,
-                                bool accumulate, bool use_split_accumulator,
-                                bool ag_on_B, TensorWrapper &B_copy,
-                                cudaStream_t stream_main) {
+                                bool accumulate, bool use_split_accumulator, bool ag_on_B,
+                                TensorWrapper &B_copy, cudaStream_t stream_main) {
     NVTE_ERROR("Operation is not implemented.");
   }
 
diff --git a/transformer_engine/pytorch/cpp_extensions/gemm.py b/transformer_engine/pytorch/cpp_extensions/gemm.py
@@ -108,7 +108,7 @@ def general_gemm(
         workspace.shape[0],
         accumulate,
         use_split_accumulator,
-        ag_on_B, # ag_on_B
+        ag_on_B,  # ag_on_B
     )
     kwargs = {
         "comm_overlap": ub,
diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h
@@ -120,7 +120,8 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
                              py::handle quantizer, std::optional<DType> out_dtype, MaybeTensor bias,
                              DType bias_type, bool gelu, MaybeTensor gelu_in, bool grad,
                              at::Tensor workspace, size_t workspaceSize, bool accumulate,
-                             bool use_split_accumulator, bool ag_on_B, CommOverlapCore *comm_overlap = nullptr,
+                             bool use_split_accumulator, bool ag_on_B,
+                             CommOverlapCore *comm_overlap = nullptr,
                              std::optional<CommOverlapType> comm_type = std::nullopt,
                              MaybeTensor extra_output = std::nullopt, bool bulk_overlap = false);
 
diff --git a/transformer_engine/pytorch/csrc/extensions/gemm.cpp b/transformer_engine/pytorch/csrc/extensions/gemm.cpp
@@ -90,7 +90,8 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
                              py::handle quantizer, std::optional<DType> out_dtype, MaybeTensor bias,
                              DType bias_type, bool gelu, MaybeTensor gelu_in, bool grad,
                              at::Tensor workspace, size_t workspaceSize, bool accumulate,
-                             bool use_split_accumulator, bool ag_on_B, CommOverlapCore* comm_overlap,
+                             bool use_split_accumulator, bool ag_on_B,
+                             CommOverlapCore* comm_overlap,
                              std::optional<CommOverlapType> comm_type, MaybeTensor extra_output,
                              bool bulk_overlap) {
   // Input tensors
diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp
@@ -110,8 +110,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("quantizer"), py::arg("output_dtype"), py::arg("bias"), py::arg("bias_type"),
         py::arg("gelu"), py::arg("gelu_in"), py::arg("grad"), py::arg("workspace"),
         py::arg("workspace_size"), py::arg("accumulate"), py::arg("use_split_accumulator"),
-        py::arg("ag_on_B"),
-        py::arg("comm_overlap") = nullptr, py::arg("comm_type") = std::nullopt,
+        py::arg("ag_on_B"), py::arg("comm_overlap") = nullptr, py::arg("comm_type") = std::nullopt,
         py::arg("extra_output") = std::nullopt, py::arg("bulk_overlap") = false);
   m.def("gelu", transformer_engine::pytorch::gelu, "GeLU activation", py::arg("input"),
         py::arg("quantizer"));

Original file line number	Diff line number	Diff line change
`@@ -108,7 +108,7 @@ def general_gemm(`
`108`	`108`	`workspace.shape[0],`
`109`	`109`	`accumulate,`
`110`	`110`	`use_split_accumulator,`
`111`		`- ag_on_B, # ag_on_B`
	`111`	`+ ag_on_B,  # ag_on_B`
`112`	`112`	`)`
`113`	`113`	`kwargs = {`
`114`	`114`	`"comm_overlap": ub,`