pytorch · MatrixAssembler · Jan 27, 2025 · Jan 28, 2025 · Feb 25, 2025
diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_grouped.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_grouped.cu
@@ -145,17 +145,17 @@ __global__ void set_kernel_args_kernel(
             GroupedGemmArgs::ProblemShape::UnderlyingProblemShape*>(
             problem_shape_buf);
     // Pass dummy configs to get Stride structure
-    GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+    GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
         StrideInputA* stride_input_A_ptr = reinterpret_cast<
-            GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+            GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
                 StrideInputA*>(stride_buf);
-    GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+    GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
         StrideInputB* stride_input_B_ptr = reinterpret_cast<
-            GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+            GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
                 StrideInputB*>(stride_buf + stride_size);
-    GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+    GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
         StrideOutput* stride_output_ptr = reinterpret_cast<
-            GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+            GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
                 StrideOutput*>(stride_buf + (stride_size * 2));
 
     output_args_ptr[group_index] =
@@ -169,15 +169,15 @@ __global__ void set_kernel_args_kernel(
         GroupedGemmArgs::ProblemShape::UnderlyingProblemShape(M, N, K);
     stride_input_A_ptr[group_index] = cutlass::make_cute_packed_stride(
         typename GroupedGemmArgs::
-            GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::StrideInputA{},
+            GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::StrideInputA{},
         {M, K, 1});
     stride_input_B_ptr[group_index] = cutlass::make_cute_packed_stride(
         typename GroupedGemmArgs::
-            GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::StrideInputB{},
+            GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::StrideInputB{},
         {N, K, 1});
     stride_output_ptr[group_index] = cutlass::make_cute_packed_stride(
         typename GroupedGemmArgs::
-            GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::StrideOutput{},
+            GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::StrideOutput{},
         {M, N, 1});
   }
 }
@@ -219,17 +219,17 @@ __global__ void set_dynamic_kernel_args_kernel(
             GroupedGemmArgs::ProblemShape::UnderlyingProblemShape*>(
             problem_shape_buf);
     // Pass dummy configs to get Stride structure
-    GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+    GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
         StrideInputA* stride_input_A_ptr = reinterpret_cast<
-            GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+            GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
                 StrideInputA*>(stride_buf);
-    GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+    GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
         StrideInputB* stride_input_B_ptr = reinterpret_cast<
-            GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+            GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
                 StrideInputB*>(stride_buf + stride_size);
-    GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+    GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
         StrideOutput* stride_output_ptr = reinterpret_cast<
-            GroupedGemmArgs::GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::
+            GroupedGemmArgs::GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::
                 StrideOutput*>(stride_buf + (stride_size * 2));
 
     output_args_ptr[group_index] =
@@ -244,15 +244,15 @@ __global__ void set_dynamic_kernel_args_kernel(
             zero_start_index_M[group_index], N, K);
     stride_input_A_ptr[group_index] = cutlass::make_cute_packed_stride(
         typename GroupedGemmArgs::
-            GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::StrideInputA{},
+            GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::StrideInputA{},
         {zero_start_index_M[group_index], K, 1});
     stride_input_B_ptr[group_index] = cutlass::make_cute_packed_stride(
         typename GroupedGemmArgs::
-            GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::StrideInputB{},
+            GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::StrideInputB{},
         {N, K, 1});
     stride_output_ptr[group_index] = cutlass::make_cute_packed_stride(
         typename GroupedGemmArgs::
-            GroupedGemmConfigs<128, 128, 128, 2, 1, 1, true>::StrideOutput{},
+            GroupedGemmConfigs<128, 256, 128, 2, 1, 1, false>::StrideOutput{},
         {zero_start_index_M[group_index], N, 1});
   }
 }
@@ -487,7 +487,7 @@ std::vector<at::Tensor> dispatch_fp8_grouped_kernel(
     return f8f8bf16_grouped_impl<64, 128, 128, 2, 1, 1, true, FastAccum>(
         xq_group, wq_group, scale, zero_start_index_M);
   } else if (kernel == KernelMode::Large) {
-    return f8f8bf16_grouped_impl<128, 128, 128, 2, 1, 1, true, FastAccum>(
+    return f8f8bf16_grouped_impl<128, 256, 128, 2, 1, 1, false, FastAccum>(
         xq_group, wq_group, scale, zero_start_index_M);
   } else {
     return f8f8bf16_grouped_impl<128, 128, 128, 1, 2, 1, true, FastAccum>(

diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_tensorwise.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_tensorwise.cu
@@ -99,17 +99,22 @@ at::Tensor f8f8bf16_tensorwise_impl(
       KernelScheduleAuto; // Kernel to launch based on the default setting in
                           // the Collective Builder
 
-  using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
+  using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperative;
   using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
   using FastDefaultSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
   using FastPongSchedule =
       cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
   using SlowAccum = cute::conditional_t<PONG, PongSchedule, DefaultSchedule>;
   using FastAccum =
       cute::conditional_t<PONG, FastPongSchedule, FastDefaultSchedule>;
+  using CooperativeEpilogueSchedule =
+      cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using PongEpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized;
   using MainLoopSchedule =
       cute::conditional_t<FAST_ACCUM, FastAccum, SlowAccum>;
+  using EpilogueSchedule = cute::
+      conditional_t<PONG, PongEpilogueSchedule, CooperativeEpilogueSchedule>;
 
   using Scale_ =
       cutlass::epilogue::fusion::Sm90ScalarBroadcast<ElementComputeEpilogue>;
@@ -140,7 +145,7 @@ at::Tensor f8f8bf16_tensorwise_impl(
           ElementOutput,
           LayoutOutput,
           AlignmentOutput,
-          cutlass::epilogue::TmaWarpSpecialized,
+          EpilogueSchedule,
           EpilogueEVT>::CollectiveOp;
 
   using CollectiveMainloop =
@@ -239,10 +244,10 @@ at::Tensor f8f8bf16_tensorwise(
     return f8f8bf16_tensorwise_impl<64, 128, 128, 2, 1, 1, true, true>(
         XQ, WQ, scale);
   } else if (kernel == KernelMode::Large) {
-    return f8f8bf16_tensorwise_impl<128, 128, 128, 2, 1, 1, true, true>(
+    return f8f8bf16_tensorwise_impl<128, 256, 128, 2, 1, 1, false, true>(
         XQ, WQ, scale);
   } else {
-    return f8f8bf16_tensorwise_impl<128, 128, 128, 1, 2, 1, false, true>(
+    return f8f8bf16_tensorwise_impl<128, 128, 128, 1, 2, 1, true, true>(
         XQ, WQ, scale);
   }
 }