
Commit 177ec96

Code cleanup
Signed-off-by: Aditya Chatterjee <[email protected]>
1 parent a43c1bd commit 177ec96


5 files changed: 30 additions, 322 deletions


applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_mma.hpp

Lines changed: 10 additions & 28 deletions
@@ -28,7 +28,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-
 #pragma once
 
 #include "cutlass/cutlass.h"
@@ -280,8 +279,7 @@ struct FlashChunkPrefillMma<
   template <class FragQccum, class TensorQ, class TensorK, class FragSrc>
   CUTLASS_DEVICE void mmaQK(FragQccum &accum, TensorQ gQ, TensorK gK,
                             FragSrc const &frag_src, int const &k_tile_count,
-                            Params const &params, bool is_KV_cache,
-                            int const& q_head_coord, int const& kv_head_coord) {
+                            Params const &params, bool is_KV_cache) {
 
     auto &gmem_tiled_copy_k =
         is_KV_cache ? params.gmem_tiled_copy_k_cache : params.gmem_tiled_copy_k;
@@ -317,8 +315,9 @@ struct FlashChunkPrefillMma<
     Tensor tQgQ = thr_copy_Q.retile_S(tCgQ);
     Tensor tKgK = thr_copy_K.retile_S(tCgK);
 
-    float q_scale = params.ptr_q_scale[0]; //q_head_coord];
-    float k_scale = params.ptr_k_scale[0]; //kv_head_coord];
+    // Currently, supporting per-tensor scaling
+    float q_scale = params.ptr_q_scale[0];
+    float k_scale = params.ptr_k_scale[0];
 
     //
     // Mainloop
@@ -327,22 +326,15 @@ struct FlashChunkPrefillMma<
       copy(params.gmem_tiled_copy_q, tQgQ(_, _, _, k_tile), tQrQ);
       copy(gmem_tiled_copy_k, tKgK(_, _, _, k_tile), tKrK);
 
-      // FP8 path: Convert FP8 fragments to FP16 IN-PLACE to avoid register spilling.
+      // FP8 path: Convert FP8 fragments to BF16
      if constexpr (is_fp8_v<ElementQ> || is_fp8_v<ElementK>) {
-        // Recast the memory region of the FP8 tensors as FP16 tensors.
-        // This does NOT allocate new registers. It reuses the existing ones.
-        //auto tCrQ_fp16 = cute::recast<half_t>(tCrQ);
-        //auto tCrK_fp16 = cute::recast<half_t>(tCrK);
-
        auto tCrQ_fp16 = make_fragment_like<bfloat16_t>(tCrQ);
        auto tCrK_fp16 = make_fragment_like<bfloat16_t>(tCrK);
 
-        // Perform the conversion, writing the FP16 results directly into the
-        // reused register space.
        if constexpr (is_fp8_v<ElementQ>) {
          convert_and_descale<ElementQ>(tCrQ, tCrQ_fp16, q_scale);
        } else {
-          // If Q is already FP16, just copy it to the correctly-named variable.
+          // If Q is already FP16, copy it.
          copy(tCrQ, tCrQ_fp16);
        }
 
@@ -352,11 +344,10 @@ struct FlashChunkPrefillMma<
          copy(tCrK, tCrK_fp16);
        }
 
-        // Now, gemm is called on the FP16 tensors which occupy the same
-        // register space as the original FP8 tensors did. Register pressure is not increased.
+        // Now, gemm is called on the BF16 tensors
        cute::gemm(tiled_mma, accum, tCrQ_fp16, tCrK_fp16, frag_src);
      } else {
-        // FP16 path (already fast)
+        // BF16 path
        cute::gemm(tiled_mma, accum, tCrQ, tCrK, frag_src);
      }
 
@@ -404,12 +395,12 @@ struct FlashChunkPrefillMma<
             class FragSrc>
   CUTLASS_DEVICE void mmaPV(FragQccum &accum, FragS const &tSr, TensorV gV,
                             FragSrc const &frag_src, Params const &params,
-                            bool is_KV_cache, int const& kv_head_coord) {
+                            bool is_KV_cache) {
 
     auto &gmem_tiled_copy_v =
         is_KV_cache ? params.gmem_tiled_copy_v_cache : params.gmem_tiled_copy_v;
 
-    float v_scale = params.ptr_v_scale[0]; //kv_head_coord];
+    float v_scale = params.ptr_v_scale[0];
 
     int thread_idx = static_cast<int>(ThreadIdxX());
     // Instantiate the MMA object
@@ -461,20 +452,11 @@ struct FlashChunkPrefillMma<
       copy(gmem_tiled_copy_v, tVgV(_, _, _, i), tVrV);
 
      if constexpr (is_fp8_v<ElementV>) {
-        // Correctly reuse the registers of tCrV for the new FP16 tensor.
-        // This avoids doubling the register pressure.
-        //auto tCrV_fp16 = cute::recast<half_t>(tCrV);
        auto tCrV_fp16 = make_fragment_like<bfloat16_t>(tCrV);
-
-        // Perform the conversion in-place, overwriting the old FP8 data
-        // with the new FP16 data in the same register space.
        convert_and_descale<ElementV>(tCrV, tCrV_fp16, v_scale);
 
-        // The GEMM now operates on an FP16 tensor that is in registers,
-        // preventing a catastrophic performance drop from register spilling.
        cute::gemm(tiled_mma, accum(_,_,_,i), tPr, tCrV_fp16, frag_src(_,_,_,i));
      } else {
-        // Native FP16 path (already fast)
        cute::gemm(tiled_mma, accum(_,_,_,i), tPr, tCrV, frag_src(_,_,_,i));
      }
    }
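
Note on the change above: with per-head scaling dropped, mmaQK and mmaPV read a single per-tensor factor (ptr_q_scale[0], ptr_k_scale[0], ptr_v_scale[0]) and route FP8 operands through make_fragment_like<bfloat16_t> plus convert_and_descale before cute::gemm. As a rough, hypothetical sketch of what that descale step amounts to on plain buffers (not the kernel's CuTe-fragment implementation; it assumes cutlass/numeric_types.h provides the FP8 and BF16 types, as in current CUTLASS):

#include <cstddef>
#include "cutlass/numeric_types.h"   // cutlass::bfloat16_t and the FP8 types

// Hypothetical helper, not convert_and_descale itself: dequantize an FP8 buffer
// into BF16 by multiplying each element with one per-tensor scale factor.
template <class Fp8T>
void descale_fp8_to_bf16(Fp8T const* src, cutlass::bfloat16_t* dst,
                         std::size_t n, float scale) {
  for (std::size_t i = 0; i < n; ++i) {
    float x = static_cast<float>(src[i]) * scale;  // FP8 -> FP32, apply per-tensor scale
    dst[i] = cutlass::bfloat16_t(x);               // FP32 -> BF16 operand for the MMA
  }
}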

applications/flash_attention_v2/kernel/xe_chunk_prefill.hpp

Lines changed: 6 additions & 6 deletions
@@ -454,8 +454,8 @@ class FMHAPrefillChunk {
     // Perform the collective scoped MMA
     CollectiveMainloop collective_mma;
 
-    auto q_group_size = num_heads_q / num_heads_kv;
-    auto kv_head_coord = q_head_coord / q_group_size;
+    // auto q_group_size = num_heads_q / num_heads_kv;
+    // auto kv_head_coord = q_head_coord / q_group_size;
 
     // when causal mask is true. It is not possible to set the scope
     // of the barrier to workgroup level as the number n block is
@@ -483,7 +483,7 @@ class FMHAPrefillChunk {
 
       collective_mma.mmaQK(tSr, gQ, gK_, tSr,
                            ceil_div(head_size_qk, QK_BLK_K), mainloop_params,
-                           is_KV_cache, q_head_coord, kv_head_coord);
+                           is_KV_cache);
 
      if constexpr (LocalMask) {
        // Sliding windows
@@ -577,7 +577,7 @@ class FMHAPrefillChunk {
 
      // 5) Perform GEMM O = S*V
      collective_mma.template mmaPV<VSlicer>(out_reg, tSr, gV_, out_reg,
-                                             mainloop_params, is_KV_cache, kv_head_coord);
+                                             mainloop_params, is_KV_cache);
 
      // ... prefetch next tile ...
      // Prefetch the next Q tile
@@ -624,7 +624,7 @@ class FMHAPrefillChunk {
     // 3) Perform GEMM S = Q*K
     collective_mma.mmaQK(tSr, gQ, gK(_, _, kv_splits_new - 1, _), tSr,
                          ceil_div(head_size_qk, QK_BLK_K), mainloop_params,
-                         false, q_head_coord, kv_head_coord);
+                         false);
 
     // we only need one block ahead, there is enough gap to prefetch it
     // while doing softmax. because the gap between the two MMA is big,
@@ -655,7 +655,7 @@ class FMHAPrefillChunk {
 
     collective_mma.template mmaPV<VSlicer>(out_reg, tSr,
                                            gV(_, _, kv_splits_new - 1),
-                                           out_reg, mainloop_params, false, kv_head_coord);
+                                           out_reg, mainloop_params, false);
     }
 
 
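
Note: the q_group_size / kv_head_coord lines are only commented out, not deleted; they implement the usual grouped-query-attention mapping in which every group of num_heads_q / num_heads_kv query heads shares one KV head, and they would be needed again if per-head scale indexing returns. A minimal sketch of that mapping, reusing the names from the commented-out lines (it assumes num_heads_q is an integer multiple of num_heads_kv):

// Grouped-query head mapping, as in the commented-out lines above.
int kv_head_for_q_head(int q_head_coord, int num_heads_q, int num_heads_kv) {
  int q_group_size = num_heads_q / num_heads_kv;  // query heads per KV head
  return q_head_coord / q_group_size;             // KV head shared by this query head
}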

examples/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8.cpp

Lines changed: 11 additions & 21 deletions
@@ -29,38 +29,28 @@
  *
  **************************************************************************************************/
 /*! \file
-  \brief Flash Attention V2 Prefill for Intel BMG
+  \brief fp8 Chunk Prefill for Intel BMG
 
-  This example constructs and executes a Flash Attention Prefill with KV cache on Intel BMG. The
+  This example constructs and executes a FP8 Flash Attention Chunk Prefill on Intel BMG. The
   definition of the GEMM, options etc for this example are defined in the associated
-  bmg_flash_attn_cachedKV_runner.hpp header file.
+  bmg_flash_chunk_prefill_runner.hpp header file.
 
   See https://arxiv.org/pdf/2307.08691 for details of Flash Attention V2 algorithm
 
-  To run this example:
-  $ ./examples/sycl/06_bmg_flash_attention_cachedKV/06_bmg_prefill_attention_cachedKV --seq_len_qo=512
-  --seq_len_kv=512 --seq_len_kv_cache=512 --head_size_vo=128 --head_size_qk=128
-
-  Causal masking of the first matrix multiplication is supported (`--is_causal`)
-
   To build & run this example (from your build dir):
 
-  $ ninja 06_bmg_prefill_attention_cachedKV
-  $ ./examples/sycl/06_bmg_flash_attention_cachedKV/06_bmg_prefill_attention_cachedKV
+  $ ninja 06_bmg_chunk_prefill_fp8_hdim128
+  $ ./examples/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8_hdim128
 
   Call with `--help` for information about available options
 */
 
 #include "bmg_flash_chunk_prefill_runner.hpp"
 
 int main(int argc, const char **argv) {
-  //
   // Parse options
-  //
 
   Options options;
-  // Override the default data type for this test
-  // options.dtype = "fp8";
   options.parse(argc, argv);
 
   if (options.help) {
@@ -118,12 +108,12 @@ int main(int argc, const char **argv) {
   // =================================================================================================
   // FP8 Type Definitions
   // =================================================================================================
-  using ElementInputQ = cutlass::float_e5m2_t;      // <- data type of elements in input matrix A
-  using ElementInputKV = cutlass::float_e5m2_t;     // <- data type of elements in input matrix B
-  using MMAOperation = XE_8x16x16_F32F16F16F32_TT;  //XE_8x16x16_F32BF16BF16F32_TT;
-  using GmemTiledCopyQ = XE_2D_U8x8x32_LD_N;        // XE_2D_U8x8x32_LD_N;
-  using GmemTiledCopyK = XE_2D_U8x16x16_LD_T;       // _T designates a transposed block load operation
-  using GmemTiledCopyV = XE_2D_U8x32x32_LD_V;
+  using ElementInputQ = cutlass::float_e5m2_t;      // data type of elements in input matrix A
+  using ElementInputKV = cutlass::float_e5m2_t;     // data type of elements in input matrix B
+  using MMAOperation = XE_8x16x16_F32F16F16F32_TT;  //XE_8x16x16_F32BF16BF16F32_TT;
+  using GmemTiledCopyQ = XE_2D_U8x8x32_LD_N;        // XE_2D_U8x8x32_LD_N;
+  using GmemTiledCopyK = XE_2D_U8x16x16_LD_T;       // _T designates a transposed block load operation
+  using GmemTiledCopyV = XE_2D_U8x32x32_LD_V;
 
   constexpr int PipelineStages = 2;
 
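
Note: Q, K and V are fixed here to cutlass::float_e5m2_t (5 exponent bits, 2 mantissa bits), which is why the kernel carries separate q/k/v descale factors. A tiny host-side round trip, shown only to illustrate the quantization error involved (not part of the example; it assumes cutlass/numeric_types.h exposes the FP8 type):

#include <iostream>
#include "cutlass/numeric_types.h"   // cutlass::float_e5m2_t

int main() {
  float x = 0.3f;
  cutlass::float_e5m2_t q(x);                                 // quantize to e5m2
  std::cout << x << " -> " << static_cast<float>(q) << "\n";  // dequantized value is approximate
  return 0;
}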

examples/06_bmg_flash_attention/bmg_flash_chunk_prefill_runner.hpp

Lines changed: 1 addition & 115 deletions
@@ -28,7 +28,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-
 #pragma once
 
 #include "cutlass/epilogue/collective/default_epilogue.hpp"
@@ -220,64 +219,6 @@ template <class FMHAChunkPrefillKernel, bool isVarLen> struct ExampleRunner {
   // Methods
   //
 
-  /*
-  template <typename T>
-  void initialize_block_random(cutlass::DeviceAllocation<T>& block) {
-    if (block.size() == 0) {
-      return;
-    }
-    std::vector<T> host_tensor(block.size());
-    std::mt19937 gen(seed);
-    std::uniform_real_distribution<float> dis(-1.f, 1.f);
-
-    for (size_t i = 0; i < host_tensor.size(); ++i) {
-      host_tensor[i] = static_cast<T>(dis(gen));
-    }
-    block.copy_from_host(host_tensor.data(), host_tensor.size());
-  }
-  */
-
-  template <typename T>
-  void initialize_block_random(cutlass::DeviceAllocation<T>& block) {
-    if (block.size() == 0) {
-      return;
-    }
-    std::vector<T> host_tensor(block.size());
-    std::mt19937 gen(seed);
-    std::uniform_int_distribution<> dis(1, 9);
-
-    for (size_t i = 0; i < host_tensor.size(); ++i) {
-      host_tensor[i] = static_cast<T>(dis(gen));
-    }
-    block.copy_from_host(host_tensor.data(), host_tensor.size());
-  }
-
-  template <typename T>
-  void initialize_block_identity(cutlass::DeviceAllocation<T>& block, int rows, int cols) {
-    if (block.size() == 0) {
-      return;
-    }
-    std::vector<T> host_tensor(block.size(), T(0.f));
-    for (int i = 0; i < rows; ++i) {
-      if (i < cols) {
-        host_tensor[i * cols + i] = T(1.f);
-      }
-    }
-    block.copy_from_host(host_tensor.data(), host_tensor.size());
-  }
-
-  template <typename T>
-  void initialize_block_iota(cutlass::DeviceAllocation<T>& block) {
-    if (block.size() == 0) {
-      return;
-    }
-    std::vector<T> host_tensor(block.size());
-    for (size_t i = 0; i < host_tensor.size(); ++i) {
-      host_tensor[i] = static_cast<T>(static_cast<float>(1.0));
-    }
-    block.copy_from_host(host_tensor.data(), host_tensor.size());
-  }
-
   template <typename SrcType, typename DstType, typename Encoding>
   void run_conversion_kernel(SrcType* src_ptr_in, DstType* dst_ptr_in, int64_t num_elements, float scale) {
     sycl::queue queue = compat::get_default_queue();
@@ -300,28 +241,6 @@ void run_conversion_kernel(SrcType* src_ptr_in, DstType* dst_ptr_in, int64_t num
     });
   }
 
-  template<typename T>
-  void print_device_tensor(const char* name, T* ptr, size_t size, int max_elements_to_print = 1153) {
-    std::cout << "--- " << name << " ---" << std::endl;
-    if (ptr == nullptr || size == 0) {
-      std::cout << "(null)" << std::endl;
-      return;
-    }
-    std::vector<T> host_tensor(size);
-    compat::memcpy(host_tensor.data(), ptr, size * sizeof(T));
-    compat::wait();
-
-    int count = 0;
-    for (const auto& val : host_tensor) {
-      if (count++ >= max_elements_to_print) {
-        std::cout << "..." << std::endl;
-        break;
-      }
-      std::cout << static_cast<float>(val) << " ";
-    }
-    std::cout << std::endl << "--- End " << name << " ---" << std::endl;
-  }
-
   bool verify(ProblemShapeType problem_size, Options options, const float* q_scale, const float* k_scale, const float* v_scale) {
     std::vector<ElementOutput> host_O(block_ref_O.size());
 
@@ -351,7 +270,7 @@ bool verify(ProblemShapeType problem_size, Options options, const float* q_scale
     int offset_o = 0;
 
     using namespace cutlass;
-    using RefElement = bfloat16_t; //half_t;
+    using RefElement = bfloat16_t;
     DeviceAllocation<RefElement> block_Q_ref, block_K_ref, block_V_ref;
 
     // loop over the batch dimension to compute the output
@@ -479,22 +398,6 @@ bool verify(ProblemShapeType problem_size, Options options, const float* q_scale
      }
      compat::wait();
 
-      // Print inputs for the first batch item
-      if (b == 0) {
-        if constexpr (is_fp8_v<ElementQ>) {
-          std::cout << "\n========= FP8 Kernel Inputs (Batch 0) =========\n";
-          print_device_tensor("FP8 Input Q", q_ptr_orig, seq_len_qo * num_heads_q * head_size_qk);
-          print_device_tensor("FP8 Input K", k_ptr_orig, seq_len_kv_total * num_heads_kv * head_size_qk);
-          print_device_tensor("FP8 Input V", v_ptr_orig, seq_len_kv_total * num_heads_kv * head_size_vo);
-          std::cout << "\n========= Reference Kernel Inputs (Batch 0, Descaled) =========\n";
-        } else {
-          std::cout << "\n========= FP16 Kernel and Reference Kernel Inputs (Batch 0) =========\n";
-        }
-        print_device_tensor("Input Q", reinterpret_cast<RefElement*>(q_ptr), seq_len_qo * num_heads_q * head_size_qk);
-        print_device_tensor("Input K", reinterpret_cast<RefElement*>(k_ptr), seq_len_kv_total * num_heads_kv * head_size_qk);
-        print_device_tensor("Input V", reinterpret_cast<RefElement*>(v_ptr), seq_len_kv_total * num_heads_kv * head_size_vo);
-      }
-
      for (int q_group = 0; q_group < num_heads_q / q_group_size; q_group++) {
        for (int q_head = 0; q_head < q_group_size; q_head++) {
          cutlass::DeviceAllocation<ElementAccumulator> block_S;
@@ -646,11 +549,6 @@ bool verify(ProblemShapeType problem_size, Options options, const float* q_scale
     compat::wait();
     compat::memcpy<ElementOutput>(block_ref_O.get(), host_O.data(), host_O.size());
 
-    std::cout << "\n========= Kernel Outputs =========\n";
-    print_device_tensor("Actual Kernel Output (block_O)", block_O.get(), block_O.size());
-    print_device_tensor("Reference Kernel Output (block_ref_O)", block_ref_O.get(), block_ref_O.size());
-    std::cout << "\n==================================\n";
-
     // Check if output from CUTLASS kernel and reference kernel are equal or not
     bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(),
                                                                           block_O.size(), ElementOutput{0.5}, ElementOutput{0.5});
@@ -806,18 +704,6 @@ bool verify(ProblemShapeType problem_size, Options options, const float* q_scale
       block_V_cache.reset(num_pages * paged_kv_cache.page_size * num_heads_kv * head_size_vo);
     }
 
-    /*initialize_block_iota(block_Q);
-    initialize_block_iota(block_K);
-    initialize_block_iota(block_V); //, seq_len_kv, head_size_vo);
-    initialize_block_iota(block_K_cache);
-    initialize_block_iota(block_V_cache); //, seq_len_kv_cache, head_size_vo);*/
-    //
-    /*initialize_block_random(block_Q);
-    initialize_block_random(block_K);
-    initialize_block_random(block_V);
-    initialize_block_random(block_K_cache);
-    initialize_block_random(block_V_cache);*/
-
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
     initialize_block(block_V, seed + 2021);
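
Note: the runner keeps run_conversion_kernel, which descales the FP8 inputs into the RefElement = bfloat16_t buffers consumed by the reference attention. A hypothetical sketch of that kind of elementwise pass in plain SYCL (not the runner's actual kernel; it assumes USM device pointers and that waiting on the returned event is acceptable at the call site):

#include <sycl/sycl.hpp>
#include <cstddef>
#include <cstdint>

// Hypothetical helper, not run_conversion_kernel: element-wise dequantization of a
// device buffer, e.g. FP8 -> BF16, applying one per-tensor scale factor.
template <class SrcType, class DstType>
void descale_on_device(sycl::queue& q, SrcType const* src, DstType* dst,
                       int64_t num_elements, float scale) {
  q.parallel_for(sycl::range<1>(static_cast<std::size_t>(num_elements)),
                 [=](sycl::id<1> i) {
    dst[i] = DstType(static_cast<float>(src[i]) * scale);  // per-tensor descale
  }).wait();
}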
