
Commit a5fa4d8

refactor: rename attention func.
1 parent ffb63b4 commit a5fa4d8

5 files changed: +138 additions, −140 deletions
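Taken together, the diffs below rename the attention entry points at two layers (the MLU wrappers in the mlu namespace and the backend-agnostic dispatchers in xllm::kernel) without changing any signatures. A sketch of the call-site mapping implied by the hunks, argument lists elided:

// Renames introduced by this commit (signatures unchanged):
//   mlu::flash_attention(...)               -> mlu::batch_prefill(...)
//   mlu::single_query_cached_kv_attn(...)   -> mlu::batch_decode(...)
//   xllm::kernel::prefill_attention(params) -> xllm::kernel::batch_prefill(params)
//   xllm::kernel::decode_attention(params)  -> xllm::kernel::batch_decode(params)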

xllm/core/kernels/mlu/attention.cpp

Lines changed: 42 additions & 43 deletions
@@ -28,28 +28,28 @@ void reshape_paged_cache(torch::Tensor& key,
       key, value, k_cache, v_cache, slot_mapping, direction);
 }
 
-void flash_attention(const torch::Tensor& query,
-                     const torch::Tensor& key,
-                     const torch::Tensor& value,
-                     torch::Tensor& output,
-                     torch::Tensor& output_lse,
-                     const std::optional<torch::Tensor>& query_start_loc,
-                     const std::optional<torch::Tensor>& seq_start_loc,
-                     const std::optional<torch::Tensor>& alibi_slope,
-                     const std::optional<torch::Tensor>& attn_bias,
-                     const std::optional<torch::Tensor>& q_quant_scale,
-                     const std::optional<torch::Tensor>& k_quant_scale,
-                     const std::optional<torch::Tensor>& v_quant_scale,
-                     const std::optional<torch::Tensor>& out_quant_scale,
-                     const std::optional<torch::Tensor>& block_table,
-                     int max_query_len,
-                     int max_seq_len,
-                     float scale,
-                     bool is_causal,
-                     int window_size_left,
-                     int window_size_right,
-                     const std::string& compute_dtype,
-                     bool return_lse) {
+void batch_prefill(const torch::Tensor& query,
+                   const torch::Tensor& key,
+                   const torch::Tensor& value,
+                   torch::Tensor& output,
+                   torch::Tensor& output_lse,
+                   const std::optional<torch::Tensor>& query_start_loc,
+                   const std::optional<torch::Tensor>& seq_start_loc,
+                   const std::optional<torch::Tensor>& alibi_slope,
+                   const std::optional<torch::Tensor>& attn_bias,
+                   const std::optional<torch::Tensor>& q_quant_scale,
+                   const std::optional<torch::Tensor>& k_quant_scale,
+                   const std::optional<torch::Tensor>& v_quant_scale,
+                   const std::optional<torch::Tensor>& out_quant_scale,
+                   const std::optional<torch::Tensor>& block_table,
+                   int max_query_len,
+                   int max_seq_len,
+                   float scale,
+                   bool is_causal,
+                   int window_size_left,
+                   int window_size_right,
+                   const std::string& compute_dtype,
+                   bool return_lse) {
   tmo::torch_api::flash_attention(query,
                                   key,
                                   value,
@@ -74,27 +74,26 @@ void flash_attention(const torch::Tensor& query,
                                   return_lse);
 }
 
-void single_query_cached_kv_attn(
-    const torch::Tensor& query,
-    const torch::Tensor& k_cache,
-    torch::Tensor& output,
-    const torch::Tensor& block_table,
-    const torch::Tensor& seq_lens,
-    const torch::Tensor& v_cache,
-    torch::Tensor& output_lse,
-    const std::optional<torch::Tensor>& q_quant_scale,
-    const std::optional<torch::Tensor>& k_cache_quant_scale,
-    const std::optional<torch::Tensor>& v_cache_quant_scale,
-    const std::optional<torch::Tensor>& out_quant_scale,
-    const std::optional<torch::Tensor>& alibi_slope,
-    const std::optional<torch::Tensor>& mask,
-    const std::string& compute_dtype,
-    int max_seq_len,
-    int window_size_left,
-    int window_size_right,
-    float scale,
-    bool return_lse,
-    int kv_cache_quant_bit_size) {
+void batch_decode(const torch::Tensor& query,
+                  const torch::Tensor& k_cache,
+                  torch::Tensor& output,
+                  const torch::Tensor& block_table,
+                  const torch::Tensor& seq_lens,
+                  const torch::Tensor& v_cache,
+                  torch::Tensor& output_lse,
+                  const std::optional<torch::Tensor>& q_quant_scale,
+                  const std::optional<torch::Tensor>& k_cache_quant_scale,
+                  const std::optional<torch::Tensor>& v_cache_quant_scale,
+                  const std::optional<torch::Tensor>& out_quant_scale,
+                  const std::optional<torch::Tensor>& alibi_slope,
+                  const std::optional<torch::Tensor>& mask,
+                  const std::string& compute_dtype,
+                  int max_seq_len,
+                  int window_size_left,
+                  int window_size_right,
+                  float scale,
+                  bool return_lse,
+                  int kv_cache_quant_bit_size) {
   tmo::torch_api::single_query_cached_kv_attn(query,
                                               k_cache,
                                               output,
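For orientation, a minimal sketch of how the renamed prefill wrapper might be called for a varlen batch follows. The tensor shapes, the int32 cumulative-offset convention for query_start_loc/seq_start_loc, the "bfloat16" compute_dtype string, and the -1 "no sliding window" convention are illustrative assumptions, not taken from this diff; the wrapper itself still forwards to tmo::torch_api::flash_attention exactly as before, only its xllm-side name changes.

#include <cmath>
#include <torch/torch.h>
// #include "xllm/core/kernels/mlu/mlu_ops_api.h"  // declares mlu::batch_prefill after this commit

// Minimal sketch, not a verified call: shapes, dtype, device placement and
// the meaning of the window/scale arguments are assumptions for illustration.
void prefill_sketch() {
  const int64_t total_tokens = 32, num_heads = 8, head_dim = 128;
  auto q = torch::randn({total_tokens, num_heads, head_dim});
  auto k = torch::randn({total_tokens, num_heads, head_dim});
  auto v = torch::randn({total_tokens, num_heads, head_dim});
  auto out = torch::empty_like(q);
  torch::Tensor out_lse;  // stays empty because return_lse is false below

  // Two sequences of 16 tokens each, expressed as cumulative int32 offsets.
  auto q_start_loc = torch::tensor({0, 16, 32}, torch::kInt32);
  auto kv_start_loc = q_start_loc.clone();

  mlu::batch_prefill(q, k, v, out, out_lse,
                     q_start_loc, kv_start_loc,
                     /*alibi_slope=*/std::nullopt,
                     /*attn_bias=*/std::nullopt,
                     /*q_quant_scale=*/std::nullopt,
                     /*k_quant_scale=*/std::nullopt,
                     /*v_quant_scale=*/std::nullopt,
                     /*out_quant_scale=*/std::nullopt,
                     /*block_table=*/std::nullopt,
                     /*max_query_len=*/16,
                     /*max_seq_len=*/16,
                     /*scale=*/1.0f / std::sqrt(static_cast<float>(head_dim)),
                     /*is_causal=*/true,
                     /*window_size_left=*/-1,       // assumed "no sliding window"
                     /*window_size_right=*/-1,
                     /*compute_dtype=*/"bfloat16",  // assumed value
                     /*return_lse=*/false);
}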

xllm/core/kernels/mlu/mlu_ops_api.h

Lines changed: 43 additions & 44 deletions
@@ -58,50 +58,49 @@ void reshape_paged_cache(torch::Tensor& key,
                          const torch::Tensor& slot_mapping,
                          bool direction);
 
-void flash_attention(const torch::Tensor& query,
-                     const torch::Tensor& key,
-                     const torch::Tensor& value,
-                     torch::Tensor& output,
-                     torch::Tensor& output_lse,
-                     const std::optional<torch::Tensor>& query_start_loc,
-                     const std::optional<torch::Tensor>& seq_start_loc,
-                     const std::optional<torch::Tensor>& alibi_slope,
-                     const std::optional<torch::Tensor>& attn_bias,
-                     const std::optional<torch::Tensor>& q_quant_scale,
-                     const std::optional<torch::Tensor>& k_quant_scale,
-                     const std::optional<torch::Tensor>& v_quant_scale,
-                     const std::optional<torch::Tensor>& out_quant_scale,
-                     const std::optional<torch::Tensor>& block_tables,
-                     int max_query_len,
-                     int max_seq_len,
-                     float scale,
-                     bool is_causal,
-                     int window_size_left,
-                     int window_size_right,
-                     const std::string& compute_dtype,
-                     bool return_lse);
-
-void single_query_cached_kv_attn(
-    const torch::Tensor& query,
-    const torch::Tensor& k_cache,
-    torch::Tensor& output,
-    const torch::Tensor& block_table,
-    const torch::Tensor& seq_lens,
-    const torch::Tensor& v_cache,
-    torch::Tensor& output_lse,
-    const std::optional<torch::Tensor>& q_quant_scale,
-    const std::optional<torch::Tensor>& k_cache_quant_scale,
-    const std::optional<torch::Tensor>& v_cache_quant_scale,
-    const std::optional<torch::Tensor>& out_quant_scale,
-    const std::optional<torch::Tensor>& alibi_slope,
-    const std::optional<torch::Tensor>& mask,
-    const std::string& compute_dtype,
-    int max_seq_len,
-    int window_size_left,
-    int window_size_right,
-    float scale,
-    bool return_lse,
-    int kv_cache_quant_bit_size);
+void batch_prefill(const torch::Tensor& query,
+                   const torch::Tensor& key,
+                   const torch::Tensor& value,
+                   torch::Tensor& output,
+                   torch::Tensor& output_lse,
+                   const std::optional<torch::Tensor>& query_start_loc,
+                   const std::optional<torch::Tensor>& seq_start_loc,
+                   const std::optional<torch::Tensor>& alibi_slope,
+                   const std::optional<torch::Tensor>& attn_bias,
+                   const std::optional<torch::Tensor>& q_quant_scale,
+                   const std::optional<torch::Tensor>& k_quant_scale,
+                   const std::optional<torch::Tensor>& v_quant_scale,
+                   const std::optional<torch::Tensor>& out_quant_scale,
+                   const std::optional<torch::Tensor>& block_tables,
+                   int max_query_len,
+                   int max_seq_len,
+                   float scale,
+                   bool is_causal,
+                   int window_size_left,
+                   int window_size_right,
+                   const std::string& compute_dtype,
+                   bool return_lse);
+
+void batch_decode(const torch::Tensor& query,
+                  const torch::Tensor& k_cache,
+                  torch::Tensor& output,
+                  const torch::Tensor& block_table,
+                  const torch::Tensor& seq_lens,
+                  const torch::Tensor& v_cache,
+                  torch::Tensor& output_lse,
+                  const std::optional<torch::Tensor>& q_quant_scale,
+                  const std::optional<torch::Tensor>& k_cache_quant_scale,
+                  const std::optional<torch::Tensor>& v_cache_quant_scale,
+                  const std::optional<torch::Tensor>& out_quant_scale,
+                  const std::optional<torch::Tensor>& alibi_slope,
+                  const std::optional<torch::Tensor>& mask,
+                  const std::string& compute_dtype,
+                  int max_seq_len,
+                  int window_size_left,
+                  int window_size_right,
+                  float scale,
+                  bool return_lse,
+                  int kv_cache_quant_bit_size);
 
 void fused_layernorm(const torch::Tensor& input,
                      torch::Tensor& output,
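The header pairs batch_prefill with batch_decode, which reads keys and values from a paged cache addressed by block_table and seq_lens. A companion sketch of a single decode step follows; the cache layout, block size, dtype, and the zero value for kv_cache_quant_bit_size are all assumptions for illustration, not taken from this commit.

#include <cmath>
#include <torch/torch.h>
// #include "xllm/core/kernels/mlu/mlu_ops_api.h"  // declares mlu::batch_decode after this commit

// Minimal sketch, not a verified call: the paged-cache layout
// [num_blocks, block_size, num_heads, head_dim] and all concrete values
// are assumptions for illustration only.
void decode_sketch() {
  const int64_t num_seqs = 2, num_heads = 8, head_dim = 128;
  const int64_t num_blocks = 64, block_size = 16;

  auto q = torch::randn({num_seqs, 1, num_heads, head_dim});  // one new token per sequence
  auto out = torch::empty_like(q);
  torch::Tensor out_lse;  // unused because return_lse is false below
  auto k_cache = torch::randn({num_blocks, block_size, num_heads, head_dim});
  auto v_cache = torch::randn({num_blocks, block_size, num_heads, head_dim});

  auto block_table = torch::zeros({num_seqs, 4}, torch::kInt32);  // physical block ids per sequence
  auto seq_lens = torch::tensor({9, 17}, torch::kInt32);          // tokens already held in the cache

  mlu::batch_decode(q, k_cache, out, block_table, seq_lens, v_cache, out_lse,
                    /*q_quant_scale=*/std::nullopt,
                    /*k_cache_quant_scale=*/std::nullopt,
                    /*v_cache_quant_scale=*/std::nullopt,
                    /*out_quant_scale=*/std::nullopt,
                    /*alibi_slope=*/std::nullopt,
                    /*mask=*/std::nullopt,
                    /*compute_dtype=*/"bfloat16",    // assumed value
                    /*max_seq_len=*/64,
                    /*window_size_left=*/-1,         // assumed "no sliding window"
                    /*window_size_right=*/-1,
                    /*scale=*/1.0f / std::sqrt(static_cast<float>(head_dim)),
                    /*return_lse=*/false,
                    /*kv_cache_quant_bit_size=*/0);  // assumed "cache not quantized"
}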

xllm/core/kernels/ops_api.cpp

Lines changed: 48 additions & 48 deletions
@@ -69,67 +69,67 @@ void reshape_paged_cache(ReshapePagedCacheParams& params) {
 #endif
 }
 
-void prefill_attention(AttentionParams& params) {
+void batch_prefill(AttentionParams& params) {
 #if defined(USE_MLU)
   torch::Tensor lse = params.output_lse.value_or(torch::Tensor());
-  mlu::flash_attention(params.query,
-                       params.key,
-                       params.value,
-                       params.output,
-                       lse,
-                       params.query_start_loc,
-                       params.seq_start_loc,
-                       params.alibi_slope,
-                       params.attn_bias,
-                       params.q_quant_scale,
-                       params.k_quant_scale,
-                       params.v_quant_scale,
-                       params.out_quant_scale,
-                       params.block_table,
-                       params.max_query_len,
-                       params.max_seq_len,
-                       params.scale,
-                       params.is_causal,
-                       params.window_size_left,
-                       params.window_size_right,
-                       params.compute_dtype,
-                       params.return_lse);
+  mlu::batch_prefill(params.query,
+                     params.key,
+                     params.value,
+                     params.output,
+                     lse,
+                     params.query_start_loc,
+                     params.seq_start_loc,
+                     params.alibi_slope,
+                     params.attn_bias,
+                     params.q_quant_scale,
+                     params.k_quant_scale,
+                     params.v_quant_scale,
+                     params.out_quant_scale,
+                     params.block_table,
+                     params.max_query_len,
+                     params.max_seq_len,
+                     params.scale,
+                     params.is_causal,
+                     params.window_size_left,
+                     params.window_size_right,
+                     params.compute_dtype,
+                     params.return_lse);
   params.output_lse = lse;
 #elif defined(USE_CUDA)
-  throw std::runtime_error("prefill_attention for cuda not implemented");
+  throw std::runtime_error("batch_prefill for cuda not implemented");
 #else
-  throw std::runtime_error("prefill_attention not implemented");
+  throw std::runtime_error("batch_prefill not implemented");
 #endif
 }
 
-void decode_attention(AttentionParams& params) {
+void batch_decode(AttentionParams& params) {
 #if defined(USE_MLU)
   torch::Tensor lse = params.output_lse.value_or(torch::Tensor());
-  mlu::single_query_cached_kv_attn(params.query,
-                                   params.k_cache,
-                                   params.output,
-                                   params.block_table,
-                                   params.seq_lens,
-                                   params.v_cache,
-                                   lse,
-                                   params.q_quant_scale,
-                                   params.k_cache_quant_scale,
-                                   params.v_cache_quant_scale,
-                                   params.out_quant_scale,
-                                   params.alibi_slope,
-                                   params.mask,
-                                   params.compute_dtype,
-                                   params.max_seq_len,
-                                   params.window_size_left,
-                                   params.window_size_right,
-                                   params.scale,
-                                   params.return_lse,
-                                   params.kv_cache_quant_bit_size);
+  mlu::batch_decode(params.query,
+                    params.k_cache,
+                    params.output,
+                    params.block_table,
+                    params.seq_lens,
+                    params.v_cache,
+                    lse,
+                    params.q_quant_scale,
+                    params.k_cache_quant_scale,
+                    params.v_cache_quant_scale,
+                    params.out_quant_scale,
+                    params.alibi_slope,
+                    params.mask,
+                    params.compute_dtype,
+                    params.max_seq_len,
+                    params.window_size_left,
+                    params.window_size_right,
+                    params.scale,
+                    params.return_lse,
+                    params.kv_cache_quant_bit_size);
   params.output_lse = lse;
 #elif defined(USE_CUDA)
-  throw std::runtime_error("decode_attention for cuda not implemented");
+  throw std::runtime_error("batch_decode for cuda not implemented");
 #else
-  throw std::runtime_error("decode_attention not implemented");
+  throw std::runtime_error("batch_decode not implemented");
 #endif
 }
 
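At the dispatch layer the only visible change is the function names; callers still pass a single AttentionParams. A rough sketch of driving the renamed dispatcher, restricted to fields that appear in the diff above (the AttentionParams definition is outside this commit, so its namespace, field types, and the concrete values below are assumptions):

#include <cmath>
#include <torch/torch.h>
// #include "xllm/core/kernels/ops_api.h"  // declares xllm::kernel::batch_prefill / batch_decode

// Sketch only: AttentionParams is defined elsewhere in xllm; the fields set
// here are the ones the dispatcher forwards in the diff above.
void run_prefill_sketch(xllm::kernel::AttentionParams& params,
                        const torch::Tensor& q,
                        const torch::Tensor& k,
                        const torch::Tensor& v,
                        torch::Tensor& out) {
  params.query = q;
  params.key = k;
  params.value = v;
  params.output = out;
  params.max_query_len = 16;              // illustrative values
  params.max_seq_len = 16;
  params.scale = 1.0f / std::sqrt(128.0f);
  params.is_causal = true;
  params.return_lse = false;

  // Formerly xllm::kernel::prefill_attention(params); on MLU builds this now
  // forwards to mlu::batch_prefill, while CUDA and other builds still throw.
  xllm::kernel::batch_prefill(params);
}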
xllm/core/kernels/ops_api.h

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ namespace kernel {
 void apply_rotary(RotaryParams& params);
 void active(ActivationParams& params);
 void reshape_paged_cache(ReshapePagedCacheParams& params);
-void prefill_attention(AttentionParams& params);
-void decode_attention(AttentionParams& params);
+void batch_prefill(AttentionParams& params);
+void batch_decode(AttentionParams& params);
 void fused_layernorm(FusedLayerNormParams& params);
 torch::Tensor matmul(MatmulParams& params);
 torch::Tensor fused_moe(FusedMoEParams& params);

xllm/core/layers/mlu/attention.cpp

Lines changed: 3 additions & 3 deletions
@@ -105,7 +105,7 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>> AttentionImpl::forward(
     attention_params.seq_start_loc = attn_metadata.seq_start_loc;
     attention_params.max_query_len = attn_metadata.max_query_len;
 
-    xllm::kernel::prefill_attention(attention_params);
+    xllm::kernel::batch_prefill(attention_params);
   } else if (attn_metadata.is_chunked_prefill) {
     attention_params.key = k_cache;
     attention_params.value = v_cache;
@@ -114,7 +114,7 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>> AttentionImpl::forward(
     attention_params.max_query_len = attn_metadata.max_query_len;
     attention_params.block_table = attn_metadata.block_table;
 
-    xllm::kernel::prefill_attention(attention_params);
+    xllm::kernel::batch_prefill(attention_params);
   } else {
     query = query.view({-1, 1, num_heads_, head_size_});
     output = output.view({-1, 1, num_heads_, head_size_});
@@ -134,7 +134,7 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>> AttentionImpl::forward(
     attention_params.paged_kv_last_page_len =
         attn_metadata.paged_kv_last_page_len;
 
-    xllm::kernel::decode_attention(attention_params);
+    xllm::kernel::batch_decode(attention_params);
   }
 
   output = output.view({-1, num_heads_ * head_size_});
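Condensing the three hunks above, the forward() path now dispatches as follows. This is a fragment rather than a standalone function: only the lines visible in the hunks are verbatim, and the guard of the first branch is paraphrased because it lies outside the diff context (treat the flag name as an assumption).

// Fragment, not self-contained: surrounding types and the first guard are assumed.
if (is_prefill /* assumed flag on attn_metadata */) {
  attention_params.seq_start_loc = attn_metadata.seq_start_loc;
  attention_params.max_query_len = attn_metadata.max_query_len;
  xllm::kernel::batch_prefill(attention_params);          // was prefill_attention
} else if (attn_metadata.is_chunked_prefill) {
  attention_params.key = k_cache;                          // chunked prefill attends over the cache
  attention_params.value = v_cache;
  attention_params.block_table = attn_metadata.block_table;
  xllm::kernel::batch_prefill(attention_params);           // was prefill_attention
} else {
  query = query.view({-1, 1, num_heads_, head_size_});     // one token per sequence for decode
  output = output.view({-1, 1, num_heads_, head_size_});
  xllm::kernel::batch_decode(attention_params);            // was decode_attention
}
output = output.view({-1, num_heads_ * head_size_});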
