Commit 23b55cb

adjust kv block sizes
Signed-off-by: Vadim Gimpelson <[email protected]>
Parent: b2e65cb

File tree

2 files changed: +6, -2 lines


vllm/model_executor/models/config.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
-from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.math_utils import cdiv, next_power_of_2, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec

@@ -426,6 +426,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         # user has not set it or (b) the user has set it
         # too small.
         if cache_config.block_size is None or cache_config.block_size < attn_block_size:
+            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.block_size = attn_block_size
             logger.info(
                 "Setting attention block size to %d tokens "

vllm/v1/attention/backends/flashinfer.py

Lines changed: 4 additions & 1 deletion
@@ -173,7 +173,10 @@ def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         # Note: Not sure for all platforms,
         # but on Blackwell, only support a page size of
         # 16, 32, 64
-        return [16, 32, 64]
+        # TODO: 16 is temporarily removed because the TRT-LLM kernel has a bug
+        # when using 16. See https://github.com/flashinfer-ai/flashinfer/issues/1993
+        # for more details.
+        return [32, 64]

     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
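
With 16 dropped from the supported list, a workload that previously mapped to a 16-token page now has to fall back to 32. As an illustration only, a hypothetical helper (pick_kernel_block_size and its selection policy are assumptions, not vllm's actual resolution logic) showing how a required size might be reconciled with the backend's supported sizes:

def pick_kernel_block_size(required: int, supported: list[int]) -> int:
    # Choose the smallest supported kernel block size that covers the
    # requirement; fall back to the largest one if nothing is big enough.
    for size in sorted(supported):
        if size >= required:
            return size
    return max(supported)

# With 16 removed, a request for 16 now resolves to 32:
assert pick_kernel_block_size(16, [32, 64]) == 32
assert pick_kernel_block_size(48, [32, 64]) == 64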
