Commit 23b55cb

adjust kv block sizes
Signed-off-by: Vadim Gimpelson <[email protected]>
Parent: b2e65cb

File tree

2 files changed: +6, -2 lines


vllm/model_executor/models/config.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
-from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.math_utils import cdiv, next_power_of_2, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec

@@ -426,6 +426,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         # user has not set it or (b) the user has set it
         # too small.
         if cache_config.block_size is None or cache_config.block_size < attn_block_size:
+            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.block_size = attn_block_size
             logger.info(
                 "Setting attention block size to %d tokens "

vllm/v1/attention/backends/flashinfer.py

Lines changed: 4 additions & 1 deletion
@@ -173,7 +173,10 @@ def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         # Note: Not sure for all platforms,
         # but on Blackwell, only support a page size of
         # 16, 32, 64
-        return [16, 32, 64]
+        # TODO: 16 is temporarily removed because the TRT-LLM kernel has a bug
+        # when using 16. See https://github.com/flashinfer-ai/flashinfer/issues/1993
+        # for more details.
+        return [32, 64]

     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
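
With 16 dropped from the supported list, a workload that previously mapped to a 16-token page now has to fall back to 32. As an illustration only, a hypothetical helper (pick_kernel_block_size and its selection policy are assumptions, not vllm's actual resolution logic) showing how a required size might be reconciled with the backend's supported sizes:

def pick_kernel_block_size(required: int, supported: list[int]) -> int:
    # Choose the smallest supported kernel block size that covers the
    # requirement; fall back to the largest one if nothing is big enough.
    for size in sorted(supported):
        if size >= required:
            return size
    return max(supported)

# With 16 removed, a request for 16 now resolves to 32:
assert pick_kernel_block_size(16, [32, 64]) == 32
assert pick_kernel_block_size(48, [32, 64]) == 64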
