diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index e71d4ca4629d..d8cb41b8de24 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -302,11 +302,7 @@ def __init__(
         self.disable_split_kv = False
 
         self.compilation_config = vllm_config.compilation_config
-        max_num_pages_per_req = cdiv(
-            self.model_config.max_model_len, self.kv_cache_spec.block_size
-        )
         max_num_reqs = vllm_config.scheduler_config.max_num_seqs
-        max_num_pages = max_num_reqs * max_num_pages_per_req
         speculative_config = vllm_config.speculative_config
         num_spec_tokens = (
             speculative_config.num_speculative_tokens
@@ -333,7 +329,21 @@ def __init__(
         self.num_kv_heads = self.kv_cache_spec.num_kv_heads
         self.head_dim = self.kv_cache_spec.head_size
         FlashInferBackend.validate_head_size(self.head_dim)
-        self.page_size = self.kv_cache_spec.block_size
+
+        # IMPORTANT: page_size must match the actual kernel block size,
+        # not the KV manager block size!
+        # When using hybrid blocks (self.kv_cache_spec.block_size != kernel_block_size),
+        # the KV cache is allocated with kernel_block_size, so we must use that
+        # for page_size when calling FlashInfer.
+        kernel_block_size = self.kv_cache_spec.find_compatible_kernel_block_sizes(
+            FlashInferBackend, return_all=False
+        )[0]
+        self.page_size = kernel_block_size
+
+        # Calculate buffer sizes using the actual kernel block size (page_size)
+        # to ensure buffers are correctly sized in hybrid mode
+        max_num_pages_per_req = cdiv(self.model_config.max_model_len, self.page_size)
+        max_num_pages = max_num_reqs * max_num_pages_per_req
 
         self.cache_dtype = self.cache_config.cache_dtype
         if self.cache_dtype.startswith("fp8"):
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 0f564fdb3b08..dc8935fa86eb 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -4,6 +4,7 @@
 import copy
 from dataclasses import dataclass, fields
 from math import prod
+from typing import TYPE_CHECKING
 
 import torch
 from typing_extensions import Self
@@ -13,6 +14,9 @@
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import get_dtype_size
 
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
+
 logger = init_logger(__name__)
 
 
@@ -71,6 +75,46 @@ def page_size_bytes(self) -> int:
             * get_dtype_size(self.dtype)
         )
 
+    def find_compatible_kernel_block_sizes(
+        self, backend_cls: "type[AttentionBackend]", return_all: bool = False
+    ) -> list[int]:
+        """Find compatible kernel block sizes for this spec and backend.
+
+        Args:
+            backend_cls: The attention backend class
+            return_all: If True, return all compatible sizes;
+                If False, return only the max
+
+        Returns:
+            List of compatible kernel block sizes
+
+        Raises:
+            ValueError: If no compatible block size found
+        """
+        from vllm.attention.backends.abstract import MultipleOf
+
+        kv_manager_block_size = self.block_size
+        supported_sizes = backend_cls.get_supported_kernel_block_size()
+        compatible_sizes = []
+
+        for block_size in supported_sizes:
+            if isinstance(block_size, int) and kv_manager_block_size % block_size == 0:
+                compatible_sizes.append(block_size)
+            elif (
+                isinstance(block_size, MultipleOf)
+                and kv_manager_block_size % block_size.base == 0
+            ):
+                compatible_sizes.append(kv_manager_block_size)
+
+        if not compatible_sizes:
+            raise ValueError(
+                f"No compatible kernel block size found for "
+                f"kv_manager_block_size={kv_manager_block_size}, "
+                f"supported_sizes={supported_sizes}"
+            )
+
+        return compatible_sizes if return_all else [max(compatible_sizes)]
+
 
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6759fe630e62..e51cc445c634 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -19,7 +19,7 @@
 
 import vllm.envs as envs
 from vllm.attention import Attention, AttentionType
-from vllm.attention.backends.abstract import AttentionBackend, MultipleOf
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
@@ -4146,44 +4146,6 @@ def calculate_reorder_batch_threshold(self) -> None:
                 else:
                     self.reorder_batch_threshold = reorder_batch_threshold_i
 
-    def _find_compatible_block_sizes(
-        self,
-        kv_manager_block_size: int,
-        backend_cls: type[AttentionBackend],
-        return_all: bool = False,
-    ) -> list[int]:
-        """
-        Find compatible block sizes for a backend.
-
-        Args:
-            kv_manager_block_size: Physical block size of KV cache
-            backend_cls: Attention backend class
-            return_all: Return all compatible sizes if True, max size if False
-
-        Returns:
-            Compatible block size(s) based on return_all parameter
-
-        Raises:
-            ValueError: If no compatible block size found
-        """
-        supported_block_size = backend_cls.get_supported_kernel_block_size()
-        compatible_sizes = []
-
-        for block_size in supported_block_size:
-            if isinstance(block_size, int):
-                if kv_manager_block_size % block_size == 0:
-                    compatible_sizes.append(block_size)
-            elif (
-                isinstance(block_size, MultipleOf)
-                and kv_manager_block_size % block_size.base == 0
-            ):
-                compatible_sizes.append(kv_manager_block_size)
-
-        if not compatible_sizes:
-            raise ValueError(f"No compatible block size for {kv_manager_block_size}")
-
-        return compatible_sizes if return_all else [max(compatible_sizes)]
-
     def _select_common_block_size(
         self, kv_manager_block_size: int, attn_groups: list[AttentionGroup]
     ) -> int:
@@ -4204,8 +4166,13 @@ def _select_common_block_size(
 
         all_backend_supports = []
         for attn_group in attn_groups:
-            compatible_sizes = self._find_compatible_block_sizes(
-                kv_manager_block_size, attn_group.backend, return_all=True
+            kv_cache_spec = attn_group.kv_cache_spec
+            assert isinstance(kv_cache_spec, AttentionSpec), (
+                f"Expected AttentionSpec for attention group {attn_group.layer_names}, "
+                f"but got {type(kv_cache_spec)}"
+            )
+            compatible_sizes = kv_cache_spec.find_compatible_kernel_block_sizes(
+                attn_group.backend, return_all=True
             )
             supported_sizes = sorted(list(set(compatible_sizes)), reverse=True)
             all_backend_supports.append(set(supported_sizes))
@@ -4388,8 +4355,8 @@ def _reshape_kv_cache_tensors(
                 if isinstance(kv_cache_spec, AttentionSpec):
                     has_attn = True
                     kv_manager_block_size = kv_cache_spec.block_size
-                    kernel_size_list = self._find_compatible_block_sizes(
-                        kv_manager_block_size, attn_backend, return_all=False
+                    kernel_size_list = kv_cache_spec.find_compatible_kernel_block_sizes(
+                        attn_backend, return_all=False
                     )
                     kernel_size = kernel_size_list[0]
                     num_blocks_per_kv_block = kv_manager_block_size // kernel_size
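For reviewers, the sketch below restates the selection rule that the new `AttentionSpec.find_compatible_kernel_block_sizes()` helper implements and that the FlashInfer builder now uses to derive `page_size`. It is a minimal, standalone illustration: the `MultipleOf` stand-in and the example backend sizes are assumptions for demonstration only, not vLLM's actual backend declarations.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class MultipleOf:
    # Stand-in for vllm.attention.backends.abstract.MultipleOf:
    # "any kernel block size that is a multiple of `base`".
    base: int


def find_compatible_kernel_block_sizes(
    kv_manager_block_size: int,
    supported_sizes: list[int | MultipleOf],
    return_all: bool = False,
) -> list[int]:
    """Mirror of the selection rule added in the patch above."""
    compatible: list[int] = []
    for size in supported_sizes:
        if isinstance(size, int) and kv_manager_block_size % size == 0:
            # A fixed kernel block size is compatible when it evenly
            # divides the KV-manager block size.
            compatible.append(size)
        elif isinstance(size, MultipleOf) and kv_manager_block_size % size.base == 0:
            # A MultipleOf constraint means the kernel can run with the
            # KV-manager block size itself, as long as it is a multiple
            # of the declared base.
            compatible.append(kv_manager_block_size)
    if not compatible:
        raise ValueError(
            f"No compatible kernel block size for {kv_manager_block_size}"
        )
    return compatible if return_all else [max(compatible)]


# Hypothetical backend: fixed sizes 16 and 32, plus any multiple of 16.
supported = [16, 32, MultipleOf(16)]
print(find_compatible_kernel_block_sizes(64, supported, return_all=True))  # [16, 32, 64]
print(find_compatible_kernel_block_sizes(64, supported))  # [64]
```

Under these assumed inputs, a KV-manager block size of 64 yields the compatible set `[16, 32, 64]`, and the default `return_all=False` path picks the maximum, 64. The FlashInfer change then computes `max_num_pages_per_req = cdiv(max_model_len, page_size)` from that kernel page size rather than from the KV-manager block size, which is what keeps the paged buffers correctly sized in hybrid mode.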