[Core] Prefix cache: frequency- and cost-aware eviction (opt-in)

youkaichao · Aminsed · commit 3e40ef60f029 · 2025-10-26T20:39:17.000-04:00
Signed-off-by: Amin Sedaghat <amin32846@gmail.com>
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -1225,7 +1225,6 @@ typeshed-client==2.8.2
     # via jsonargparse
 typing-extensions==4.15.0
     # via
-    #   aiosignal
     #   albumentations
     #   alembic
     #   chz
diff --git a/tests/v1/core/test_eviction_policies.py b/tests/v1/core/test_eviction_policies.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import time
+
+import pytest
+
+from vllm.v1.core.eviction_policies import FrequencyCostEvictionPolicy
+from vllm.v1.core.kv_cache_utils import KVCacheBlock
+
+pytestmark = pytest.mark.cpu_test
+
+
+def test_frequency_cost_eviction_orders_by_score():
+    """Candidates come back in ascending retention-score order."""
+    policy = FrequencyCostEvictionPolicy(block_size=16, alpha=2.0)
+
+    now = time.monotonic()
+    # Three cached-free blocks as (age in seconds, access count); with
+    # time decay off, score is proportional to access / age: 0.1, 0.2, 2.0.
+    for i, (age, access) in enumerate([(10.0, 1), (5.0, 1), (5.0, 10)]):
+        b = KVCacheBlock(block_id=i)
+        # mark as free and cached by simulating a non-None hash
+        b._block_hash = b"dummy_hash"  # type: ignore[attr-defined]
+        b.ref_cnt = 0
+        # manually set tracking attributes used by the policy
+        b.first_access_ts = now - age  # type: ignore[attr-defined]
+        b.access_count = access  # type: ignore[attr-defined]
+        policy.on_block_release(b)
+
+    evicted = policy.get_eviction_candidates(3)
+    # Lowest score is evicted first (age=10, access=1); the most frequently
+    # accessed recent block (age=5, access=10) must be retained longest.
+    # Assert the full order, not just membership, so ranking bugs are caught.
+    assert evicted == [0, 1, 2]
+
+
+def test_policy_remove_block():
+    """A block removed from the policy is never offered for eviction."""
+    tracker = FrequencyCostEvictionPolicy(block_size=16)
+    block = KVCacheBlock(block_id=42)
+    block._block_hash = b"dummy"  # type: ignore[attr-defined]
+    block.ref_cnt = 0
+    block.first_access_ts = time.monotonic() - 1.0  # type: ignore[attr-defined]
+    block.access_count = 5  # type: ignore[attr-defined]
+    tracker.on_block_release(block)
+
+    # Once dropped, its stale heap entry must be skipped on the next pop.
+    tracker.remove_block(block)
+    assert 42 not in tracker.get_eviction_candidates(1)
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
@@ -24,6 +24,7 @@
 CacheDType = Literal["auto", "bfloat16", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
 MambaDType = Literal["auto", "float32"]
 PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
+PrefixCacheEvictionPolicy = Literal["lru", "frequency_cost"]
 
 
 @config
@@ -126,6 +127,17 @@ class CacheConfig:
     gpu_memory_utilization. Note that kv_cache_memory_bytes
     (when not-None) ignores gpu_memory_utilization"""
 
+    # Eviction policy for prefix caching (experimental, opt-in)
+    prefix_cache_eviction_policy: PrefixCacheEvictionPolicy = "lru"
+    """Eviction policy for prefix caching free cached blocks. Default is LRU.
+    Set to "frequency_cost" to enable frequency × cost-aware eviction."""
+
+    eviction_cost_alpha: float = 2.0
+    """Alpha exponent for the compute cost term (block_size^alpha)."""
+
+    eviction_time_decay: float = 0.0
+    """Optional exponential time decay factor for the frequency term."""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -417,6 +417,12 @@ class EngineArgs:
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
     )
+    # Eviction policy flags for prefix caching
+    prefix_cache_eviction_policy: Literal["lru", "frequency_cost"] = (
+        CacheConfig.prefix_cache_eviction_policy
+    )
+    eviction_cost_alpha: float = CacheConfig.eviction_cost_alpha
+    eviction_time_decay: float = CacheConfig.eviction_time_decay
     disable_sliding_window: bool = ModelConfig.disable_sliding_window
     disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
     swap_space: float = CacheConfig.swap_space
@@ -881,6 +887,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         cache_group.add_argument(
             "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
         )
+        cache_group.add_argument(
+            "--prefix-cache-eviction-policy",
+            **cache_kwargs["prefix_cache_eviction_policy"],
+        )
+        cache_group.add_argument(
+            "--eviction-cost-alpha", **cache_kwargs["eviction_cost_alpha"]
+        )
+        cache_group.add_argument(
+            "--eviction-time-decay", **cache_kwargs["eviction_time_decay"]
+        )
         cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
         cache_group.add_argument(
             "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
@@ -1386,6 +1402,9 @@ def create_engine_config(
             sliding_window=sliding_window,
             enable_prefix_caching=self.enable_prefix_caching,
             prefix_caching_hash_algo=self.prefix_caching_hash_algo,
+            prefix_cache_eviction_policy=self.prefix_cache_eviction_policy,
+            eviction_cost_alpha=self.eviction_cost_alpha,
+            eviction_time_decay=self.eviction_time_decay,
             cpu_offload_gb=self.cpu_offload_gb,
             calculate_kv_scales=self.calculate_kv_scales,
             kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
@@ -11,6 +11,7 @@
     KVCacheEvent,
 )
 from vllm.logger import init_logger
+from vllm.v1.core.eviction_policies import FrequencyCostEvictionPolicy
 from vllm.v1.core.kv_cache_utils import (
     BlockHash,
     BlockHashWithGroupId,
@@ -166,6 +167,25 @@ def __init__(
         self.enable_kv_cache_events = enable_kv_cache_events
         self.kv_event_queue: list[KVCacheEvent] = []
 
+        # Optional frequency-cost policy (set via configure_eviction_policy)
+        self._policy: FrequencyCostEvictionPolicy | None = None
+
+    def configure_eviction_policy(
+        self,
+        policy: str,
+        *,
+        block_size: int,
+        alpha: float = 2.0,
+        time_decay: float = 0.0,
+    ) -> None:
+        """Select the eviction policy; any name but "frequency_cost" means LRU."""
+        if policy != "frequency_cost":
+            self._policy = None
+            return
+        self._policy = FrequencyCostEvictionPolicy(
+            block_size=block_size, alpha=alpha, time_decay_factor=time_decay
+        )
+
     def get_cached_block(
         self, block_hash: BlockHash, kv_cache_group_ids: list[int]
     ) -> list[KVCacheBlock] | None:
@@ -278,19 +298,65 @@ def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
         if num_blocks > self.get_num_free_blocks():
             raise ValueError(f"Cannot get {num_blocks} free blocks from the pool")
 
-        ret: list[KVCacheBlock] = self.free_block_queue.popleft_n(num_blocks)
-
-        # In order to only iterate the list once, we duplicated code a bit
+        # Fast path: no policy configured -> original LRU behavior
+        if self._policy is None:
+            ret: list[KVCacheBlock] = self.free_block_queue.popleft_n(num_blocks)
+            if self.enable_caching:
+                for block in ret:
+                    self._maybe_evict_cached_block(block)
+                    assert block.ref_cnt == 0
+                    block.ref_cnt += 1
+            else:
+                for block in ret:
+                    assert block.ref_cnt == 0
+                    block.ref_cnt += 1
+            return ret
+
+        # Policy path: prefer non-cached free blocks from LRU head, then
+        # choose cached-free blocks via policy ranking.
+        selected: list[KVCacheBlock] = []
+        deferred_cached: list[KVCacheBlock] = []
+
+        while len(selected) < num_blocks:
+            # Exhausted free blocks -> impossible due to initial check
+            blk = self.free_block_queue.popleft()
+            if blk.block_hash is None:
+                selected.append(blk)
+            else:
+                # remove from policy to avoid selecting it immediately
+                if self._policy is not None:
+                    self._policy.remove_block(blk)
+                deferred_cached.append(blk)
+            if self.get_num_free_blocks() == 0 and len(selected) < num_blocks:
+                break
+
+        if len(selected) < num_blocks:
+            need = num_blocks - len(selected)
+            # Ask policy for global cached-free candidates by block_id
+            ids = self._policy.get_eviction_candidates(need)
+            for block_id in ids:
+                blk = self.blocks[block_id]
+                # Remove from free list if still present
+                if blk.prev_free_block is not None and blk.next_free_block is not None:
+                    self.free_block_queue.remove(blk)
+                # Evict hash later below
+                selected.append(blk)
+
+        # Return deferred cached blocks to the free list tail to keep queue sound
+        for blk in deferred_cached:
+            self.free_block_queue.append(blk)
+
+        # Finalize selection: evict hashes for cached blocks; inc ref_cnt
         if self.enable_caching:
-            for block in ret:
-                self._maybe_evict_cached_block(block)
-                assert block.ref_cnt == 0
-                block.ref_cnt += 1
+            for blk in selected:
+                self._maybe_evict_cached_block(blk)
+                assert blk.ref_cnt == 0
+                blk.ref_cnt += 1
         else:
-            for block in ret:
-                assert block.ref_cnt == 0
-                block.ref_cnt += 1
-        return ret
+            for blk in selected:
+                assert blk.ref_cnt == 0
+                blk.ref_cnt += 1
+        return selected
 
     def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
         """
@@ -342,7 +408,11 @@ def touch(self, blocks: tuple[Sequence[KVCacheBlock], ...]) -> None:
                 # candidate), so remove it.
                 if block.ref_cnt == 0 and not block.is_null:
                     self.free_block_queue.remove(block)
+                    if self._policy is not None:
+                        self._policy.remove_block(block)
                 block.ref_cnt += 1
+                if self._policy is not None:
+                    self._policy.on_block_access(block)
 
     def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
         """Free a list of blocks. The blocks should be ordered by their
@@ -356,9 +426,15 @@ def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
         blocks_list = list(ordered_blocks)
         for block in blocks_list:
             block.ref_cnt -= 1
-        self.free_block_queue.append_n(
-            [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null]
-        )
+        freed = [
+            block for block in blocks_list if block.ref_cnt == 0 and not block.is_null
+        ]
+        self.free_block_queue.append_n(freed)
+        if self._policy is not None:
+            for block in freed:
+                # Track only cached-free blocks
+                if block.block_hash is not None:
+                    self._policy.on_block_release(block)
 
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
@@ -390,6 +466,9 @@ def reset_prefix_cache(self) -> bool:
         if self.enable_kv_cache_events:
             self.kv_event_queue.append(AllBlocksCleared())
 
+        if self._policy is not None:
+            self._policy.reset()
+
         return True
 
     def get_num_free_blocks(self) -> int:
diff --git a/vllm/v1/core/eviction_policies/__init__.py b/vllm/v1/core/eviction_policies/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .frequency_cost import FrequencyCostEvictionPolicy
+
+__all__ = ["FrequencyCostEvictionPolicy"]
diff --git a/vllm/v1/core/eviction_policies/frequency_cost.py b/vllm/v1/core/eviction_policies/frequency_cost.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import heapq
+import math
+import time
+
+from vllm.v1.core.kv_cache_utils import KVCacheBlock
+
+
+class FrequencyCostEvictionPolicy:
+    """Min-heap of cached-free blocks; the lowest retention score pops first.
+
+    A block's score is access_count / age * block_size**alpha, optionally
+    damped by exp(-time_decay_factor * age), and is fixed at release time.
+    Only blocks with ref_cnt == 0 and a non-None block_hash are tracked.
+    Deletion is lazy: `_entry_finder` holds each block's live heap entry;
+    other heap entries are stale and are skipped on pop or purged by `_add`.
+    """
+
+    def __init__(
+        self,
+        block_size: int,
+        alpha: float = 2.0,
+        time_decay_factor: float = 0.0,
+        min_time_window: float = 1.0,
+    ) -> None:
+        self.block_size = block_size
+        self.alpha = alpha
+        self.time_decay_factor = time_decay_factor
+        self.min_time_window = min_time_window  # floor for a block's age (s)
+
+        # Heap entries: (score, counter, block_id); counter breaks score ties
+        self._heap: list[tuple[float, int, int]] = []
+        self._entry_finder: dict[int, tuple[float, int, int]] = {}
+        self._counter = 0
+
+    def _score(self, block: KVCacheBlock) -> float:
+        """Return the retention score; 0.0 if no access was ever recorded."""
+        first_ts = getattr(block, "first_access_ts", None)
+        if first_ts is None:
+            return 0.0
+        access_count = getattr(block, "access_count", 0) or 0
+        dt = max(self.min_time_window, time.monotonic() - first_ts)
+        decay = self.time_decay_factor
+        weight = math.exp(-decay * dt) if decay > 0.0 else 1.0
+        cost = float(self.block_size) ** self.alpha
+        return min(access_count * weight / dt * cost, 1e15)
+
+    def _add(self, block: KVCacheBlock) -> None:
+        """Push a fresh entry for `block`, superseding any earlier one."""
+        if block.ref_cnt != 0 or block.block_hash is None:
+            return  # only cached-free blocks are tracked
+        self._counter += 1
+        entry = (self._score(block), self._counter, block.block_id)
+        self._entry_finder[block.block_id] = entry
+        heapq.heappush(self._heap, entry)
+        # Removals leave stale entries behind; rebuild once stale > live + 64.
+        if len(self._heap) > 2 * len(self._entry_finder) + 64:
+            self._heap = list(self._entry_finder.values())
+            heapq.heapify(self._heap)
+
+    def on_block_access(self, block: KVCacheBlock) -> None:
+        """Record one access: stamp the first-access time, bump the count."""
+        if getattr(block, "first_access_ts", None) is None:
+            block.first_access_ts = time.monotonic()
+        block.access_count = (getattr(block, "access_count", 0) or 0) + 1
+
+    def on_block_release(self, block: KVCacheBlock) -> None:
+        """Start tracking a block that just became cached-free."""
+        self._add(block)
+
+    def get_eviction_candidates(self, num_blocks: int) -> list[int]:
+        """Pop up to `num_blocks` tracked block ids, lowest score first."""
+        out: list[int] = []
+        while self._heap and len(out) < num_blocks:
+            entry = heapq.heappop(self._heap)
+            # A removed or superseded block's entry no longer matches; skip it.
+            if self._entry_finder.get(entry[2]) == entry:
+                del self._entry_finder[entry[2]]
+                out.append(entry[2])
+        return out
+
+    def remove_block(self, block: KVCacheBlock) -> None:
+        """Stop tracking `block`; its heap entry is discarded lazily."""
+        self._entry_finder.pop(block.block_id, None)
+
+    def reset(self) -> None:
+        """Forget every tracked block."""
+        self._heap.clear()
+        self._entry_finder.clear()
+
+    @property
+    def name(self) -> str:
+        return f"FrequencyCost(alpha={self.alpha})"
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py