Skip to content

Commit 7eca1dc

Browse files
committed
fix: rename prefix scorer parameters and convert test to benchmark test
Signed-off-by: Kfir Toledo <[email protected]>
1 parent 60d3ea5 commit 7eca1dc

File tree

5 files changed

+39
-34
lines changed

5 files changed

+39
-34
lines changed

cmd/epp/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func loadPrefixCacheConfig() prefix.Config {
122122

123123
return prefix.Config{
124124
HashBlockSize: envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, baseLogger),
125-
MaxNumServersToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_SERVER_TO_MATCH", prefix.DefaultNumServersToMatch, baseLogger),
125+
MaxPodsPerPrefix: envutil.GetEnvInt("PREFIX_MAX_PODS_PER_PREFIX", prefix.DefaultMaxPodsPerPrefix, baseLogger),
126126
MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, baseLogger),
127127
LRUIndexerCapacity: envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, baseLogger),
128128
}

pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,23 @@ import (
2727
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
2828
)
2929

30-
// block holds an LRU cache of servers that may have a specific prefix hash.
31-
type block struct {
32-
Pods *lru.Cache[ServerID, struct{}] // Can be extended with metadata (e.g., timestamp).
30+
// podSet holds an LRU cache of servers that may have a specific prefix hash.
31+
type podSet struct {
32+
enteries *lru.Cache[ServerID, struct{}] // Can be extended with metadata (e.g., timestamp).
3333
}
3434

3535
// An indexer maintains an LRU cache of prompt prefix hashes and the server(s) that might have that
3636
// prefix cached .
3737
type indexer struct {
3838
mu sync.RWMutex
39-
cache *lru.Cache[BlockHash, *block]
39+
cache *lru.Cache[BlockHash, *podSet]
4040
maxCacheSize int
4141
maxServersToMatch int
4242
}
4343

4444
// newIndexer initializes an indexer with size limits and starts cache size reporting.
4545
func newIndexer(maxCacheSize, maxServersToMatch int) *indexer {
46-
c, err := lru.New[BlockHash, *block](maxCacheSize)
46+
c, err := lru.New[BlockHash, *podSet](maxCacheSize)
4747
if err != nil {
4848
panic(err)
4949
}
@@ -58,23 +58,23 @@ func newIndexer(maxCacheSize, maxServersToMatch int) *indexer {
5858

5959
// Add adds a list of prefix hashes to the cache, tied to the server.
6060
func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
61-
if len(hashes) == 0 || pod.Name == "" {
61+
if pod.Name == "" {
6262
return
6363
}
6464

6565
i.mu.Lock()
6666
defer i.mu.Unlock()
6767

6868
for _, hash := range hashes {
69-
b, ok := i.cache.Get(hash)
69+
p, ok := i.cache.Get(hash)
7070
if !ok {
71-
// Create block with new LRU
71+
// Create podSet with new LRU
7272
podLRU, _ := lru.New[ServerID, struct{}](i.maxServersToMatch)
73-
b = &block{Pods: podLRU}
74-
i.cache.Add(hash, b)
73+
p = &podSet{enteries: podLRU}
74+
i.cache.Add(hash, p)
7575
}
7676

77-
b.Pods.Add(pod, struct{}{})
77+
p.enteries.Add(pod, struct{}{})
7878
}
7979
}
8080

@@ -84,11 +84,11 @@ func (i *indexer) Get(hash BlockHash) map[ServerID]bool {
8484
defer i.mu.RUnlock()
8585

8686
res := map[ServerID]bool{}
87-
block, ok := i.cache.Get(hash)
87+
pods, ok := i.cache.Get(hash)
8888
if !ok {
8989
return res
9090
}
91-
for _, pod := range block.Pods.Keys() {
91+
for _, pod := range pods.enteries.Keys() {
9292
res[pod] = true
9393
}
9494
return res

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,10 @@ import (
3232

3333
const (
3434
DefaultScorerWeight = 1
35-
// Attempt to return DefaultNumServersToMatch servers with their longest prefix match length.
36-
// Why not just return the server with longest prefix match?
37-
// It may not be the optimal choice, e.g., it may have a high queue depth.
38-
// We optimistically search more than one to give more candidates for the scheduler to choose.
39-
DefaultNumServersToMatch = 16
35+
// DefaultMaxPodsPerPrefix defines the maximum number of pods (servers) to track per prefix hash in the LRU indexer.
36+
// This limits the number of recent pods associated with a given prefix to reduce memory usage
37+
// and ensure faster lookup. When the limit is reached, the least recently used pod is evicted.
38+
DefaultMaxPodsPerPrefix = 4
4039
// vLLM default token block size is 16, and a good guess of average characters per token is 4.
4140
DefaultHashBlockSize = 64
4241
// The maximum number of blocks to match. Two long requests with the same prefix up to this
@@ -64,8 +63,8 @@ type Config struct {
6463
// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
6564
// be ignored.
6665
MaxPrefixBlocksToMatch int
67-
// NumServersToMatch is the maximum number that can match per hash BlockHash.
68-
MaxNumServersToMatch int
66+
// MaxPodsPerPrefix defines the maximum number of pods (servers) to track per prefix hash in the LRU indexer.
67+
MaxPodsPerPrefix int
6968
// Max (approximate) size of the LRU indexer in number of entries.
7069
LRUIndexerCapacity int
7170
}
@@ -122,7 +121,7 @@ var _ framework.PostCycle = &Plugin{}
122121
func New(config Config) *Plugin {
123122
m := &Plugin{
124123
Config: config,
125-
indexer: newIndexer(config.LRUIndexerCapacity, config.MaxNumServersToMatch),
124+
indexer: newIndexer(config.LRUIndexerCapacity, config.MaxPodsPerPrefix),
126125
}
127126
return m
128127
}
@@ -136,7 +135,7 @@ func (m *Plugin) Name() string {
136135
func (m *Plugin) Score(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
137136
loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
138137
// pre score step, hashing prompt and find longest prefix match.
139-
hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
138+
hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPodsPerPrefix)
140139
state := &schedulingContextState{
141140
PrefixHashes: hashes,
142141
PrefixCacheServers: m.matchLongestPrefix(ctx, hashes),

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func TestPrefixPlugin(t *testing.T) {
3636
HashBlockSize: 4,
3737
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
3838
LRUIndexerCapacity: DefaultLRUIndexerCapacity,
39-
MaxNumServersToMatch: DefaultNumServersToMatch,
39+
MaxPodsPerPrefix: DefaultMaxPodsPerPrefix,
4040
}
4141
plugin := New(config)
4242

@@ -144,18 +144,20 @@ func TestPrefixPlugin(t *testing.T) {
144144
}
145145

146146
// TestPrefixPluginStress is a stress test for the prefix scoring plugin, using prompts of increasing length.
147-
func TestPrefixPluginStress(t *testing.T) {
147+
func BenchmarkPrefixPluginStress(b *testing.B) {
148148
blockSize := 4
149+
maxPrefixBlocks := 50000
149150
config := Config{
150151
HashBlockSize: blockSize,
151-
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
152+
MaxPrefixBlocksToMatch: maxPrefixBlocks,
152153
LRUIndexerCapacity: DefaultLRUIndexerCapacity,
153-
MaxNumServersToMatch: DefaultNumServersToMatch,
154+
MaxPodsPerPrefix: DefaultMaxPodsPerPrefix,
154155
}
155156

156157
plugin := New(config)
157158
types.NewCycleState()
158-
for i := 0; i < 1000; i++ {
159+
promptLen := []int{10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000}
160+
for _, i := range promptLen {
159161
// Generate increasing-length random prompts
160162
prompt := randomPrompt(4 + i)
161163
pod := &types.PodMetrics{
@@ -179,9 +181,9 @@ func TestPrefixPluginStress(t *testing.T) {
179181

180182
// Second cycle: validate internal state
181183
state, err := plugin.getPrefixState(cycleState)
182-
assert.NoError(t, err)
183-
expectedHashes := int(math.Min(DefaultMaxPrefixBlocks+1, float64(len(req.Prompt)/blockSize+1))) // the extra one is for the model.
184-
assert.Equal(t, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect")
184+
assert.NoError(b, err)
185+
expectedHashes := int(math.Min(float64(maxPrefixBlocks+1), float64(len(req.Prompt)/blockSize+1))) // the extra one is for the model.
186+
assert.Equal(b, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect")
185187
}
186188
}
187189

site-src/guides/epp-configuration/prefix-aware.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ The [prefix cache plugin](https://github.com/kubernetes-sigs/gateway-api-inferen
44
takes advantage of the prefix caching (e.g., [vllm APC](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching.html))
55
feature of model servers, and optimizes request scheduling by placing requests sharing the longest
66
prefixes to the same server as much as possible, while balancing the server load by considering kv-cache
7-
and queue depth.
7+
and queue depth.
88

99
## Enable the prefix cache plugin
1010

@@ -34,14 +34,18 @@ for performance.
3434

3535
* `PREFIX_CACHE_LRU_CAPACITY`: Maximum capacity the prefix LRU indexer in number of block hashes. Below
3636
shows a detailed analysis on how to estimate this.
37+
* `PREFIX_MAX_PODS_PER_PREFIX`: Defines the maximum number of pods (servers) tracked per prefix hash in the internal LRU cache.
38+
This setting helps optimize memory usage by retaining only the hottest (most recently active) pods for each prefix.
39+
When the limit is reached, older pods are evicted based on least-recently-used (LRU) order.
40+
3741

3842
The prefix cache plugin estimates the prefix cache indexes in model server HBMs. In the perfect
3943
scenario, EPP has the exact same prefix cache entries per model server as their HBM cache entries. If
4044
the EPP cache is smaller than HBM cache, a positive EPP cache match is more accurate, but there are more
4145
false cache misses. If the EPP cache is larger than the HBM cache, then there are more false cache hits.
4246
Therefore **the EPP prefix cache indexer size should be as close as possible to the HBM cache size.**
4347

44-
NOTE: EPP builds prefix cache based on characters, while model server maintains prefix cache entries
48+
NOTE: EPP builds prefix cache based on characters, while model server maintains prefix cache entries
4549
in tokens, a conversion between character <-> token is needed.
4650

4751
Below are the formulas to estimate the EPP prefix indexer size:
@@ -63,7 +67,7 @@ shows a detailed analysis on how to estimate this.
6367
max_kv_tokens_per_server = (80GB - 16GB) / 128KB = 500,000
6468
# assume avg_chars_per_token = 4, prefix_indexer_hash_block_size = 64 (default)
6569
# each entry is about 358KB, so the memory footprint is about 11 MB per server
66-
lru_indexer_capacity_per_server = 500,000*4/64 = 31250
70+
lru_indexer_capacity_per_server = 500,000*4/64 = 31250
6771
lru_indexer_capacity_total = 3 * 31250 = 93750
6872
```
6973

0 commit comments

Comments
 (0)