Skip to content

Commit 7eca1dc

Browse files
committed
fix: rename prefix scorer parameters and convert test to benchmark test
Signed-off-by: Kfir Toledo <[email protected]>
1 parent 60d3ea5 commit 7eca1dc

File tree

5 files changed

+39
-34
lines changed

5 files changed

+39
-34
lines changed

cmd/epp/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func loadPrefixCacheConfig() prefix.Config {
122122

123123
return prefix.Config{
124124
HashBlockSize: envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, baseLogger),
125-
MaxNumServersToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_SERVER_TO_MATCH", prefix.DefaultNumServersToMatch, baseLogger),
125+
MaxPodsPerPrefix: envutil.GetEnvInt("PREFIX_MAX_PODS_PER_PREFIX", prefix.DefaultMaxPodsPerPrefix, baseLogger),
126126
MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, baseLogger),
127127
LRUIndexerCapacity: envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, baseLogger),
128128
}

pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,23 @@ import (
2727
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
2828
)
2929

30-
// block holds an LRU cache of servers that may have a specific prefix hash.
31-
type block struct {
32-
Pods *lru.Cache[ServerID, struct{}] // Can be extended with metadata (e.g., timestamp).
30+
// podSet holds an LRU cache of servers that may have a specific prefix hash.
31+
type podSet struct {
32+
enteries *lru.Cache[ServerID, struct{}] // Can be extended with metadata (e.g., timestamp).
3333
}
3434

3535
// An indexer maintains an LRU cache of prompt prefix hashes and the server(s) that might have that
3636
// prefix cached .
3737
type indexer struct {
3838
mu sync.RWMutex
39-
cache *lru.Cache[BlockHash, *block]
39+
cache *lru.Cache[BlockHash, *podSet]
4040
maxCacheSize int
4141
maxServersToMatch int
4242
}
4343

4444
// newIndexer initializes an indexer with size limits and starts cache size reporting.
4545
func newIndexer(maxCacheSize, maxServersToMatch int) *indexer {
46-
c, err := lru.New[BlockHash, *block](maxCacheSize)
46+
c, err := lru.New[BlockHash, *podSet](maxCacheSize)
4747
if err != nil {
4848
panic(err)
4949
}
@@ -58,23 +58,23 @@ func newIndexer(maxCacheSize, maxServersToMatch int) *indexer {
5858

5959
// Add adds a list of prefix hashes to the cache, tied to the server.
6060
func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
61-
if len(hashes) == 0 || pod.Name == "" {
61+
if pod.Name == "" {
6262
return
6363
}
6464

6565
i.mu.Lock()
6666
defer i.mu.Unlock()
6767

6868
for _, hash := range hashes {
69-
b, ok := i.cache.Get(hash)
69+
p, ok := i.cache.Get(hash)
7070
if !ok {
71-
// Create block with new LRU
71+
// Create podSet with new LRU
7272
podLRU, _ := lru.New[ServerID, struct{}](i.maxServersToMatch)
73-
b = &block{Pods: podLRU}
74-
i.cache.Add(hash, b)
73+
p = &podSet{enteries: podLRU}
74+
i.cache.Add(hash, p)
7575
}
7676

77-
b.Pods.Add(pod, struct{}{})
77+
p.enteries.Add(pod, struct{}{})
7878
}
7979
}
8080

@@ -84,11 +84,11 @@ func (i *indexer) Get(hash BlockHash) map[ServerID]bool {
8484
defer i.mu.RUnlock()
8585

8686
res := map[ServerID]bool{}
87-
block, ok := i.cache.Get(hash)
87+
pods, ok := i.cache.Get(hash)
8888
if !ok {
8989
return res
9090
}
91-
for _, pod := range block.Pods.Keys() {
91+
for _, pod := range pods.enteries.Keys() {
9292
res[pod] = true
9393
}
9494
return res

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,10 @@ import (
3232

3333
const (
3434
DefaultScorerWeight = 1
35-
// Attempt to return DefaultNumServersToMatch servers with their longest prefix match length.
36-
// Why not just return the server with longest prefix match?
37-
// It may not be the optimal choice, e.g., it may have a high queue depth.
38-
// We optimistically search more than one to give more candidates for the scheduler to choose.
39-
DefaultNumServersToMatch = 16
35+
// DefaultMaxPodsPerPrefix defines the maximum number of pods (servers) to track per prefix hash in the LRU indexer.
36+
// This limits the number of recent pods associated with a given prefix to reduce memory usage
37+
// and ensure faster lookup. When the limit is reached, the least recently used pod is evicted.
38+
DefaultMaxPodsPerPrefix = 4
4039
// vLLM default token block size is 16, and a good guess of average characters per token is 4.
4140
DefaultHashBlockSize = 64
4241
// The maximum number of blocks to match. Two long requests with the same prefix up to this
@@ -64,8 +63,8 @@ type Config struct {
6463
// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
6564
// be ignored.
6665
MaxPrefixBlocksToMatch int
67-
// NumServersToMatch is the maximum number that can match per hash BlockHash.
68-
MaxNumServersToMatch int
66+
// MaxPodsPerPrefix defines the maximum number of pods (servers) to track per prefix hash in the LRU indexer.
67+
MaxPodsPerPrefix int
6968
// Max (approximate) size of the LRU indexer in number of entries.
7069
LRUIndexerCapacity int
7170
}
@@ -122,7 +121,7 @@ var _ framework.PostCycle = &Plugin{}
122121
func New(config Config) *Plugin {
123122
m := &Plugin{
124123
Config: config,
125-
indexer: newIndexer(config.LRUIndexerCapacity, config.MaxNumServersToMatch),
124+
indexer: newIndexer(config.LRUIndexerCapacity, config.MaxPodsPerPrefix),
126125
}
127126
return m
128127
}
@@ -136,7 +135,7 @@ func (m *Plugin) Name() string {
136135
func (m *Plugin) Score(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
137136
loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
138137
// pre score step, hashing prompt and find longest prefix match.
139-
hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
138+
hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPodsPerPrefix)
140139
state := &schedulingContextState{
141140
PrefixHashes: hashes,
142141
PrefixCacheServers: m.matchLongestPrefix(ctx, hashes),

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func TestPrefixPlugin(t *testing.T) {
3636
HashBlockSize: 4,
3737
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
3838
LRUIndexerCapacity: DefaultLRUIndexerCapacity,
39-
MaxNumServersToMatch: DefaultNumServersToMatch,
39+
MaxPodsPerPrefix: DefaultMaxPodsPerPrefix,
4040
}
4141
plugin := New(config)
4242

@@ -144,18 +144,20 @@ func TestPrefixPlugin(t *testing.T) {
144144
}
145145

146146
// TestPrefixPluginStress is a stress test for the prefix scoring plugin, using prompts of increasing length.
147-
func TestPrefixPluginStress(t *testing.T) {
147+
func BenchmarkPrefixPluginStress(b *testing.B) {
148148
blockSize := 4
149+
maxPrefixBlocks := 50000
149150
config := Config{
150151
HashBlockSize: blockSize,
151-
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
152+
MaxPrefixBlocksToMatch: maxPrefixBlocks,
152153
LRUIndexerCapacity: DefaultLRUIndexerCapacity,
153-
MaxNumServersToMatch: DefaultNumServersToMatch,
154+
MaxPodsPerPrefix: DefaultMaxPodsPerPrefix,
154155
}
155156

156157
plugin := New(config)
157158
types.NewCycleState()
158-
for i := 0; i < 1000; i++ {
159+
promptLen := []int{10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000}
160+
for _, i := range promptLen {
159161
// Generate increasing-length random prompts
160162
prompt := randomPrompt(4 + i)
161163
pod := &types.PodMetrics{
@@ -179,9 +181,9 @@ func TestPrefixPluginStress(t *testing.T) {
179181

180182
// Second cycle: validate internal state
181183
state, err := plugin.getPrefixState(cycleState)
182-
assert.NoError(t, err)
183-
expectedHashes := int(math.Min(DefaultMaxPrefixBlocks+1, float64(len(req.Prompt)/blockSize+1))) // the extra one is for the model.
184-
assert.Equal(t, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect")
184+
assert.NoError(b, err)
185+
expectedHashes := int(math.Min(float64(maxPrefixBlocks+1), float64(len(req.Prompt)/blockSize+1))) // the extra one is for the model.
186+
assert.Equal(b, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect")
185187
}
186188
}
187189

site-src/guides/epp-configuration/prefix-aware.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ The [prefix cache plugin](https://github.com/kubernetes-sigs/gateway-api-inferen
44
takes advantage of the prefix caching (e.g., [vllm APC](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching.html))
55
feature of model servers, and optimizes request scheduling by placing requests sharing the longest
66
prefixes to the same server as much as possible, while balancing the server load by considering kv-cache
7-
and queue depth.
7+
and queue depth.
88

99
## Enable the prefix cache plugin
1010

@@ -34,14 +34,18 @@ for performance.
3434

3535
* `PREFIX_CACHE_LRU_CAPACITY`: Maximum capacity the prefix LRU indexer in number of block hashes. Below
3636
shows a detailed analysis on how to estimate this.
37+
* `PREFIX_MAX_PODS_PER_PREFIX`: Defines the maximum number of pods (servers) tracked per prefix hash in the internal LRU cache.
38+
This setting helps optimize memory usage by retaining only the hottest (most recently active) pods for each prefix.
39+
When the limit is reached, older pods are evicted based on least-recently-used (LRU) order.
40+
3741

3842
The prefix cache plugin estimates the prefix cache indexes in model server HBMs. In the perfect
3943
scenario, EPP has the exact same prefix cache entries per model server as their HBM cache entries. If
4044
the EPP cache is smaller than HBM cache, a positive EPP cache match is more accurate, but there are more
4145
false cache misses. If the EPP cache is larger than the HBM cache, then there are more false cache hits.
4246
Therefore **the EPP prefix cache indexer size should be as close as possible to the HBM cache size.**
4347

44-
NOTE: EPP builds prefix cache based on characters, while model server maintains prefix cache entries
48+
NOTE: EPP builds prefix cache based on characters, while model server maintains prefix cache entries
4549
in tokens, a conversion between character <-> token is needed.
4650

4751
Below are the formulas to estimate the EPP prefix indexer size:
@@ -63,7 +67,7 @@ shows a detailed analysis on how to estimate this.
6367
max_kv_tokens_per_server = (80GB - 16GB) / 128KB = 500,000
6468
# assume avg_chars_per_token = 4, prefix_indexer_hash_block_size = 64 (default)
6569
# each entry is about 358KB, so the memory footprint is about 11 MB per server
66-
lru_indexer_capacity_per_server = 500,000*4/64 = 31250
70+
lru_indexer_capacity_per_server = 500,000*4/64 = 31250
6771
lru_indexer_capacity_total = 3 * 31250 = 93750
6872
```
6973

0 commit comments

Comments
 (0)