Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions cmd/epp/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ func (r *Runner) Run(ctx context.Context) error {
runtime.SetBlockProfileRate(1)
}

err = r.parsePluginsConfiguration(ctx)
err = r.parsePluginsConfiguration(ctx, datastore)
if err != nil {
setupLog.Error(err, "Failed to parse plugins configuration")
return err
Expand Down Expand Up @@ -310,7 +310,7 @@ func (r *Runner) registerInTreePlugins() {
plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory)
}

func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
func (r *Runner) parsePluginsConfiguration(ctx context.Context, ds datastore.Datastore) error {
if *configText == "" && *configFile == "" {
return nil // configuring through code, not through file
}
Expand All @@ -329,8 +329,9 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
}

r.registerInTreePlugins()
handle := plugins.NewEppHandle(ctx)
handle := plugins.NewEppHandle(ctx, ds.PodList)
config, err := loader.LoadConfig(configBytes, handle, logger)

if err != nil {
return fmt.Errorf("failed to load the configuration - %w", err)
}
Expand Down
17 changes: 16 additions & 1 deletion pkg/epp/plugins/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package plugins
import (
"context"
"fmt"

backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
)

// Handle provides plugins a set of standard data and tools to work with
Expand All @@ -27,6 +29,9 @@ type Handle interface {
Context() context.Context

HandlePlugins

// PodList lists pods matching the given predicate.
PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics
}

// HandlePlugins defines a set of APIs to work with instantiated plugins
Expand All @@ -44,10 +49,14 @@ type HandlePlugins interface {
GetAllPluginsWithNames() map[string]Plugin
}

// PodListFunc is a function type that filters and returns a list of pod metrics.
// It takes a predicate over backendmetrics.PodMetrics; NewEppHandle accepts a
// PodListFunc and eppHandle.PodList forwards its predicate to it.
type PodListFunc func(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics

// eppHandle is an implementation of the interface plugins.Handle
type eppHandle struct {
// ctx is the context supplied to NewEppHandle.
ctx context.Context
// HandlePlugins is embedded, so its methods are promoted onto eppHandle.
HandlePlugins
// podList is the function that PodList delegates to; it is set by NewEppHandle.
podList PodListFunc
}

// Context returns a context the plugins can use, if they need one
Expand Down Expand Up @@ -84,12 +93,18 @@ func (h *eppHandlePlugins) GetAllPluginsWithNames() map[string]Plugin {
return h.plugins
}

func NewEppHandle(ctx context.Context) Handle {
// PodList returns the pod metrics selected by predicate. The call is forwarded
// unchanged to the PodListFunc stored on the handle.
func (h *eppHandle) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics {
	list := h.podList
	return list(predicate)
}

// NewEppHandle builds a Handle that carries ctx, starts with an empty plugin
// registry, and serves PodList calls through podList.
func NewEppHandle(ctx context.Context, podList PodListFunc) Handle {
	registry := &eppHandlePlugins{plugins: make(map[string]Plugin)}

	h := &eppHandle{}
	h.ctx = ctx
	h.HandlePlugins = registry
	h.podList = podList
	return h
}

Expand Down
32 changes: 32 additions & 0 deletions pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,35 @@ func (i *indexer) reportLRUSize(ctx context.Context, interval time.Duration) {
i.mu.RUnlock()
}
}

// RemovePod removes a pod and its associated entries from the indexer.
// It returns immediately when the pod has no entry in podToLRU.
func (i *indexer) RemovePod(pod ServerID) {
// Only the map lookup happens under the read lock; the lock is released
// before any entry is removed from the pod's LRU.
// NOTE(review): presumably the LRU eviction callback acquires i.mu itself,
// which would be why the lock is not held across Remove — confirm.
i.mu.RLock()
lruCache, exists := i.podToLRU[pod]
i.mu.RUnlock()

if !exists {
return
}

// Remove all hashes associated with the pod from hashToPods (triggers eviction callbacks).
for _, hash := range lruCache.Keys() {
lruCache.Remove(hash)
}

// Finally drop the pod's LRU itself, under the write lock.
// NOTE(review): hashes added to this LRU by a concurrent Add after Keys() was
// called above are not removed by the loop — confirm whether that can happen.
i.mu.Lock()
delete(i.podToLRU, pod)
i.mu.Unlock()
}

// Pods returns a snapshot of every pod that currently has an entry in
// podToLRU. The map is read under the read lock; the order of the returned
// slice follows map iteration and is therefore unspecified.
func (i *indexer) Pods() []ServerID {
	i.mu.RLock()
	defer i.mu.RUnlock()

	tracked := make([]ServerID, len(i.podToLRU))
	next := 0
	for id := range i.podToLRU {
		tracked[next] = id
		next++
	}
	return tracked
}
60 changes: 60 additions & 0 deletions pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,63 @@ func TestIndexer_AddAndGet(t *testing.T) {
servers = i.Get(BlockHash(4))
assert.Empty(t, servers, "Cache should not contain non-existent hash")
}

// TestIndexer_RemovePodAndEviction verifies that both LRU eviction and
// RemovePod keep hashToPods consistent with the per-pod LRU caches.
func TestIndexer_RemovePodAndEviction(t *testing.T) {
	const capacity = 10

	idx := newIndexer(context.Background(), capacity)

	first := ServerID{Namespace: "default", Name: "server1"}
	second := ServerID{Namespace: "default", Name: "server2"}

	// Seed both servers with the same `capacity` hashes, one hash per Add call.
	seeded := make([]BlockHash, 0, capacity)
	for n := 0; n < capacity; n++ {
		seeded = append(seeded, BlockHash(n))
	}
	for _, h := range seeded {
		idx.Add([]BlockHash{h}, first)
		idx.Add([]BlockHash{h}, second)
	}

	// Both per-pod LRUs must now be full.
	assert.Equal(t, capacity, idx.podToLRU[first].Len(), "server1 should have 10 entries")
	assert.Equal(t, capacity, idx.podToLRU[second].Len(), "server2 should have 10 entries")

	// Every seeded hash must point at both servers.
	for _, h := range seeded {
		owners := idx.hashToPods[h]
		assert.Len(t, owners, 2, "Each hash should be associated with exactly 2 pods")
		assert.Contains(t, owners, first, "hash should be associated with server1")
		assert.Contains(t, owners, second, "hash should be associated with server2")
	}

	// One more hash on server1 overflows its LRU and pushes out BlockHash(0).
	oldest := BlockHash(0)
	overflow := BlockHash(capacity)
	idx.Add([]BlockHash{overflow}, first)

	// The LRU stays capped at its maximum size.
	assert.Equal(t, capacity, idx.podToLRU[first].Len(), "server1 LRU should maintain max size")

	// The evicted hash is now owned by server2 only.
	owners := idx.Get(oldest)
	assert.NotContains(t, owners, first, "server1 should be evicted from hashToPods for hash 0")
	assert.Contains(t, owners, second, "server2 should still have hash 0")

	// Drop server2 entirely.
	idx.RemovePod(second)

	// Nobody owns the evicted hash any more.
	owners = idx.Get(oldest)
	assert.NotContains(t, owners, second, "server2 should be removed from hash 0")
	assert.Empty(t, owners, "hash 0 should have no pods after both eviction and removal")

	// Whatever is left in hashToPods belongs to server1 alone.
	for h, remaining := range idx.hashToPods {
		assert.Len(t, remaining, 1, "hash %v should have only 1 pod after server2 removal", h)
		assert.Contains(t, remaining, first, "hash %v should only contain server1", h)
	}

	// hashToPods must hold exactly `capacity` hashes after eviction and removal.
	assert.Len(t, idx.hashToPods, capacity, "hashToPods should contain %d hashes after cleanup", capacity)
}
39 changes: 38 additions & 1 deletion pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ import (
"encoding/binary"
"encoding/json"
"fmt"
"time"

"github.com/cespare/xxhash/v2"
k8stypes "k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/log"

backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
Expand Down Expand Up @@ -56,6 +58,10 @@ const (
PrefixCachePluginType = "prefix-cache-scorer"
)

const (
// PodActiveCheckInterval is the period of the ticker used by
// CleanUpInactivePods, i.e. how often pods tracked by the indexer are
// compared against the pods returned by the handle's PodList.
PodActiveCheckInterval = 2 * time.Minute
)

var DefaultConfig = Config{
HashBlockSize: DefaultHashBlockSize,
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
Expand Down Expand Up @@ -86,6 +92,8 @@ type podSet map[ServerID]struct{}
type Indexer interface {
// Get returns the set of servers associated with hash.
Get(hash BlockHash) podSet
// Add associates each of hashes with server.
Add(hashes []BlockHash, server ServerID)
// RemovePod removes server and its associated entries from the index.
RemovePod(server ServerID)
// Pods returns the servers currently tracked by the index.
Pods() []ServerID
}

// BlockHash is a hash of the block of request body.
Expand Down Expand Up @@ -140,7 +148,9 @@ func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, handle
}
}

return New(handle.Context(), parameters).WithName(name), nil
p := New(handle.Context(), parameters).WithName(name)
go p.CleanUpInactivePods(handle.Context(), handle)
return p, nil
}

// New initializes a new prefix Plugin and returns its pointer.
Expand Down Expand Up @@ -246,6 +256,33 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map
return res
}

// CleanUpInactivePods periodically removes from the prefix indexer every pod
// that is no longer reported by handle.PodList.
//
// It does not start a goroutine itself: the call blocks until ctx is
// cancelled, so callers are expected to invoke it with `go` (as
// PrefixCachePluginFactory does). On every PodActiveCheckInterval tick it
// lists all pods through handle.PodList using an accept-everything predicate
// and calls RemovePod on the indexer for each tracked pod missing from that
// list.
func (p *Plugin) CleanUpInactivePods(ctx context.Context, handle plugins.Handle) {
	logger := log.FromContext(ctx).V(logutil.VERBOSE)
	ticker := time.NewTicker(PodActiveCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// Build the set of currently active pods, keyed the same way the
			// indexer keys its entries (by namespaced name).
			activePodMetrics := handle.PodList(func(_ backendmetrics.PodMetrics) bool { return true })
			activePods := make(map[ServerID]struct{}, len(activePodMetrics))
			for _, pm := range activePodMetrics {
				activePods[ServerID(pm.GetPod().NamespacedName)] = struct{}{}
			}

			// Drop every indexed pod that is absent from the active set.
			for _, pod := range p.indexer.Pods() {
				if _, ok := activePods[pod]; !ok {
					p.indexer.RemovePod(pod)
					logger.Info("Removed pod not in active set", "pod", pod)
				}
			}
		}
	}
}

// hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
// hash(0) is the hash of the model name, since different models generally don't share prefix cache.
// For block i, hash(i) = hash(block i content, hash(i-1)).
Expand Down
5 changes: 5 additions & 0 deletions test/utils/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package utils
import (
"context"

backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
)

Expand All @@ -33,6 +34,10 @@ func (h *testHandle) Context() context.Context {
return h.ctx
}

// PodList ignores predicate and always returns an empty, non-nil slice.
func (h *testHandle) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics {
	return make([]backendmetrics.PodMetrics, 0)
}

// testHandlePlugins holds plugins in a map keyed by string.
// NOTE(review): presumably the key is the plugin name — confirm against the
// methods defined on this type.
type testHandlePlugins struct {
plugins map[string]plugins.Plugin
}
Expand Down