Skip to content
This repository was archived by the owner on May 15, 2025. It is now read-only.

[DO NOT MERGE] Renamed Scheduler APIs to better reflect their usage. #116

Closed
wants to merge 44 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
b96946e
fix lint
clubanderson May 1, 2025
476cbab
fix lint
clubanderson May 1, 2025
847daf8
fix lint
clubanderson May 1, 2025
288589c
fix lint
clubanderson May 1, 2025
5c364e2
fix lint
clubanderson May 1, 2025
d835ccb
fix lint
clubanderson May 1, 2025
4818a31
fix lint
clubanderson May 1, 2025
ed2db83
fix lint
clubanderson May 1, 2025
869f7cd
fix lint
clubanderson May 1, 2025
91a0f85
fix lint
clubanderson May 1, 2025
424d5b4
fix lint
clubanderson May 1, 2025
27a62bb
fix lint
clubanderson May 1, 2025
d88c441
fix lint
clubanderson May 1, 2025
c56372c
fix lint
clubanderson May 1, 2025
1e5466a
fix lint
clubanderson May 1, 2025
6f36dfd
fix lint
clubanderson May 1, 2025
3a95881
fix lint
clubanderson May 1, 2025
ce547eb
fix lint
clubanderson May 1, 2025
39dd257
fix lint
clubanderson May 1, 2025
5e6f5b6
fix lint
clubanderson May 1, 2025
d62d457
fix
clubanderson May 1, 2025
508fd29
fix
clubanderson May 1, 2025
06f6d26
fix
clubanderson May 1, 2025
19617ad
fix
clubanderson May 1, 2025
255beb5
fix
clubanderson May 1, 2025
f9e6530
add comments to working golang file
clubanderson May 1, 2025
e2f398a
Provide a way to enable the PDFilter
lionelvillard May 1, 2025
01c043e
update readme
lionelvillard May 1, 2025
58f3213
Merge pull request #104 from neuralmagic/enable-pd
vMaroon May 1, 2025
a1d7254
add log lines
lionelvillard May 1, 2025
c2d68de
Update pod labels to match ModelService
jgchn May 2, 2025
2f6a763
Merge pull request #106 from jgchn/label
vMaroon May 2, 2025
867b18c
address review comments
lionelvillard May 2, 2025
ca12b30
Merge pull request #105 from neuralmagic/debug-pd
vMaroon May 2, 2025
e0eee4c
fix build:
vMaroon May 2, 2025
466e773
Fixed scorer tests
shmuelk May 4, 2025
49b6afa
Added PostResponse to scheduler config
shmuelk May 4, 2025
3e8284c
Use an init() function instead of modifying the scheduler code to inj…
shmuelk May 4, 2025
32e43b1
Added code to scheduler to enable running the PostResponse plugins
shmuelk May 4, 2025
4655be4
Invoke the PostResponse handlers and send any added headers to the user
shmuelk May 4, 2025
6fffe9e
Added a simple unit test for the PostResponse plugin invocation
shmuelk May 4, 2025
220915f
Merge pull request #113 from shmuelk/post-response
shmuelk May 4, 2025
25cb42a
Renamed scheduler.Schedule to scheduler.OnRequest to better indicate …
shmuelk May 4, 2025
4ef6b62
Renamed scheduler.RunPostResponsePlugins to be scheduler.OnResponse t…
shmuelk May 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ go.work.sum

# generated docs
site

# tokenizer lib
lib

# local configuration files
.envrc
12 changes: 3 additions & 9 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
run:
timeout: 5m
allow-parallel-runners: true
skip-files:
- "pkg/epp/server/runserver_test.go"

# Settings related to issues
issues:
Expand All @@ -21,7 +19,7 @@ linters:
- fatcontext
- ginkgolinter
- gocritic
- govet
# - govet # do not enable - this causes some metalinter issue
- loggercheck
- misspell
- perfsprint
Expand All @@ -30,17 +28,13 @@ linters:
- makezero
- errcheck
- goconst
- gofmt
- goimports
- gosimple
- ineffassign
- nakedret
- prealloc
- typecheck
- unparam
- unused

linters-settings:
revive:
rules:
- name: comment-spacings
- name: comment-spacings
15 changes: 13 additions & 2 deletions .tekton/buildah-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,15 @@ spec:
USERNAME=$(jq -r '.auths["quay.io"].username' /root/.docker/config.json)
PASSWORD=$(jq -r '.auths["quay.io"].password' /root/.docker/config.json)

echo "🔐 Extracting Git credentials from workspace..."
GIT_USER=$(cat /workspace/git-auth/username)
GIT_TOKEN=$(cat /workspace/git-auth/token)

if [ -z "$GIT_USER" ] || [ -z "$GIT_TOKEN" ]; then
echo "❌ Error: Missing git-auth credentials"
exit 1
fi

if [ "$USERNAME" = "null" ] || [ "$PASSWORD" = "null" ]; then
echo "❌ Error: Missing registry credentials"
exit 1
Expand All @@ -56,8 +65,10 @@ spec:
export DOCKER_CONFIG=/root/.docker
export BUILDER=buildah
export IMG=$(params.image_tag_base):$(params.dev-version)

export GIT_NM_USER=$GIT_USER
export NM_TOKEN=$GIT_TOKEN

echo "🚀 Calling make buildah-build with IMG=$IMG..."
make buildah-build IMG=$IMG
make buildah-build IMG=$IMG

echo "$IMG" > /tekton/results/image-url
19 changes: 19 additions & 0 deletions .tekton/go-build-task.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,24 @@ spec:
script: |
#!/bin/bash
cd $(workspaces.source.path)

echo "🔐 Extracting Git credentials from workspace..."
GIT_USER=$(cat /workspace/git-auth/username)
GIT_TOKEN=$(cat /workspace/git-auth/token)

if [ -z "$GIT_USER" ] || [ -z "$GIT_TOKEN" ]; then
echo "❌ Error: Missing git-auth credentials"
exit 1
fi

echo "🔐 Configuring Git..."
git config --global user.email "[email protected]"
git config --global user.name "ci-tag-bot"
git config --global url."https://${GIT_USER}:${GIT_TOKEN}@github.com".insteadOf "https://github.com"
git config --global --add safe.directory "$(pwd)"

# required for go build with tokenizer lib linking
dnf install -y gcc-c++ libstdc++ libstdc++-devel && dnf clean all

go env -w GOFLAGS=-buildvcs=false
make build
1 change: 1 addition & 0 deletions .tekton/go-lint-task.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ spec:
steps:
- name: run-lint
image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v1.64.8
# image: us.icr.io/ibm-hc4ai-operator/golangci-lint:v2.0.3
imagePullPolicy: IfNotPresent
script: |
#!/bin/bash
Expand Down
5 changes: 5 additions & 0 deletions .tekton/pipelinerun.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ spec:
workspaces:
- name: source
workspace: source
- name: git-auth
workspace: git-auth


- name: extract-version-and-registry
params:
Expand Down Expand Up @@ -328,6 +331,8 @@ spec:
workspace: registry-secret
- name: container-storage
workspace: container-storage
- name: git-auth
workspace: git-auth

- name: vulnerability-scan
when:
Expand Down
27 changes: 23 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -439,11 +439,20 @@ lint: check-golangci-lint ## Run lint
golangci-lint run

##@ Build
LDFLAGS ?= -extldflags '-L$(shell pwd)/lib'
CGO_ENABLED=1 # Enable CGO

.PHONY: download-tokenizer
download-tokenizer: ## Download the HuggingFace tokenizer bindings.
@echo "Downloading HuggingFace tokenizer bindings..."
mkdir -p lib
curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib
ranlib lib/*.a

.PHONY: build
build: check-go ##
build: check-go download-tokenizer ##
@printf "\033[33;1m==== Building ====\033[0m\n"
go build -o bin/epp cmd/epp/main.go cmd/epp/health.go
go build -ldflags="$(LDFLAGS)" -o bin/epp cmd/epp/main.go cmd/epp/health.go

##@ Container Build/Push

Expand All @@ -456,7 +465,12 @@ buildah-build: check-builder load-version-json ## Build and push image (multi-ar
for arch in amd64; do \
ARCH_TAG=$$FINAL_TAG-$$arch; \
echo "📦 Building for architecture: $$arch"; \
buildah build --arch=$$arch --os=linux --layers -t $(IMG)-$$arch . || exit 1; \
buildah build \
--arch=$$arch \
--build-arg GIT_NM_USER=$(GIT_NM_USER) \
--build-arg NM_TOKEN=$(NM_TOKEN) \
--os=linux \
--layers -t $(IMG)-$$arch . || exit 1; \
echo "🚀 Pushing image: $(IMG)-$$arch"; \
buildah push $(IMG)-$$arch docker://$(IMG)-$$arch || exit 1; \
done; \
Expand All @@ -474,7 +488,11 @@ buildah-build: check-builder load-version-json ## Build and push image (multi-ar
sed -e '1 s/\(^FROM\)/FROM --platform=$${BUILDPLATFORM}/' Dockerfile > Dockerfile.cross; \
- docker buildx create --use --name image-builder || true; \
docker buildx use image-builder; \
docker buildx build --push --platform=$(PLATFORMS) --tag $(IMG) -f Dockerfile.cross . || exit 1; \
docker buildx build --push \
--platform=$(PLATFORMS) \
--build-arg GIT_NM_USER=$(GIT_NM_USER)\
--build-arg NM_TOKEN=$(NM_TOKEN) \
--tag $(IMG) -f Dockerfile.cross . || exit 1; \
docker buildx rm image-builder || true; \
rm Dockerfile.cross; \
elif [ "$(BUILDER)" = "podman" ]; then \
Expand All @@ -494,6 +512,7 @@ image-build: check-container-tool load-version-json ## Build container image usi
--build-arg TARGETARCH=$(TARGETARCH) \
--build-arg GIT_NM_USER=$(GIT_NM_USER)\
--build-arg NM_TOKEN=$(NM_TOKEN) \
--progress=plain \
-t $(IMG) .

.PHONY: image-push
Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ To enable LoadAwareScorer, the following env vars must be configured:
export ENABLE_LOAD_AWARE_SCORER=true
export LOAD_AWARE_SCORER_WEIGHT=1.0
```

To enable PDFilter, the following env var must be configured:
```
export ENABLE_PD_FILTER=true
```
---
[Inference Gateways]:#concepts-and-definitions

Expand Down Expand Up @@ -96,8 +101,8 @@ See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for deta
## Roadmap

As Inference Gateway builds towards a GA release. We will continue to expand our capabilities, namely:
1. Prefix-cache aware load balancing with interfaces for remote caches
1. Recommended LoRA adapter pipeline for automated rollout
1. Prefix-cache aware load balancing with interfaces for remote caches
1. Recommended LoRA adapter pipeline for automated rollout
1. Fairness and priority between workloads within the same criticality band
1. HPA support for autoscaling on aggregate metrics derived from the load balancer
1. Support for large multi-modal inputs and outputs
Expand All @@ -121,4 +126,3 @@ Contributions are readily welcomed, follow the [dev guide](./docs/dev.md) to sta
### Code of conduct

Participation in the Kubernetes community is governed by the [Kubernetes Code of Conduct](code-of-conduct.md).

2 changes: 1 addition & 1 deletion pkg/epp/backend/metrics/pod_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (

const (
fetchMetricsTimeout = 5 * time.Second
roleLabel = "llmd.org/role"
roleLabel = "llm-d.ai/role"
rolePrefill = "prefill"
roleDecode = "decode"
roleBoth = "both"
Expand Down
2 changes: 1 addition & 1 deletion pkg/epp/handlers/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func (s *StreamingServer) HandleRequestBody(
return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)}
}

res, err := s.scheduler.Schedule(ctx, llmReq)
res, err := s.scheduler.OnRequest(ctx, llmReq)
if err != nil {
return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
}
Expand Down
61 changes: 45 additions & 16 deletions pkg/epp/handlers/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
Expand Down Expand Up @@ -65,7 +66,8 @@ type StreamingServer struct {
}

type Scheduler interface {
Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (result *schedulingtypes.Result, err error)
OnRequest(ctx context.Context, b *schedulingtypes.LLMRequest) (result *schedulingtypes.Result, err error)
OnResponse(ctx context.Context, req *types.LLMRequest, tragetPodName string) (*schedulingtypes.Result, error)
}

// RequestContext stores context information during the life time of an HTTP request.
Expand Down Expand Up @@ -189,6 +191,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
case *extProcPb.ProcessingRequest_RequestTrailers:
// This is currently unused.
case *extProcPb.ProcessingRequest_ResponseHeaders:
responseHeaders := make(map[string]string)
for _, header := range v.ResponseHeaders.Headers.GetHeaders() {
value := string(header.RawValue)

Expand All @@ -199,27 +202,53 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
reqCtx.modelServerStreaming = true
loggerTrace.Info("model server is streaming response")
}
responseHeaders[header.Key] = value
}

reqCtx.RequestState = ResponseRecieved
reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{
Response: &extProcPb.ProcessingResponse_ResponseHeaders{
ResponseHeaders: &extProcPb.HeadersResponse{
Response: &extProcPb.CommonResponse{
HeaderMutation: &extProcPb.HeaderMutation{
SetHeaders: []*configPb.HeaderValueOption{
{
Header: &configPb.HeaderValue{
// This is for debugging purpose only.
Key: "x-went-into-resp-headers",
RawValue: []byte("true"),
},
},
llmReq := &schedulingtypes.LLMRequest{
Model: reqCtx.Model,
Headers: responseHeaders,
ResolvedTargetModel: reqCtx.ResolvedTargetModel,
}

var result *types.Result
result, err = s.scheduler.OnResponse(ctx, llmReq, reqCtx.TargetPod)
if err != nil {
logger.V(logutil.DEFAULT).Error(err, "Error handling response")
reqCtx.ResponseStatusCode = errutil.ModelServerError
} else {
headers := []*configPb.HeaderValueOption{
{
Header: &configPb.HeaderValue{
// This is for debugging purpose only.
Key: "x-went-into-resp-headers",
RawValue: []byte("true"),
},
},
}

// Add headers added by PostResponse
for key, value := range result.MutatedHeaders {
headers = append(headers, &configPb.HeaderValueOption{
Header: &configPb.HeaderValue{
Key: key,
RawValue: []byte(value),
},
})
}

reqCtx.RequestState = ResponseRecieved
reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{
Response: &extProcPb.ProcessingResponse_ResponseHeaders{
ResponseHeaders: &extProcPb.HeadersResponse{
Response: &extProcPb.CommonResponse{
HeaderMutation: &extProcPb.HeaderMutation{
SetHeaders: headers,
},
},
},
},
},
}
}

case *extProcPb.ProcessingRequest_ResponseBody:
Expand Down
2 changes: 2 additions & 0 deletions pkg/epp/scheduling/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type SchedulerConfig struct {
scorers map[plugins.Scorer]int // map from scorer to weight
picker plugins.Picker
postSchedulePlugins []plugins.PostSchedule
postResponsePlugins []plugins.PostResponse
}

var defPlugin = &defaultPlugin{}
Expand All @@ -40,4 +41,5 @@ var defaultConfig = &SchedulerConfig{
scorers: map[plugins.Scorer]int{},
picker: defPlugin,
postSchedulePlugins: []plugins.PostSchedule{},
postResponsePlugins: []plugins.PostResponse{},
}
21 changes: 21 additions & 0 deletions pkg/epp/scheduling/local_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ package scheduling

import (
"context"

"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer"
envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
Expand All @@ -28,16 +30,22 @@ import (
const (
kvCacheScorerEnablementEnvVar = "ENABLE_KVCACHE_AWARE_SCORER"
loadAwareScorerEnablementEnvVar = "ENABLE_LOAD_AWARE_SCORER"
pdFilterEnablementEnvVar = "ENABLE_PD_FILTER"

kvCacheScorerWeightEnvVar = "KVCACHE_AWARE_SCORER_WEIGHT"
loadAwareScorerWeightEnvVar = "LOAD_AWARE_SCORER_WEIGHT"
)

func init() {
setDefaultConfig()
}

func setDefaultConfig() {
// since the default config is a global variable, we add this function to minimize rebase conflicts.
// this configuration is a temporary state, it should be better streamlined.
setLoadAwareScorer()
setKVCacheAwareScorer()
setPDFilter()

defaultConfig.picker = picker.NewMaxScorePicker()
}
Expand Down Expand Up @@ -75,3 +83,16 @@ func setKVCacheAwareScorer() {
defaultConfig.scorers[kvCacheScorer] = kvCacheScorerWeight
loggerDebug.Info("Initialized KVCacheAwareScorer", "weight", kvCacheScorerWeight)
}

func setPDFilter() {
ctx := context.Background()
loggerDebug := log.FromContext(ctx).WithName("scheduler_config").V(logutil.DEBUG)

if envutil.GetEnvString(pdFilterEnablementEnvVar, "false", loggerDebug) != "true" {
loggerDebug.Info("Skipping PDFilter creation as it is not enabled")
return
}

defaultConfig.filters = append(defaultConfig.filters, filter.PDFilter)
loggerDebug.Info("Initialized PDFilter")
}
Loading