Skip to content

Commit 1c3d559

Browse files
authored
feat(metrics): add request prompt, generation, max_tokens and success metrics (#202)
* feat(metrics): add request prompt, generation, max_tokens and success metrics Signed-off-by: googs1025 <[email protected]> * fix review comment Signed-off-by: googs1025 <[email protected]> * fix conflict Signed-off-by: googs1025 <[email protected]> --------- Signed-off-by: googs1025 <[email protected]>
1 parent 8cc134e commit 1c3d559

File tree

7 files changed

+472
-19
lines changed

7 files changed

+472
-19
lines changed

manifests/config_with_fake.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,15 @@ time-to-first-token: 2000
77
inter-token-latency: 1000
88
kv-cache-transfer-latency: 100
99
seed: 100100100
10-
fake-metrics:
10+
fake-metrics:
1111
running-requests: 16
1212
waiting-requests: 3
1313
kv-cache-usage: 0.3
14+
request-success-total:
15+
stop: 20
16+
request-prompt-tokens: [ 10, 20, 30, 15 ]
17+
request-generation-tokens: [ 50, 60, 40 ]
18+
request-params-max-tokens: [ 128, 256, 512 ]
1419
loras:
1520
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
1621
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'

pkg/common/config.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@ const (
4343
FailureTypeServerError = "server_error"
4444
FailureTypeInvalidRequest = "invalid_request"
4545
FailureTypeModelNotFound = "model_not_found"
46+
47+
StopFinishReason = "stop"
48+
LengthFinishReason = "length"
49+
ToolsFinishReason = "tool_calls"
50+
RemoteDecodeFinishReason = "remote_decode"
51+
)
52+
53+
var (
54+
requiredFinishReasons = []string{
55+
StopFinishReason,
56+
LengthFinishReason,
57+
ToolsFinishReason,
58+
RemoteDecodeFinishReason,
59+
}
60+
61+
validFinishReasons = map[string]struct{}{
62+
StopFinishReason: {},
63+
LengthFinishReason: {},
64+
ToolsFinishReason: {},
65+
RemoteDecodeFinishReason: {},
66+
}
4667
)
4768

4869
type Configuration struct {
@@ -223,6 +244,13 @@ type Metrics struct {
223244
// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
224245
// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
225246
TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
247+
// RequestPromptTokens, RequestGenerationTokens and RequestParamsMaxTokens are histogram fake-observation arrays for init.
248+
// Each value will be passed to Observe() once at start-up.
249+
RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
250+
RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
251+
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
252+
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
253+
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
226254
}
227255

228256
type LorasMetrics struct {
@@ -521,6 +549,38 @@ func (c *Configuration) validate() error {
521549
}
522550
}
523551
}
552+
if c.FakeMetrics.RequestSuccessTotal != nil {
553+
for reason, count := range c.FakeMetrics.RequestSuccessTotal {
554+
if count < 0 {
555+
return fmt.Errorf("fake metrics request-success-total.%s "+
556+
"cannot be negative, got %d", reason, count)
557+
}
558+
if _, ok := validFinishReasons[reason]; !ok {
559+
return fmt.Errorf("invalid finish reason in request-success-total: "+
560+
"%s (valid reasons: %v)", reason, requiredFinishReasons)
561+
}
562+
}
563+
for _, reason := range requiredFinishReasons {
564+
if _, exists := c.FakeMetrics.RequestSuccessTotal[reason]; !exists {
565+
c.FakeMetrics.RequestSuccessTotal[reason] = 0
566+
}
567+
}
568+
}
569+
for _, v := range c.FakeMetrics.RequestPromptTokens {
570+
if v < 0 {
571+
return errors.New("fake metrics request-prompt-tokens cannot contain negative values")
572+
}
573+
}
574+
for _, v := range c.FakeMetrics.RequestGenerationTokens {
575+
if v < 0 {
576+
return errors.New("fake metrics request-generation-tokens cannot contain negative values")
577+
}
578+
}
579+
for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
580+
if v < 0 {
581+
return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
582+
}
583+
}
524584
}
525585

526586
if c.DPSize < 1 || c.DPSize > 8 {

pkg/common/config_test.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,8 +203,17 @@ var _ = Describe("Simulator configuration", func() {
203203
"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
204204
"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
205205
},
206-
TTFTBucketValues: []int{10, 20, 30, 10},
207-
TPOTBucketValues: []int{0, 0, 10, 20, 30},
206+
TTFTBucketValues: []int{10, 20, 30, 10},
207+
TPOTBucketValues: []int{0, 0, 10, 20, 30},
208+
RequestPromptTokens: []int{10, 20, 30, 15},
209+
RequestGenerationTokens: []int{50, 60, 40},
210+
RequestParamsMaxTokens: []int{128, 256, 512},
211+
RequestSuccessTotal: map[string]int64{
212+
StopFinishReason: 20,
213+
LengthFinishReason: 0,
214+
ToolsFinishReason: 0,
215+
RemoteDecodeFinishReason: 0,
216+
},
208217
}
209218
test = testCase{
210219
name: "config with fake metrics file",

pkg/llm-d-inference-sim/metrics.go

Lines changed: 145 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package llmdinferencesim
2020

2121
import (
2222
"context"
23+
"math"
2324
"strconv"
2425
"strings"
2526
"sync"
@@ -65,6 +66,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
6566
return err
6667
}
6768

69+
// not supported for now, reports constant value
6870
s.waitingRequests = prometheus.NewGaugeVec(
6971
prometheus.GaugeOpts{
7072
Subsystem: "",
@@ -123,6 +125,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
123125
return err
124126
}
125127

128+
s.requestPromptTokens = prometheus.NewHistogramVec(
129+
prometheus.HistogramOpts{
130+
Subsystem: "",
131+
Name: "vllm:request_prompt_tokens",
132+
Help: "Number of prefill tokens processed.",
133+
Buckets: build125Buckets(s.config.MaxModelLen),
134+
},
135+
[]string{vllmapi.PromLabelModelName},
136+
)
137+
if err := s.registry.Register(s.requestPromptTokens); err != nil {
138+
s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
139+
return err
140+
}
141+
142+
s.requestGenerationTokens = prometheus.NewHistogramVec(
143+
prometheus.HistogramOpts{
144+
Subsystem: "",
145+
Name: "vllm:request_generation_tokens",
146+
Help: "Number of generation tokens processed.",
147+
Buckets: build125Buckets(s.config.MaxModelLen),
148+
},
149+
[]string{vllmapi.PromLabelModelName},
150+
)
151+
if err := s.registry.Register(s.requestGenerationTokens); err != nil {
152+
s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
153+
return err
154+
}
155+
156+
s.requestParamsMaxTokens = prometheus.NewHistogramVec(
157+
prometheus.HistogramOpts{
158+
Subsystem: "",
159+
Name: "vllm:request_params_max_tokens",
160+
Help: "Histogram of the max_tokens request parameter.",
161+
Buckets: build125Buckets(s.config.MaxModelLen),
162+
},
163+
[]string{vllmapi.PromLabelModelName},
164+
)
165+
if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
166+
s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
167+
return err
168+
}
169+
170+
s.requestSuccessTotal = prometheus.NewCounterVec(
171+
prometheus.CounterOpts{
172+
Subsystem: "",
173+
Name: "vllm:request_success_total",
174+
Help: "Count of successfully processed requests.",
175+
},
176+
[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
177+
)
178+
if err := s.registry.Register(s.requestSuccessTotal); err != nil {
179+
s.logger.Error(err, "Prometheus request_success_total counter register failed")
180+
return err
181+
}
182+
126183
s.setInitialPrometheusMetrics()
127184

128185
return nil
@@ -132,21 +189,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
132189
// the fake metrics if set
133190
func (s *VllmSimulator) setInitialPrometheusMetrics() {
134191
var nRunningReqs, nWaitingReqs, kvCacheUsage float64
192+
modelName := s.getDisplayedModelName(s.config.Model)
135193
if s.config.FakeMetrics != nil {
136194
nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
137195
nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
138196
kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
139-
140197
if s.config.FakeMetrics.TTFTBucketValues != nil {
141198
s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues)
142199
}
143200

144201
if s.config.FakeMetrics.TPOTBucketValues != nil {
145202
s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues)
146203
}
204+
buckets := build125Buckets(s.config.MaxModelLen)
205+
if s.config.FakeMetrics.RequestPromptTokens != nil {
206+
s.initFakeHistogram(s.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
207+
}
208+
if s.config.FakeMetrics.RequestGenerationTokens != nil {
209+
s.initFakeHistogram(s.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
210+
}
211+
if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
212+
s.initFakeHistogram(s.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)
213+
}
214+
215+
for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
216+
s.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
217+
}
147218
}
148219

149-
modelName := s.getDisplayedModelName(s.config.Model)
150220
s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
151221
s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
152222
s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
@@ -288,6 +358,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
288358
go s.kvCacheUsageUpdater(ctx)
289359
go s.ttftUpdater(ctx)
290360
go s.tpotUpdater(ctx)
361+
go s.recordRequestUpdater(ctx)
291362
}
292363

293364
// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -396,3 +467,75 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
396467
s.logger.Error(nil, "Zero model reference", "model", lora)
397468
}
398469
}
470+
471+
// recordRequestUpdater listens on requestSuccessChan and drives the Prometheus metric
472+
// for successfully completed requests.
473+
func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
474+
for {
475+
select {
476+
case <-ctx.Done():
477+
return
478+
case event := <-s.requestSuccessChan:
479+
s.recordRequestMetricsOnSuccess(
480+
event.promptTokens,
481+
event.generationTokens,
482+
event.maxTokens,
483+
event.finishReason,
484+
)
485+
}
486+
}
487+
}
488+
489+
// requestSuccessEvent carries the data of one successfully completed request.
// It is sent through requestSuccessChan so metrics can be recorded asynchronously.
type requestSuccessEvent struct {
	// promptTokens is the number of input (prompt) tokens in the request.
	promptTokens int
	// generationTokens is the number of generated (output) tokens in the response.
	generationTokens int
	// maxTokens is the request's max_tokens parameter; nil when the request did not set it.
	maxTokens *int64
	// finishReason records why generation stopped (e.g. "stop", "length", "tool_calls").
	finishReason string
}
501+
502+
// recordRequestMetricsOnSuccess records metrics for a successfully completed request
503+
func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
504+
generationTokens int, maxTokens *int64, finishReason string) {
505+
modelName := s.getDisplayedModelName(s.config.Model)
506+
s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
507+
s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
508+
if maxTokens != nil {
509+
s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
510+
}
511+
s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
512+
}
513+
514+
// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5],
// e.g. 1, 2, 5, 10, 20, 50, ... up to and including maxValue.
// This matches vLLM's build_1_2_5_buckets() in metrics.py.
//
// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
//
// Unlike the float-based original, the scale is kept as an integer and grown
// multiplicatively: converting math.Pow10 results back to int is
// implementation-defined once the power of ten exceeds the int range, which
// could wrap negative and loop forever for very large maxValue.
func build125Buckets(maxValue int) []float64 {
	if maxValue <= 0 {
		return []float64{}
	}
	var buckets []float64
	mantissa := []int{1, 2, 5}
	for pow10 := 1; ; pow10 *= 10 {
		added := false
		for _, m := range mantissa {
			// m <= maxValue/pow10 is equivalent to m*pow10 <= maxValue
			// but cannot overflow.
			if m <= maxValue/pow10 {
				buckets = append(buckets, float64(m*pow10))
				added = true
			}
		}
		if !added {
			// No mantissa fit at this scale; larger scales cannot fit either.
			return buckets
		}
		// Stop before pow10*10 would overflow; the next scale exceeds maxValue anyway.
		if pow10 > maxValue/10 {
			return buckets
		}
	}
}

0 commit comments

Comments
 (0)