@@ -20,6 +20,7 @@ package llmdinferencesim
 
 import (
 	"context"
+	"math"
 	"strconv"
 	"strings"
 	"sync"
@@ -65,6 +66,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	// not supported for now, reports constant value
 	s.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
@@ -123,6 +125,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of prefill tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generation tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()
 
 	return nil
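
Once registered, these series show up on the simulator's /metrics endpoint in the standard Prometheus text format, roughly as below (assuming vllmapi.PromLabelModelName and vllmapi.PromLabelFinishReason resolve to model_name and finish_reason; the model name, bucket counts, and sum are illustrative):

	vllm:request_prompt_tokens_bucket{model_name="my-model",le="1"} 0
	vllm:request_prompt_tokens_bucket{model_name="my-model",le="2"} 1
	...
	vllm:request_prompt_tokens_bucket{model_name="my-model",le="+Inf"} 4
	vllm:request_prompt_tokens_sum{model_name="my-model"} 1536
	vllm:request_prompt_tokens_count{model_name="my-model"} 4
	vllm:request_success_total{finish_reason="stop",model_name="my-model"} 4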
@@ -132,21 +189,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	modelName := s.getDisplayedModelName(s.config.Model)
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
-
 		if s.config.FakeMetrics.TTFTBucketValues != nil {
 			s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues)
 		}
 
 		if s.config.FakeMetrics.TPOTBucketValues != nil {
 			s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues)
 		}
+		buckets := build125Buckets(s.config.MaxModelLen)
+		if s.config.FakeMetrics.RequestPromptTokens != nil {
+			s.initFakeHistogram(s.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		}
+		if s.config.FakeMetrics.RequestGenerationTokens != nil {
+			s.initFakeHistogram(s.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		}
+		if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
+			s.initFakeHistogram(s.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)
+		}
+
+		for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
+			s.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
+		}
 	}
 
-	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
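
initFakeHistogram is referenced above but not part of this diff. A minimal sketch of what such a helper could look like, assuming each entry in the values slice is the number of observations to record at the corresponding bucket boundary (signature and semantics are assumptions, not the repo's actual code):

// Hypothetical sketch only: record values[i] observations at boundaries[i]
// so the histogram buckets reflect the configured fake counts.
func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec,
	boundaries []float64, values []int) {
	observer := hist.WithLabelValues(s.getDisplayedModelName(s.config.Model))
	for i, n := range values {
		if i >= len(boundaries) {
			break // assumed: values beyond the known boundaries are ignored
		}
		for j := 0; j < n; j++ {
			observer.Observe(boundaries[i])
		}
	}
}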
@@ -288,6 +358,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.kvCacheUsageUpdater(ctx)
 	go s.ttftUpdater(ctx)
 	go s.tpotUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -396,3 +467,75 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestUpdater listens on requestSuccessChan and drives the Prometheus metrics
+// for successfully completed requests.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event := <-s.requestSuccessChan:
+			s.recordRequestMetricsOnSuccess(
+				event.promptTokens,
+				event.generationTokens,
+				event.maxTokens,
+				event.finishReason,
+			)
+		}
+	}
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+	// promptTokens is the number of input (prompt) tokens in the request
+	promptTokens int
+	// generationTokens is the number of generated (output) tokens in the response
+	generationTokens int
+	// maxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+	maxTokens *int64
+	// finishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+	finishReason string
+}
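
The producer side of requestSuccessChan is outside this diff; a hypothetical send at request-completion time might look like the following (promptTokenIDs, generatedTokenIDs, and req are illustrative names, not identifiers from this repo):

// Hypothetical send site; finishReason would be "length" when generation
// stopped because max_tokens was reached.
s.requestSuccessChan <- requestSuccessEvent{
	promptTokens:     len(promptTokenIDs),
	generationTokens: len(generatedTokenIDs),
	maxTokens:        req.MaxTokens, // may be nil when the request omits max_tokens
	finishReason:     "stop",
}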
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+	generationTokens int, maxTokens *int64, finishReason string) {
+	modelName := s.getDisplayedModelName(s.config.Model)
+	s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+	s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	if maxTokens != nil {
+		s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+	}
+	s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+	if maxValue <= 0 {
+		return []float64{}
+	}
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
+	return buckets
+}
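
As a concrete check of the 1-2-5 layout, with s.config.MaxModelLen = 1024 the call produces:

// build125Buckets(1024) == []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}
// 2000 and 5000 exceed 1024, and the next decade (10000, ...) yields no
// values <= maxValue, so `complete` stays true and the loop exits.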