diff --git a/internal/translator/openai_gcpvertexai.go b/internal/translator/openai_gcpvertexai.go
index 3e766bb07..85c237c9f 100644
--- a/internal/translator/openai_gcpvertexai.go
+++ b/internal/translator/openai_gcpvertexai.go
@@ -194,8 +194,41 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) handleStreamingResponse(
 		// Convert GCP chunk to OpenAI chunk.
 		openAIChunk := o.convertGCPChunkToOpenAI(chunk)
 
-		// Extract token usage if present in this chunk (typically in the last chunk).
-		if chunk.UsageMetadata != nil {
+		// Serialize to SSE format as expected by OpenAI API.
+		err := serializeOpenAIChatCompletionChunk(*openAIChunk, &newBody)
+		if err != nil {
+			return nil, nil, metrics.TokenUsage{}, "", fmt.Errorf("error marshaling OpenAI chunk: %w", err)
+		}
+
+		if span != nil {
+			span.RecordResponseChunk(openAIChunk)
+		}
+
+		// Extract token usage only in the last chunk.
+		if chunk.UsageMetadata != nil && chunk.UsageMetadata.PromptTokenCount > 0 {
+			// Convert usage to pointer if available.
+			usage := ptr.To(geminiUsageToOpenAIUsage(chunk.UsageMetadata))
+
+			usageChunk := openai.ChatCompletionResponseChunk{
+				ID:      chunk.ResponseID,
+				Created: openai.JSONUNIXTime(chunk.CreateTime),
+				Object:  "chat.completion.chunk",
+				Choices: []openai.ChatCompletionResponseChunkChoice{},
+				// usage is nil for all chunks other than the last chunk
+				Usage: usage,
+				Model: o.requestModel,
+			}
+
+			// Serialize to SSE format as expected by OpenAI API.
+			err := serializeOpenAIChatCompletionChunk(usageChunk, &newBody)
+			if err != nil {
+				return nil, nil, metrics.TokenUsage{}, "", fmt.Errorf("error marshaling OpenAI chunk: %w", err)
+			}
+
+			if span != nil {
+				span.RecordResponseChunk(&usageChunk)
+			}
+
 			if chunk.UsageMetadata.PromptTokenCount >= 0 {
 				tokenUsage.SetInputTokens(uint32(chunk.UsageMetadata.PromptTokenCount)) //nolint:gosec
 			}
@@ -209,16 +242,6 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) handleStreamingResponse(
 				tokenUsage.SetCachedInputTokens(uint32(chunk.UsageMetadata.CachedContentTokenCount)) //nolint:gosec
 			}
 		}
-
-		// Serialize to SSE format as expected by OpenAI API.
-		err := serializeOpenAIChatCompletionChunk(*openAIChunk, &newBody)
-		if err != nil {
-			return nil, nil, metrics.TokenUsage{}, "", fmt.Errorf("error marshaling OpenAI chunk: %w", err)
-		}
-
-		if span != nil {
-			span.RecordResponseChunk(openAIChunk)
-		}
 	}
 
 	if endOfStream {
@@ -381,19 +404,14 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) convertGCPChunkToOpenAI(
 		choices = []openai.ChatCompletionResponseChunkChoice{}
 	}
 
-	// Convert usage to pointer if available.
-	var usage *openai.Usage
-	if chunk.UsageMetadata != nil {
-		usage = ptr.To(geminiUsageToOpenAIUsage(chunk.UsageMetadata))
-	}
-
 	return &openai.ChatCompletionResponseChunk{
 		ID:      chunk.ResponseID,
 		Created: openai.JSONUNIXTime(chunk.CreateTime),
 		Object:  "chat.completion.chunk",
 		Choices: choices,
-		Usage:   usage,
-		Model:   o.requestModel,
+		// usage is nil for all chunks other than the last chunk
+		Usage: nil,
+		Model: o.requestModel,
 	}
 }
diff --git a/internal/translator/openai_gcpvertexai_test.go b/internal/translator/openai_gcpvertexai_test.go
index a60b1bcf9..e86a1ed61 100644
--- a/internal/translator/openai_gcpvertexai_test.go
+++ b/internal/translator/openai_gcpvertexai_test.go
@@ -969,7 +969,9 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T
 			endOfStream:   true,
 			wantError:     false,
 			wantHeaderMut: nil,
-			wantBodyMut: []byte(`data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
+			wantBodyMut: []byte(`data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk"}
+
+data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
 
 data: [DONE]
 `),
@@ -1178,7 +1180,9 @@ data: {"candidates":[{"content":{"parts":[{"text":"Hello"}]}}],"usageMetadata":{
 			wantHeaderMut: nil,
 			wantBodyMut: []byte(`data: {"choices":[{"index":0,"delta":{"role":"assistant","reasoning_content":{"text":"let me think step by step and reply you."}}}],"object":"chat.completion.chunk"}
 
-data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
+data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk"}
+
+data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
 
 data: [DONE]
 `),
diff --git a/tests/extproc/testupstream_test.go b/tests/extproc/testupstream_test.go
index dfe90a049..779341f46 100644
--- a/tests/extproc/testupstream_test.go
+++ b/tests/extproc/testupstream_test.go
@@ -579,7 +579,9 @@ data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":" you","role":"as
 data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":" today","role":"assistant"}}],"created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk"}
 
-data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":"?","role":"assistant"},"finish_reason":"stop"}],"created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk","usage":{"prompt_tokens":10,"completion_tokens":7,"total_tokens":17,"completion_tokens_details":{},"prompt_tokens_details":{}}}
+data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":"?","role":"assistant"},"finish_reason":"stop"}],"created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk"}
+
+data: {"id":"msg_123","created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk","usage":{"prompt_tokens":10,"completion_tokens":7,"total_tokens":17,"completion_tokens_details":{},"prompt_tokens_details":{}}}
 
 data: [DONE]
 `,
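
Reviewer note: below is a minimal, self-contained Go sketch (not part of the patch) of the SSE framing this change produces: content chunks never carry a "usage" key, and a single usage-only chunk with an empty choices array is emitted before data: [DONE]. The type and function names here (usage, chunk, writeSSE) are simplified stand-ins for illustration, not the project's actual openai types or its serializeOpenAIChatCompletionChunk helper.

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"os"
)

// usage is a pared-down stand-in for the OpenAI usage object.
type usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

// chunk is a pared-down chat.completion.chunk; Usage is a pointer so the
// "usage" key is omitted from the JSON whenever it is nil, as on the new
// convertGCPChunkToOpenAI path.
type chunk struct {
	Choices []any  `json:"choices"`
	Object  string `json:"object"`
	Usage   *usage `json:"usage,omitempty"`
}

// writeSSE marshals one chunk into the "data: ...\n\n" framing that OpenAI
// streaming clients expect (the role serializeOpenAIChatCompletionChunk
// plays in the patch).
func writeSSE(w io.Writer, c chunk) error {
	b, err := json.Marshal(c)
	if err != nil {
		return err
	}
	_, err = fmt.Fprintf(w, "data: %s\n\n", b)
	return err
}

func main() {
	// Content chunk: Usage stays nil, so no "usage" key is emitted.
	_ = writeSSE(os.Stdout, chunk{
		Choices: []any{map[string]any{"index": 0, "delta": map[string]string{"content": "Hello", "role": "assistant"}}},
		Object:  "chat.completion.chunk",
	})
	// Final chunk: empty choices array plus the aggregated usage, mirroring
	// the usageChunk the patch builds from chunk.UsageMetadata.
	_ = writeSSE(os.Stdout, chunk{
		Choices: []any{},
		Object:  "chat.completion.chunk",
		Usage:   &usage{PromptTokens: 5, CompletionTokens: 3, TotalTokens: 8},
	})
	fmt.Print("data: [DONE]\n\n")
}

Running the sketch prints a three-frame sequence equivalent to what the updated wantBodyMut expectations assert (JSON key order and the omitted *_tokens_details objects aside).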