@@ -1,6 +1,7 @@
import base64
import json
import logging
+import os
import re
import time
from abc import ABC
@@ -13,6 +14,7 @@
import tiktoken
from botocore.config import Config
from fastapi import HTTPException
+from langfuse.decorators import observe, langfuse_context
from starlette.concurrency import run_in_threadpool

from api.models.base import BaseChatModel, BaseEmbeddingsModel
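The new `langfuse.decorators` import assumes the SDK is configured out of band; nothing in this diff wires up credentials explicitly. A minimal sketch of the assumed setup, using Langfuse's standard environment variables (the values shown are placeholders, not part of this PR):

import os

# Hedged sketch: the Langfuse client reads these variables on first use.
os.environ.setdefault("LANGFUSE_PUBLIC_KEY", "pk-lf-placeholder")
os.environ.setdefault("LANGFUSE_SECRET_KEY", "sk-lf-placeholder")
os.environ.setdefault("LANGFUSE_HOST", "https://cloud.langfuse.com")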
@@ -230,6 +232,7 @@ def validate(self, chat_request: ChatRequest):
                detail=error,
            )

+    @observe(as_type="generation", name="Bedrock Converse")
    async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False):
        """Common logic for invoke bedrock models"""
        if DEBUG:
@@ -240,6 +243,27 @@ async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False):
        if DEBUG:
            logger.info("Bedrock request: " + json.dumps(str(args)))

+        # Extract model metadata for Langfuse
+        args_clone = args.copy()
+        messages = args_clone.get('messages', [])
+        model_id = args_clone.get('modelId', 'unknown')
+        model_parameters = {
+            **args_clone.get('inferenceConfig', {}),
+            **args_clone.get('additionalModelRequestFields', {})
+        }
+
+        # Update Langfuse generation with input metadata
+        langfuse_context.update_current_observation(
+            input=messages,
+            model=model_id,
+            model_parameters=model_parameters,
+            metadata={
+                'system': args_clone.get('system', []),
+                'toolConfig': args_clone.get('toolConfig', {}),
+                'stream': stream
+            }
+        )
+
        try:
            if stream:
                # Run the blocking boto3 call in a thread pool
@@ -249,14 +273,56 @@ async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False):
            else:
                # Run the blocking boto3 call in a thread pool
                response = await run_in_threadpool(bedrock_runtime.converse, **args)
+
+            # For non-streaming, extract response metadata immediately
+            if response and not stream:
+                output_message = response.get("output", {}).get("message", {})
+                usage = response.get("usage", {})
+
+                # Build metadata
+                metadata = {
+                    "stopReason": response.get("stopReason"),
+                    "ResponseMetadata": response.get("ResponseMetadata", {})
+                }
+
+                # Check for reasoning content in response
+                has_reasoning = False
+                reasoning_text = ""
+                if output_message and "content" in output_message:
+                    for content_block in output_message.get("content", []):
+                        if "reasoningContent" in content_block:
+                            has_reasoning = True
+                            reasoning_text = content_block.get("reasoningContent", {}).get("reasoningText", {}).get("text", "")
+                            break
+
+                if has_reasoning and reasoning_text:
+                    metadata["has_extended_thinking"] = True
+                    metadata["reasoning_content"] = reasoning_text
+                    metadata["reasoning_tokens_estimate"] = len(reasoning_text) // 4
+
+                langfuse_context.update_current_observation(
+                    output=output_message,
+                    usage={
+                        "input": usage.get("inputTokens", 0),
+                        "output": usage.get("outputTokens", 0),
+                        "total": usage.get("totalTokens", 0)
+                    },
+                    metadata=metadata
+                )
        except bedrock_runtime.exceptions.ValidationException as e:
-            logger.error("Bedrock validation error for model %s: %s", chat_request.model, str(e))
+            error_message = f"Bedrock validation error for model {chat_request.model}: {str(e)}"
+            logger.error(error_message)
+            langfuse_context.update_current_observation(level="ERROR", status_message=error_message)
            raise HTTPException(status_code=400, detail=str(e))
        except bedrock_runtime.exceptions.ThrottlingException as e:
-            logger.warning("Bedrock throttling for model %s: %s", chat_request.model, str(e))
+            error_message = f"Bedrock throttling for model {chat_request.model}: {str(e)}"
+            logger.warning(error_message)
+            langfuse_context.update_current_observation(level="WARNING", status_message=error_message)
            raise HTTPException(status_code=429, detail=str(e))
        except Exception as e:
-            logger.error("Bedrock invocation failed for model %s: %s", chat_request.model, str(e))
+            error_message = f"Bedrock invocation failed for model {chat_request.model}: {str(e)}"
+            logger.error(error_message)
+            langfuse_context.update_current_observation(level="ERROR", status_message=error_message)
            raise HTTPException(status_code=500, detail=str(e))
        return response

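The reasoning-token figure recorded above (`len(reasoning_text) // 4`) is a rough characters-per-token heuristic. Since this module already imports `tiktoken`, a tighter estimate could tokenize the text directly; a sketch (the helper name and the `cl100k_base` encoding are assumptions, and an OpenAI tokenizer still only approximates Anthropic's):

import tiktoken

def estimate_reasoning_tokens(text: str) -> int:
    # Hypothetical helper, not part of this PR: count tokens with tiktoken
    # instead of dividing the character length by 4.
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))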
@@ -296,11 +362,37 @@ async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]:
            message_id = self.generate_message_id()
            stream = response.get("stream")
            self.think_emitted = False
+
+            # Track streaming output and usage for Langfuse
+            accumulated_output = []
+            accumulated_reasoning = []
+            final_usage = None
+            finish_reason = None
+            has_reasoning = False
+
            async for chunk in self._async_iterate(stream):
                args = {"model_id": chat_request.model, "message_id": message_id, "chunk": chunk}
                stream_response = self._create_response_stream(**args)
                if not stream_response:
                    continue
+
+                # Accumulate output content for Langfuse tracking
+                if stream_response.choices:
+                    for choice in stream_response.choices:
+                        if choice.delta and choice.delta.content:
+                            content = choice.delta.content
+                            # Check if this is reasoning content (wrapped in <think> tags)
+                            if "<think>" in content or self.think_emitted:
+                                accumulated_reasoning.append(content)
+                                has_reasoning = True
+                            accumulated_output.append(content)
+                        if choice.finish_reason:
+                            finish_reason = choice.finish_reason
+
+                # Capture final usage metrics for Langfuse tracking
+                if stream_response.usage:
+                    final_usage = stream_response.usage
+
                if DEBUG:
                    logger.info("Proxy response :" + stream_response.model_dump_json())
                if stream_response.choices:
@@ -314,11 +406,43 @@ async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]:
                    # All other chunks will also include a usage field, but with a null value.
                    yield self.stream_response_to_bytes(stream_response)

+            # Update Langfuse with final streaming metadata
+            if final_usage or accumulated_output:
+                update_params = {}
+                if accumulated_output:
+                    final_output = "".join(accumulated_output)
+                    update_params["output"] = final_output
+                if final_usage:
+                    update_params["usage"] = {
+                        "input": final_usage.prompt_tokens,
+                        "output": final_usage.completion_tokens,
+                        "total": final_usage.total_tokens
+                    }
+                # Build metadata
+                metadata = {}
+                if finish_reason:
+                    metadata["finish_reason"] = finish_reason
+                if has_reasoning and accumulated_reasoning:
+                    reasoning_text = "".join(accumulated_reasoning)
+                    metadata["has_extended_thinking"] = True
+                    metadata["reasoning_content"] = reasoning_text
+                    # Estimate reasoning tokens (rough approximation: ~4 chars per token)
+                    metadata["reasoning_tokens_estimate"] = len(reasoning_text) // 4
+                if metadata:
+                    update_params["metadata"] = metadata
+
+                langfuse_context.update_current_observation(**update_params)
+
            # Return a [DONE] message at the end.
            yield self.stream_response_to_bytes()
            self.think_emitted = False  # Cleanup
        except Exception as e:
            logger.error("Stream error for model %s: %s", chat_request.model, str(e))
+            # Update Langfuse with error
+            langfuse_context.update_current_observation(
+                level="ERROR",
+                status_message=f"Stream error: {str(e)}"
+            )
            error_event = Error(error=ErrorMessage(message=str(e)))
            yield self.stream_response_to_bytes(error_event)