
Commit d4caaae

Merge pull request BerriAI#9274 from BerriAI/litellm_contributor_rebase_branch
Litellm contributor rebase branch
2 parents cd95634 + 4bc5f27 commit d4caaae

15 files changed (+467 −44 lines)


.circleci/config.yml

+3 −1

@@ -71,7 +71,7 @@ jobs:
             pip install "Pillow==10.3.0"
             pip install "jsonschema==4.22.0"
             pip install "pytest-xdist==3.6.1"
-            pip install "websockets==10.4"
+            pip install "websockets==13.1.0"
            pip uninstall posthog -y
      - save_cache:
          paths:
@@ -189,6 +189,7 @@ jobs:
             pip install "diskcache==5.6.1"
             pip install "Pillow==10.3.0"
             pip install "jsonschema==4.22.0"
+            pip install "websockets==13.1.0"
      - save_cache:
          paths:
            - ./venv
@@ -288,6 +289,7 @@ jobs:
             pip install "diskcache==5.6.1"
             pip install "Pillow==10.3.0"
             pip install "jsonschema==4.22.0"
+            pip install "websockets==13.1.0"
      - save_cache:
          paths:
            - ./venv

docs/my-website/docs/proxy/guardrails/aim_security.md

+1 −1

@@ -37,7 +37,7 @@ guardrails:
   - guardrail_name: aim-protected-app
     litellm_params:
       guardrail: aim
-      mode: pre_call # 'during_call' is also available
+      mode: [pre_call, post_call] # "During_call" is also available
       api_key: os.environ/AIM_API_KEY
       api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
 ```
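With `post_call` enabled, streamed responses now also pass through the Aim guardrail. A minimal sketch of exercising that path through the proxy, assuming a proxy running on `localhost:4000` with the config above loaded; the model name, API key, and the optional `x-aim-user-email` header value are placeholders (the hooks added in `aim.py` below read that header from request metadata when present):

```python
# Sketch: streaming request through the LiteLLM proxy with the Aim guardrail active.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")  # placeholder key

stream = client.chat.completions.create(
    model="gpt-4o",  # placeholder model name from the proxy's model list
    messages=[{"role": "user", "content": "hello"}],
    stream=True,
    # Optional: forwarded to Aim by the guardrail hooks when supplied.
    extra_headers={"x-aim-user-email": "user@example.com"},
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```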

litellm/integrations/custom_logger.py

+20 −1

@@ -1,7 +1,16 @@
 #### What this does ####
 # On success, logs events to Promptlayer
 import traceback
-from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncGenerator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+)

 from pydantic import BaseModel

@@ -14,6 +23,7 @@
     EmbeddingResponse,
     ImageResponse,
     ModelResponse,
+    ModelResponseStream,
     StandardCallbackDynamicParams,
     StandardLoggingPayload,
 )
@@ -251,6 +261,15 @@ async def async_post_call_streaming_hook(
     ) -> Any:
         pass

+    async def async_post_call_streaming_iterator_hook(
+        self,
+        user_api_key_dict: UserAPIKeyAuth,
+        response: Any,
+        request_data: dict,
+    ) -> AsyncGenerator[ModelResponseStream, None]:
+        async for item in response:
+            yield item
+
     #### SINGLE-USE #### - https://docs.litellm.ai/docs/observability/custom_callback#using-your-custom-callback-function

     def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):
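The base-class default above is a pure pass-through, so existing loggers keep working; overriding it lets a callback rewrite or drop chunks across the whole stream. A rough sketch of a subclass using the new hook — the class name and the uppercasing transform are purely illustrative, not part of this commit:

```python
from typing import Any, AsyncGenerator

from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.utils import ModelResponseStream


class ShoutingLogger(CustomLogger):
    """Illustrative only: uppercase every text delta before it reaches the client."""

    async def async_post_call_streaming_iterator_hook(
        self,
        user_api_key_dict: UserAPIKeyAuth,
        response: Any,
        request_data: dict,
    ) -> AsyncGenerator[ModelResponseStream, None]:
        async for chunk in response:
            # Only touch chunks that actually carry text content.
            if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                chunk.choices[0].delta.content = chunk.choices[0].delta.content.upper()
            yield chunk
```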

litellm/llms/bedrock/chat/converse_handler.py

+1 −1

@@ -274,7 +274,7 @@ def completion(  # noqa: PLR0915
         if modelId is not None:
             modelId = self.encode_model_id(model_id=modelId)
         else:
-            modelId = model
+            modelId = self.encode_model_id(model_id=model)

         if stream is True and "ai21" in modelId:
             fake_stream = True

litellm/proxy/guardrails/guardrail_hooks/aim.py

+111 −4

@@ -4,11 +4,14 @@
 # https://www.aim.security/
 #
 # +-------------------------------------------------------------+
-
+import asyncio
+import json
 import os
-from typing import Literal, Optional, Union
+from typing import Any, AsyncGenerator, Literal, Optional, Union

 from fastapi import HTTPException
+from pydantic import BaseModel
+from websockets.asyncio.client import ClientConnection, connect

 from litellm import DualCache
 from litellm._logging import verbose_proxy_logger
@@ -18,6 +21,14 @@
     httpxSpecialProvider,
 )
 from litellm.proxy._types import UserAPIKeyAuth
+from litellm.proxy.proxy_server import StreamingCallbackError
+from litellm.types.utils import (
+    Choices,
+    EmbeddingResponse,
+    ImageResponse,
+    ModelResponse,
+    ModelResponseStream,
+)


 class AimGuardrailMissingSecrets(Exception):
@@ -41,6 +52,9 @@ def __init__(
         self.api_base = (
             api_base or os.environ.get("AIM_API_BASE") or "https://api.aim.security"
         )
+        self.ws_api_base = self.api_base.replace("http://", "ws://").replace(
+            "https://", "wss://"
+        )
         super().__init__(**kwargs)

     async def async_pre_call_hook(
@@ -98,8 +112,101 @@ async def call_aim_guardrail(self, data: dict, hook: str) -> None:
         detected = res["detected"]
         verbose_proxy_logger.info(
             "Aim: detected: {detected}, enabled policies: {policies}".format(
-                detected=detected, policies=list(res["details"].keys())
-            )
+                detected=detected,
+                policies=list(res["details"].keys()),
+            ),
         )
         if detected:
             raise HTTPException(status_code=400, detail=res["detection_message"])
+
+    async def call_aim_guardrail_on_output(
+        self, request_data: dict, output: str, hook: str
+    ) -> Optional[str]:
+        user_email = (
+            request_data.get("metadata", {}).get("headers", {}).get("x-aim-user-email")
+        )
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "x-aim-litellm-hook": hook,
+        } | ({"x-aim-user-email": user_email} if user_email else {})
+        response = await self.async_handler.post(
+            f"{self.api_base}/detect/output",
+            headers=headers,
+            json={"output": output, "messages": request_data.get("messages", [])},
+        )
+        response.raise_for_status()
+        res = response.json()
+        detected = res["detected"]
+        verbose_proxy_logger.info(
+            "Aim: detected: {detected}, enabled policies: {policies}".format(
+                detected=detected,
+                policies=list(res["details"].keys()),
+            ),
+        )
+        if detected:
+            return res["detection_message"]
+        return None
+
+    async def async_post_call_success_hook(
+        self,
+        data: dict,
+        user_api_key_dict: UserAPIKeyAuth,
+        response: Union[Any, ModelResponse, EmbeddingResponse, ImageResponse],
+    ) -> Any:
+        if (
+            isinstance(response, ModelResponse)
+            and response.choices
+            and isinstance(response.choices[0], Choices)
+        ):
+            content = response.choices[0].message.content or ""
+            detection = await self.call_aim_guardrail_on_output(
+                data, content, hook="output"
+            )
+            if detection:
+                raise HTTPException(status_code=400, detail=detection)
+
+    async def async_post_call_streaming_iterator_hook(
+        self,
+        user_api_key_dict: UserAPIKeyAuth,
+        response,
+        request_data: dict,
+    ) -> AsyncGenerator[ModelResponseStream, None]:
+        user_email = (
+            request_data.get("metadata", {}).get("headers", {}).get("x-aim-user-email")
+        )
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+        } | ({"x-aim-user-email": user_email} if user_email else {})
+        async with connect(
+            f"{self.ws_api_base}/detect/output/ws", additional_headers=headers
+        ) as websocket:
+            sender = asyncio.create_task(
+                self.forward_the_stream_to_aim(websocket, response)
+            )
+            while True:
+                result = json.loads(await websocket.recv())
+                if verified_chunk := result.get("verified_chunk"):
+                    yield ModelResponseStream.model_validate(verified_chunk)
+                else:
+                    sender.cancel()
+                    if result.get("done"):
+                        return
+                    if blocking_message := result.get("blocking_message"):
+                        raise StreamingCallbackError(blocking_message)
+                    verbose_proxy_logger.error(
+                        f"Unknown message received from AIM: {result}"
+                    )
+                    return
+
+    async def forward_the_stream_to_aim(
+        self,
+        websocket: ClientConnection,
+        response_iter,
+    ) -> None:
+        async for chunk in response_iter:
+            if isinstance(chunk, BaseModel):
+                chunk = chunk.model_dump_json()
+            if isinstance(chunk, dict):
+                chunk = json.dumps(chunk)
+            await websocket.send(chunk)
+        await websocket.send(json.dumps({"done": True}))
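The streaming hook above speaks a small WebSocket protocol with the Aim Outpost: the proxy forwards every chunk as JSON followed by a `{"done": true}` sentinel, and expects each reply to carry one of `verified_chunk`, `done`, or `blocking_message`. A sketch of the reply shapes as the handler interprets them — the keys come from the code above, the field values are illustrative:

```python
# Replies the streaming hook knows how to process (payload values are made up):
verified = {"verified_chunk": {"id": "chatcmpl-abc", "object": "chat.completion.chunk", "choices": []}}
#   -> validated into a ModelResponseStream and yielded to the client
finished = {"done": True}
#   -> sender task is cancelled and the stream ends cleanly
blocked = {"blocking_message": "Output blocked by Aim policy"}
#   -> raised as StreamingCallbackError and surfaced as a stream error by the proxy
```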

litellm/proxy/proxy_server.py

+20 −10

@@ -23,6 +23,11 @@
     get_origin,
     get_type_hints,
 )
+from litellm.types.utils import (
+    ModelResponse,
+    ModelResponseStream,
+    TextCompletionResponse,
+)

 if TYPE_CHECKING:
     from opentelemetry.trace import Span as _Span
@@ -1377,6 +1382,10 @@ async def _run_background_health_check():
         await asyncio.sleep(health_check_interval)


+class StreamingCallbackError(Exception):
+    pass
+
+
 class ProxyConfig:
     """
     Abstraction class on top of config loading/updating logic. Gives us one place to control all config updating logic.
@@ -3038,8 +3047,7 @@ async def async_data_generator(
 ):
     verbose_proxy_logger.debug("inside generator")
     try:
-        time.time()
-        async for chunk in response:
+        async for chunk in proxy_logging_obj.async_post_call_streaming_iterator_hook(user_api_key_dict=user_api_key_dict, response=response, request_data=request_data):
             verbose_proxy_logger.debug(
                 "async_data_generator: received streaming chunk - {}".format(chunk)
             )
@@ -3076,6 +3084,8 @@ async def async_data_generator(

         if isinstance(e, HTTPException):
             raise e
+        elif isinstance(e, StreamingCallbackError):
+            error_msg = str(e)
         else:
             error_traceback = traceback.format_exc()
             error_msg = f"{str(e)}\n\n{error_traceback}"
@@ -5421,11 +5431,11 @@ async def token_counter(request: TokenCountRequest):
 )
 async def supported_openai_params(model: str):
     """
-    Returns supported openai params for a given litellm model name 
+    Returns supported openai params for a given litellm model name

-    e.g. `gpt-4` vs `gpt-3.5-turbo` 
+    e.g. `gpt-4` vs `gpt-3.5-turbo`

-    Example curl: 
+    Example curl:
     ```
     curl -X GET --location 'http://localhost:4000/utils/supported_openai_params?model=gpt-3.5-turbo-16k' \
     --header 'Authorization: Bearer sk-1234'
@@ -6194,7 +6204,7 @@ async def model_group_info(
     - /model_group/info returns all model groups. End users of proxy should use /model_group/info since those models will be used for /chat/completions, /embeddings, etc.
     - /model_group/info?model_group=rerank-english-v3.0 returns all model groups for a specific model group (`model_name` in config.yaml)

-    
+

     Example Request (All Models):
     ```shell
@@ -6212,10 +6222,10 @@
     -H 'Authorization: Bearer sk-1234'
     ```

-    Example Request (Specific Wildcard Model Group): (e.g. `model_name: openai/*` on config.yaml) 
+    Example Request (Specific Wildcard Model Group): (e.g. `model_name: openai/*` on config.yaml)
     ```shell
     curl -X 'GET' \
-    'http://localhost:4000/model_group/info?model_group=openai/tts-1' 
+    'http://localhost:4000/model_group/info?model_group=openai/tts-1'
     -H 'accept: application/json' \
     -H 'Authorization: Bearer sk-1234'
     ```
@@ -7242,7 +7252,7 @@ async def invitation_update(
 ):
     """
     Update when invitation is accepted
-    
+
     ```
     curl -X POST 'http://localhost:4000/invitation/update' \
     -H 'Content-Type: application/json' \
@@ -7303,7 +7313,7 @@ async def invitation_delete(
 ):
     """
     Delete invitation link
-    
+
     ```
     curl -X POST 'http://localhost:4000/invitation/delete' \
     -H 'Content-Type: application/json' \

litellm/proxy/utils.py

+32 −2

@@ -18,6 +18,7 @@
     ProxyErrorTypes,
     ProxyException,
 )
+from litellm.types.guardrails import GuardrailEventHooks

 try:
     import backoff
@@ -31,7 +32,7 @@
 import litellm
 import litellm.litellm_core_utils
 import litellm.litellm_core_utils.litellm_logging
-from litellm import EmbeddingResponse, ImageResponse, ModelResponse, Router
+from litellm import EmbeddingResponse, ImageResponse, ModelResponse, Router, ModelResponseStream
 from litellm._logging import verbose_proxy_logger
 from litellm._service_logger import ServiceLogging, ServiceTypes
 from litellm.caching.caching import DualCache, RedisCache
@@ -972,7 +973,7 @@ async def async_post_call_streaming_hook(
         1. /chat/completions
         """
         response_str: Optional[str] = None
-        if isinstance(response, ModelResponse):
+        if isinstance(response, (ModelResponse, ModelResponseStream)):
             response_str = litellm.get_response_string(response_obj=response)
         if response_str is not None:
             for callback in litellm.callbacks:
@@ -992,6 +993,35 @@ async def async_post_call_streaming_hook(
                 raise e
         return response

+    def async_post_call_streaming_iterator_hook(
+        self,
+        response,
+        user_api_key_dict: UserAPIKeyAuth,
+        request_data: dict,
+    ):
+        """
+        Allow user to modify outgoing streaming data -> Given a whole response iterator.
+        This hook is best used when you need to modify multiple chunks of the response at once.
+
+        Covers:
+        1. /chat/completions
+        """
+        for callback in litellm.callbacks:
+            _callback: Optional[CustomLogger] = None
+            if isinstance(callback, str):
+                _callback = litellm.litellm_core_utils.litellm_logging.get_custom_logger_compatible_class(callback)
+            else:
+                _callback = callback  # type: ignore
+            if _callback is not None and isinstance(_callback, CustomLogger):
+                if not isinstance(_callback, CustomGuardrail) or _callback.should_run_guardrail(
+                    data=request_data, event_type=GuardrailEventHooks.post_call
+                ):
+                    response = _callback.async_post_call_streaming_iterator_hook(
+                        user_api_key_dict=user_api_key_dict, response=response, request_data=request_data
+                    )
+        return response
+
+
     async def post_call_streaming_hook(
         self,
         response: str,
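Note that this hook composes rather than consumes: for each registered callback it wraps the response iterator in that callback's `async_post_call_streaming_iterator_hook`, and the chunks are only pulled later by `async_data_generator`, flowing through every layer. A toy sketch of the same wrapping pattern with plain async generators — the stream contents and hook names are illustrative, not LiteLLM APIs:

```python
import asyncio
from typing import AsyncGenerator


async def raw_stream() -> AsyncGenerator[str, None]:
    # Stand-in for the LLM's chunk iterator.
    for piece in ["hel", "lo ", "world"]:
        yield piece


async def passthrough_hook(response) -> AsyncGenerator[str, None]:
    # Mirrors the CustomLogger default: re-yield every chunk unchanged.
    async for chunk in response:
        yield chunk


async def shouting_hook(response) -> AsyncGenerator[str, None]:
    # A second callback layered on top of the first.
    async for chunk in response:
        yield chunk.upper()


async def main() -> None:
    # Wrapping happens up front without awaiting, as in the loop above;
    # chunks flow raw_stream -> passthrough_hook -> shouting_hook -> caller.
    wrapped = shouting_hook(passthrough_hook(raw_stream()))
    async for chunk in wrapped:
        print(chunk, end="")  # prints "HELLO WORLD"


asyncio.run(main())
```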

litellm/realtime_api/main.py

+2 −0

@@ -151,6 +151,8 @@ async def _realtime_health_check(
         url = openai_realtime._construct_url(
             api_base=api_base or "https://api.openai.com/", model=model
         )
+    else:
+        raise ValueError(f"Unsupported model: {model}")
     async with websockets.connect(  # type: ignore
         url,
         extra_headers={
