From 685992084837699482cc0109efcefc278cdaaaa1 Mon Sep 17 00:00:00 2001 From: Meshcheryakov Ilya Date: Tue, 16 Apr 2024 00:54:35 +0300 Subject: [PATCH 01/22] initial commit --- clearml_serving/serving/entrypoint.sh | 1 + clearml_serving/serving/init.py | 2 ++ clearml_serving/serving/main.py | 26 ++++++++++++++++++- .../serving/model_request_processor.py | 7 +++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index e1a5bbc..a5efea1 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -2,6 +2,7 @@ # print configuration echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID" +echo CLEARML_INFERENCE_TASK_ID="$CLEARML_INFERENCE_TASK_ID" echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT" echo CLEARML_USE_GUNICORN="$CLEARML_USE_GUNICORN" echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" diff --git a/clearml_serving/serving/init.py b/clearml_serving/serving/init.py index 2ae54a8..0c75712 100644 --- a/clearml_serving/serving/init.py +++ b/clearml_serving/serving/init.py @@ -6,6 +6,7 @@ def setup_task(force_threaded_logging=None): serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None) + inference_service_task_id = os.environ.get("CLEARML_INFERENCE_TASK_ID", False) # according Task.init() docs # always use background thread, it requires less memory if force_threaded_logging or os.environ.get("CLEARML_BKG_THREAD_REPORT") in ("1", "Y", "y", "true"): @@ -24,6 +25,7 @@ def setup_task(force_threaded_logging=None): project_name=serving_task.get_project_name(), task_name="{} - serve instance".format(serving_task.name), task_type="inference", # noqa + continue_last_task=inference_service_task_id, ) instance_task.set_system_tags(["service"]) # make sure we start logging thread/process diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 6865c93..98c9f21 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,9 +1,13 @@ import os import traceback import gzip +import asyncio from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute +from fastapi.responses import PlainTextResponse + +from starlette.background import BackgroundTask from typing import Optional, Dict, Any, Callable, Union @@ -48,6 +52,9 @@ async def custom_route_handler(request: Request) -> Response: except (ValueError, TypeError): pass +class CUDAException(Exception): + def __init__(self, exception: str): + self.exception = exception # start FastAPI app app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") @@ -70,6 +77,20 @@ async def startup_event(): processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) +@app.on_event('shutdown') +def shutdown_event(): + print('RESTARTING INFERENCE SERVICE!') + +async def exit_app(): + loop = asyncio.get_running_loop() + loop.stop() + +@app.exception_handler(CUDAException) +async def cuda_exception_handler(request, exc): + task = BackgroundTask(exit_app) + return PlainTextResponse("CUDA out of memory. 
Restarting service", status_code=500, background=task) + + router = APIRouter( prefix="/serve", tags=["models"], @@ -102,7 +123,10 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni except ValueError as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + if "CUDA out of memory. " in str(ex): + raise CUDAException(exception=ex) + else: + raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index ba9242d..35f5120 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1,5 +1,7 @@ import json import os +import gc +import torch from collections import deque from pathlib import Path from random import random @@ -915,7 +917,12 @@ def _sync_daemon(self, poll_frequency_sec: float = 300) -> None: for k in list(self._engine_processor_lookup.keys()): if k not in self._endpoints: # atomic + self._engine_processor_lookup[k]._model = None + self._engine_processor_lookup[k]._preprocess = None + del self._engine_processor_lookup[k] self._engine_processor_lookup.pop(k, None) + gc.collect() + torch.cuda.empty_cache() cleanup = False model_monitor_update = False except Exception as ex: From 64daef23ba9ffaf111dd223126d4db28d1c942ab Mon Sep 17 00:00:00 2001 From: Meshcheryakov Ilya Date: Wed, 29 May 2024 21:18:39 +0300 Subject: [PATCH 02/22] initial commit --- clearml_serving/serving/Dockerfile | 2 +- clearml_serving/serving/entrypoint.sh | 2 +- clearml_serving/serving/main.py | 84 +++++++++++++++++-- .../serving/model_request_processor.py | 19 +++-- clearml_serving/serving/preprocess_service.py | 15 +++- clearml_serving/statistics/entrypoint.sh | 2 +- 6 files changed, 104 insertions(+), 20 deletions(-) diff --git a/clearml_serving/serving/Dockerfile b/clearml_serving/serving/Dockerfile index bd817ea..a2d6a47 100644 --- a/clearml_serving/serving/Dockerfile +++ b/clearml_serving/serving/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.11-bullseye ENV LC_ALL=C.UTF-8 # install base package -RUN pip3 install --no-cache-dir clearml-serving +# RUN pip3 install --no-cache-dir clearml-serving # get latest execution code from the git repository # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index a5efea1..0e51dda 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -18,7 +18,7 @@ UVICORN_SERVE_LOOP="${UVICORN_SERVE_LOOP:-uvloop}" UVICORN_LOG_LEVEL="${UVICORN_LOG_LEVEL:-warning}" # set default internal serve endpoint (for request pipelining) -CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/serve}" +CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/clearml}" CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}" # print configuration diff --git a/clearml_serving/serving/main.py 
b/clearml_serving/serving/main.py index 98c9f21..c3c5f7c 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -92,7 +92,7 @@ async def cuda_exception_handler(request, exc): router = APIRouter( - prefix="/serve", + prefix="/clearml", tags=["models"], responses={404: {"description": "Model Serving Endpoint Not found"}}, route_class=GzipRoute, # mark-out to remove support for GZip content encoding @@ -100,15 +100,49 @@ async def cuda_exception_handler(request, exc): # cover all routing options for model version `/{model_id}`, `/{model_id}/123`, `/{model_id}?version=123` -@router.post("/{model_id}/{version}") -@router.post("/{model_id}/") -@router.post("/{model_id}") -async def serve_model(model_id: str, version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None): +# @router.post("/{model_id}/{version}") +# @router.post("/{model_id}/") +# @router.post("/{model_id}") +# async def serve_model(model_id: str, version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None): +# try: +# return_value = await processor.process_request( +# base_url=model_id, +# version=version, +# request_body=request +# ) +# except EndpointNotFoundException as ex: +# raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) +# except (EndpointModelLoadException, EndpointBackendEngineException) as ex: +# session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( +# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) +# raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) +# except ServingInitializationException as ex: +# session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( +# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) +# raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) +# except ValueError as ex: +# session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( +# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) +# if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): +# raise CUDAException(exception=ex) +# else: +# raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) +# except Exception as ex: +# session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( +# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) +# raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) +# return return_value + + +@router.post("/{model_id}/v1/chat/completions") +@router.post("/{model_id}/v1/chat/completions/") +async def serve_model(model_id: str, request: Union[bytes, Dict[Any, Any]] = None): try: return_value = await processor.process_request( base_url=model_id, - version=version, - request_body=request + version=None, + request_body=request, + url_type="chat_completion" ) except EndpointNotFoundException as ex: raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) @@ -123,7 +157,41 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni except ValueError as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - if "CUDA out of memory. " in str(ex): + if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): + raise CUDAException(exception=ex) + else: + raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + except Exception as ex: + session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) + return return_value + + +@router.post("/{model_id}/v1/completions") +@router.post("/{model_id}/v1/completions/") +async def serve_model(model_id: str, request: Union[bytes, Dict[Any, Any]] = None): + try: + return_value = await processor.process_request( + base_url=model_id, + version=None, + request_body=request, + url_type="completion" + ) + except EndpointNotFoundException as ex: + raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) + except (EndpointModelLoadException, EndpointBackendEngineException) as ex: + session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + except ServingInitializationException as ex: + session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) + except ValueError as ex: + session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): raise CUDAException(exception=ex) else: raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 35f5120..93198f8 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -155,7 +155,7 @@ def __init__( self._serving_base_url = None self._metric_log_freq = None - async def process_request(self, base_url: str, version: str, request_body: dict) -> dict: + async def process_request(self, base_url: str, version: str, request_body: dict, url_type: str) -> dict: """ Process request coming in, Raise Value error if url does not match existing endpoints @@ -188,7 +188,7 @@ async def process_request(self, base_url: str, version: str, request_body: dict) processor = processor_cls(model_endpoint=ep, task=self._task) self._engine_processor_lookup[url] = processor - return_value = await self._process_request(processor=processor, url=url, body=request_body) + return_value = await self._process_request(processor=processor, url=url, body=request_body, url_type=url_type) finally: self._request_processing_state.dec() @@ -1188,7 +1188,7 @@ def _deserialize_conf_dict(self, configuration: dict) -> None: # update preprocessing classes BasePreprocessRequest.set_server_config(self._configuration) - async def _process_request(self, processor: BasePreprocessRequest, url: str, body: dict) -> dict: + async def _process_request(self, processor: BasePreprocessRequest, url: str, body: dict, url_type: str) -> dict: # collect statistics for this request stats_collect_fn = None collect_stats = False @@ -1211,9 +1211,16 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod if processor.is_preprocess_async \ else processor.preprocess(body, state, stats_collect_fn) # noinspection PyUnresolvedReferences - processed = await processor.process(preprocessed, state, stats_collect_fn) \ - if processor.is_process_async \ - else processor.process(preprocessed, state, stats_collect_fn) + if url_type == "completion": + processed = await processor.completion(preprocessed, state, stats_collect_fn) \ + if processor.is_process_async \ + else processor.completion(preprocessed, state, stats_collect_fn) + elif url_type == "chat_completion": + processed = await processor.chat_completion(preprocessed, state, stats_collect_fn) \ + if processor.is_process_async \ + else processor.chat_completion(preprocessed, state, stats_collect_fn) + else: + raise ValueError(f"wrong url_type: expected 'completion' and 'chat_completion', got {url_type}") # noinspection PyUnresolvedReferences return_value = await processor.postprocess(processed, state, stats_collect_fn) \ if processor.is_postprocess_async \ diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index a5c069c..0618a0a 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -576,13 +576,22 @@ async def postprocess( return await self._preprocess.postprocess(data, state, collect_custom_statistics_fn) return data - async def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + async def completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual 
processing function. We run the process in this context """ - if self._preprocess is not None and hasattr(self._preprocess, 'process'): - return await self._preprocess.process(data, state, collect_custom_statistics_fn) + if self._preprocess is not None and hasattr(self._preprocess, 'completion'): + return await self._preprocess.completion(data, state, collect_custom_statistics_fn) + return None + + async def chat_completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + """ + The actual processing function. + We run the process in this context + """ + if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'): + return await self._preprocess.chat_completion(data, state, collect_custom_statistics_fn) return None @staticmethod diff --git a/clearml_serving/statistics/entrypoint.sh b/clearml_serving/statistics/entrypoint.sh index 1af8bef..f2e7e72 100755 --- a/clearml_serving/statistics/entrypoint.sh +++ b/clearml_serving/statistics/entrypoint.sh @@ -10,7 +10,7 @@ echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL" SERVING_PORT="${CLEARML_SERVING_PORT:-9999}" # set default internal serve endpoint (for request pipelining) -CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/serve}" +CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/clearml}" CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}" # print configuration From b8f5d81636613992c7a0119f6ec384482ab99e79 Mon Sep 17 00:00:00 2001 From: Meshcheryakov Ilya Date: Thu, 30 May 2024 00:30:30 +0300 Subject: [PATCH 03/22] initial commit --- clearml_serving/serving/main.py | 80 ++----------------- .../serving/model_request_processor.py | 17 ++-- clearml_serving/serving/preprocess_service.py | 74 ++++------------- 3 files changed, 28 insertions(+), 143 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index c3c5f7c..8ac27d4 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -99,84 +99,14 @@ async def cuda_exception_handler(request, exc): ) -# cover all routing options for model version `/{model_id}`, `/{model_id}/123`, `/{model_id}?version=123` -# @router.post("/{model_id}/{version}") -# @router.post("/{model_id}/") -# @router.post("/{model_id}") -# async def serve_model(model_id: str, version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None): -# try: -# return_value = await processor.process_request( -# base_url=model_id, -# version=version, -# request_body=request -# ) -# except EndpointNotFoundException as ex: -# raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) -# except (EndpointModelLoadException, EndpointBackendEngineException) as ex: -# session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( -# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) -# raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) -# except ServingInitializationException as ex: -# session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( -# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) -# raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) -# except ValueError as ex: -# 
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( -# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) -# if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): -# raise CUDAException(exception=ex) -# else: -# raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) -# except Exception as ex: -# session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( -# instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) -# raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) -# return return_value - - -@router.post("/{model_id}/v1/chat/completions") -@router.post("/{model_id}/v1/chat/completions/") -async def serve_model(model_id: str, request: Union[bytes, Dict[Any, Any]] = None): +@router.post("/v1/{endpoint_type}") +@router.post("/v1/{endpoint_type}/") +async def llm_serve_model(endpoint_type: str, request: Union[bytes, Dict[Any, Any]] = None): try: return_value = await processor.process_request( - base_url=model_id, - version=None, + base_url=request["model"], request_body=request, - url_type="chat_completion" - ) - except EndpointNotFoundException as ex: - raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) - except (EndpointModelLoadException, EndpointBackendEngineException) as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) - except ServingInitializationException as ex: - session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) - except ValueError as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): - raise CUDAException(exception=ex) - else: - raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) - except Exception as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) - return return_value - - -@router.post("/{model_id}/v1/completions") -@router.post("/{model_id}/v1/completions/") -async def serve_model(model_id: str, request: Union[bytes, Dict[Any, Any]] = None): - try: - return_value = await processor.process_request( - base_url=model_id, - version=None, - request_body=request, - url_type="completion" + url_type=endpoint_type ) except EndpointNotFoundException as ex: raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 93198f8..04c79e6 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -155,7 +155,7 @@ def __init__( self._serving_base_url = None self._metric_log_freq = None - async def process_request(self, base_url: str, version: str, request_body: dict, url_type: str) -> dict: + async def process_request(self, base_url: str, request_body: dict, url_type: str) -> dict: """ Process request coming in, Raise Value error if url does not match existing endpoints @@ -167,11 +167,11 @@ async def process_request(self, base_url: str, version: str, request_body: dict, while self._update_lock_flag: await asyncio.sleep(0.5+random()) # retry to process - return await self.process_request(base_url=base_url, version=version, request_body=request_body) + return await self.process_request(base_url=base_url, request_body=request_body, url_type=url_type) try: # normalize url and version - url = self._normalize_endpoint_url(base_url, version) + url = self._normalize_endpoint_url(base_url) # check canary canary_url = self._process_canary(base_url=url) @@ -1211,16 +1211,16 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod if processor.is_preprocess_async \ else processor.preprocess(body, state, stats_collect_fn) # noinspection PyUnresolvedReferences - if url_type == "completion": + if url_type == "completions": processed = await processor.completion(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ else processor.completion(preprocessed, state, stats_collect_fn) - elif url_type == "chat_completion": + elif url_type == "chat/completions": processed = await processor.chat_completion(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ else processor.chat_completion(preprocessed, state, stats_collect_fn) else: - raise ValueError(f"wrong url_type: expected 'completion' and 'chat_completion', got {url_type}") + raise ValueError(f"wrong url_type: expected 'completions' and 'chat/completions', got {url_type}") # noinspection PyUnresolvedReferences return_value = await processor.postprocess(processed, state, stats_collect_fn) \ if processor.is_postprocess_async \ @@ -1341,8 +1341,9 @@ def _create_task( return task @classmethod - def _normalize_endpoint_url(cls, endpoint: str, version: Optional[str] = None) -> str: - return 
"{}/{}".format(endpoint.rstrip("/"), version or "").rstrip("/") + def _normalize_endpoint_url(cls, endpoint: str) -> str: + # return "{}/{}".format(endpoint.rstrip("/"), version or "").rstrip("/") + return endpoint @classmethod def _validate_model(cls, endpoint: Union[ModelEndpoint, ModelMonitoring]) -> bool: diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 0618a0a..e065144 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -19,7 +19,7 @@ class BasePreprocessRequest(object): __preprocessing_lookup = {} __preprocessing_modules = set() _grpc_env_conf_prefix = "CLEARML_GRPC_" - _default_serving_base_url = "http://127.0.0.1:8080/serve/" + _default_serving_base_url = "http://127.0.0.1:8080/clearml/" _server_config = {} # externally configured by the serving inference service _timeout = None # timeout in seconds for the entire request, set in __init__ is_preprocess_async = False @@ -292,7 +292,7 @@ def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): self._grpc_stub = {} - async def process( + async def chat_completion( self, data: Any, state: dict, @@ -428,74 +428,28 @@ async def process( return output_results[0] if index == 1 else output_results -@BasePreprocessRequest.register_engine("sklearn", modules=["joblib", "sklearn"]) -class SKLearnPreprocessRequest(BasePreprocessRequest): - def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): - super(SKLearnPreprocessRequest, self).__init__( - model_endpoint=model_endpoint, task=task) - if self._model is None: - # get model - import joblib # noqa - self._model = joblib.load(filename=self._get_local_model_file()) - - def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: - """ - The actual processing function. - We run the model in this context - """ - return self._model.predict(data) - - -@BasePreprocessRequest.register_engine("xgboost", modules=["xgboost"]) -class XGBoostPreprocessRequest(BasePreprocessRequest): - def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): - super(XGBoostPreprocessRequest, self).__init__( - model_endpoint=model_endpoint, task=task) - if self._model is None: - # get model - import xgboost # noqa - self._model = xgboost.Booster() - self._model.load_model(self._get_local_model_file()) - - def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: - """ - The actual processing function. - We run the model in this context - """ - return self._model.predict(data) - - -@BasePreprocessRequest.register_engine("lightgbm", modules=["lightgbm"]) -class LightGBMPreprocessRequest(BasePreprocessRequest): - def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): - super(LightGBMPreprocessRequest, self).__init__( - model_endpoint=model_endpoint, task=task) - if self._model is None: - # get model - import lightgbm # noqa - self._model = lightgbm.Booster(model_file=self._get_local_model_file()) - - def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: - """ - The actual processing function. 
- We run the model in this context - """ - return self._model.predict(data) - - @BasePreprocessRequest.register_engine("custom") class CustomPreprocessRequest(BasePreprocessRequest): def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): super(CustomPreprocessRequest, self).__init__( model_endpoint=model_endpoint, task=task) - def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + def completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual processing function. We run the process in this context """ - if self._preprocess is not None and hasattr(self._preprocess, 'process'): - return self._preprocess.process(data, state, collect_custom_statistics_fn) + if self._preprocess is not None and hasattr(self._preprocess, 'completion'): + return self._preprocess.completion(data, state, collect_custom_statistics_fn) + return None + + def chat_completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + """ + The actual processing function. + We run the process in this context + """ + if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'): + return self._preprocess.chat_completion(data, state, collect_custom_statistics_fn) return None From 4796d77ad791a02729dd1dc442ede567fffb2f8d Mon Sep 17 00:00:00 2001 From: Meshcheryakov Ilya Date: Thu, 30 May 2024 15:52:06 +0300 Subject: [PATCH 04/22] fix shash processing --- clearml_serving/serving/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 8ac27d4..683c89c 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -99,8 +99,8 @@ async def cuda_exception_handler(request, exc): ) -@router.post("/v1/{endpoint_type}") -@router.post("/v1/{endpoint_type}/") +@router.post("/v1/{endpoint_type:path}") +@router.post("/v1/{endpoint_type:path}/") async def llm_serve_model(endpoint_type: str, request: Union[bytes, Dict[Any, Any]] = None): try: return_value = await processor.process_request( From 5b73bdf085b89bb048806ba6a2a529902b94e041 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Thu, 27 Feb 2025 22:56:39 +0300 Subject: [PATCH 05/22] fix suffix and add router --- clearml_serving/serving/entrypoint.sh | 4 +- clearml_serving/serving/main.py | 78 ++++++++++++------- .../serving/model_request_processor.py | 41 ++++++---- clearml_serving/statistics/entrypoint.sh | 3 +- docker/docker-compose-triton-gpu.yml | 1 + docker/docker-compose-triton.yml | 1 + docker/docker-compose.yml | 1 + 7 files changed, 84 insertions(+), 45 deletions(-) diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index 0e51dda..06cb4be 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -18,7 +18,8 @@ UVICORN_SERVE_LOOP="${UVICORN_SERVE_LOOP:-uvloop}" UVICORN_LOG_LEVEL="${UVICORN_LOG_LEVEL:-warning}" # set default internal serve endpoint (for request pipelining) -CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/clearml}" +CLEARML_DEFAULT_SERVE_SUFFIX="${CLEARML_DEFAULT_SERVE_SUFFIX:-serve}" +CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/$CLEARML_DEFAULT_SERVE_SUFFIX}" CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}" # print 
configuration @@ -31,6 +32,7 @@ echo GUNICORN_EXTRA_ARGS="$GUNICORN_EXTRA_ARGS" echo UVICORN_SERVE_LOOP="$UVICORN_SERVE_LOOP" echo UVICORN_EXTRA_ARGS="$UVICORN_EXTRA_ARGS" echo UVICORN_LOG_LEVEL="$UVICORN_LOG_LEVEL" +echo CLEARML_DEFAULT_SERVE_SUFFIX="$CLEARML_DEFAULT_SERVE_SUFFIX" echo CLEARML_DEFAULT_BASE_SERVE_URL="$CLEARML_DEFAULT_BASE_SERVE_URL" echo CLEARML_DEFAULT_TRITON_GRPC_ADDR="$CLEARML_DEFAULT_TRITON_GRPC_ADDR" diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index c1cc698..1950265 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -60,9 +60,6 @@ async def custom_route_handler(request: Request) -> Response: except (ValueError, TypeError): pass -class CUDAException(Exception): - def __init__(self, exception: str): - self.exception = exception grpc_aio_ignore_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", ""))) grpc_aio_verbose_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS", ""))) @@ -113,36 +110,18 @@ async def cuda_exception_handler(request, exc): return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) -@app.on_event('shutdown') -def shutdown_event(): - print('RESTARTING INFERENCE SERVICE!') - -async def exit_app(): - loop = asyncio.get_running_loop() - loop.stop() - -@app.exception_handler(CUDAException) -async def cuda_exception_handler(request, exc): - task = BackgroundTask(exit_app) - return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) - - -router = APIRouter( - prefix="/clearml", - tags=["models"], - responses={404: {"description": "Model Serving Endpoint Not found"}}, - route_class=GzipRoute, # mark-out to remove support for GZip content encoding -) - - -@router.post("/v1/{endpoint_type:path}") -@router.post("/v1/{endpoint_type:path}/") -async def llm_serve_model(endpoint_type: str, request: Union[bytes, Dict[Any, Any]] = None): +def process_with_exceptions( + base_url: str, + version: Optional[str], + request: Union[bytes, Dict[Any, Any]], + serve_type: str +): try: return_value = await processor.process_request( - base_url=request["model"], + base_url=base_url, + version=version, request_body=request, - url_type=endpoint_type + serve_type=serve_type ) except EndpointNotFoundException as ex: raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) @@ -190,4 +169,43 @@ async def llm_serve_model(endpoint_type: str, request: Union[bytes, Dict[Any, An return return_value +router = APIRouter( + prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", + tags=["models"], + responses={404: {"description": "Model Serving Endpoint Not found"}}, + route_class=GzipRoute, # mark-out to remove support for GZip content encoding +) + + +@router.post("/{model_id}/{version}") +@router.post("/{model_id}/") +@router.post("/{model_id}") +async def base_serve_model( + model_id: str, + version: Optional[str] = None, + request: Union[bytes, Dict[Any, Any]] = None +): + return_value = process_with_exceptions( + base_url=model_id, + version=version, + request_body=request, + serve_type="process" + ) + return return_value + + +@router.post("/openai/v1/{endpoint_type:path}") +@router.post("/openai/v1/{endpoint_type:path}/") +async def openai_serve_model( + endpoint_type: str, + request: Union[bytes, Dict[Any, Any]] = None +): + return_value = process_with_exceptions( + 
base_url=request.get("model", None), + version=None, + request_body=request, + serve_type=endpoint_type + ) + return return_value + app.include_router(router) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index bc6ee12..44eaf9e 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -159,7 +159,7 @@ def __init__( self._serving_base_url = None self._metric_log_freq = None - async def process_request(self, base_url: str, request_body: dict, url_type: str) -> dict: + async def process_request(self, base_url: str, version: str, request_body: dict, serve_type: str) -> dict: """ Process request coming in, Raise Value error if url does not match existing endpoints @@ -171,11 +171,16 @@ async def process_request(self, base_url: str, request_body: dict, url_type: str while self._update_lock_flag: await asyncio.sleep(0.5+random()) # retry to process - return await self.process_request(base_url=base_url, request_body=request_body, url_type=url_type) + return await self.process_request( + base_url=base_url, + version=version, + request_body=request_body, + serve_type=serve_type + ) try: # normalize url and version - url = self._normalize_endpoint_url(base_url) + url = self._normalize_endpoint_url(base_url, version) # check canary canary_url = self._process_canary(base_url=url) @@ -192,7 +197,12 @@ async def process_request(self, base_url: str, request_body: dict, url_type: str processor = processor_cls(model_endpoint=ep, task=self._task) self._engine_processor_lookup[url] = processor - return_value = await self._process_request(processor=processor, url=url, body=request_body, url_type=url_type) + return_value = await self._process_request( + processor=processor, + url=url, + body=request_body, + serve_type=serve_type + ) finally: self._request_processing_state.dec() @@ -1193,7 +1203,7 @@ def _deserialize_conf_dict(self, configuration: dict) -> None: # update preprocessing classes BasePreprocessRequest.set_server_config(self._configuration) - async def _process_request(self, processor: BasePreprocessRequest, url: str, body: dict, url_type: str) -> dict: + async def _process_request(self, processor: BasePreprocessRequest, url: str, body: dict, serve_type: str) -> dict: # collect statistics for this request stats_collect_fn = None collect_stats = False @@ -1215,13 +1225,19 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod preprocessed = await processor.preprocess(body, state, stats_collect_fn) \ if processor.is_preprocess_async \ else processor.preprocess(body, state, stats_collect_fn) - # noinspection PyUnresolvedReferences - if url_type == "completions": - processed = await processor.completion(preprocessed, state, stats_collect_fn) \ + if serve_type == "process": + # noinspection PyUnresolvedReferences + processed = await processor.process(preprocessed, state, stats_collect_fn) \ + if processor.is_process_async \ + else processor.process(preprocessed, state, stats_collect_fn) + elif serve_type == "completions": + # noinspection PyUnresolvedReferences + processed = await processor.completions(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ else processor.completion(preprocessed, state, stats_collect_fn) - elif url_type == "chat/completions": - processed = await processor.chat_completion(preprocessed, state, stats_collect_fn) \ + elif serve_type == "chat/completions": + # noinspection PyUnresolvedReferences + 
processed = await processor.chat_completions(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ else processor.chat_completion(preprocessed, state, stats_collect_fn) else: @@ -1346,9 +1362,8 @@ def _create_task( return task @classmethod - def _normalize_endpoint_url(cls, endpoint: str) -> str: - # return "{}/{}".format(endpoint.rstrip("/"), version or "").rstrip("/") - return endpoint + def _normalize_endpoint_url(cls, endpoint: str, version: Optional[str] = None) -> str: + return "{}/{}".format(endpoint.rstrip("/"), version or "").rstrip("/") @classmethod def _validate_model(cls, endpoint: Union[ModelEndpoint, ModelMonitoring]) -> bool: diff --git a/clearml_serving/statistics/entrypoint.sh b/clearml_serving/statistics/entrypoint.sh index f2e7e72..5b4de93 100755 --- a/clearml_serving/statistics/entrypoint.sh +++ b/clearml_serving/statistics/entrypoint.sh @@ -10,7 +10,8 @@ echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL" SERVING_PORT="${CLEARML_SERVING_PORT:-9999}" # set default internal serve endpoint (for request pipelining) -CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/clearml}" +CLEARML_DEFAULT_SERVE_SUFFIX="${CLEARML_DEFAULT_SERVE_SUFFIX:-serve}" +CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/$CLEARML_DEFAULT_SERVE_SUFFIX}" CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}" # print configuration diff --git a/docker/docker-compose-triton-gpu.yml b/docker/docker-compose-triton-gpu.yml index 8e54073..a2ec754 100644 --- a/docker/docker-compose-triton-gpu.yml +++ b/docker/docker-compose-triton-gpu.yml @@ -92,6 +92,7 @@ services: CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} + CLEARML_DEFAULT_SERVE_SUFFIX: ${CLEARML_DEFAULT_SERVE_SUFFIX:-serve} CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} diff --git a/docker/docker-compose-triton.yml b/docker/docker-compose-triton.yml index b815583..edf92a4 100644 --- a/docker/docker-compose-triton.yml +++ b/docker/docker-compose-triton.yml @@ -92,6 +92,7 @@ services: CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} + CLEARML_DEFAULT_SERVE_SUFFIX: ${CLEARML_DEFAULT_SERVE_SUFFIX:-serve} CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 24e3b95..e73e184 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -92,6 +92,7 @@ services: CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} + CLEARML_DEFAULT_SERVE_SUFFIX: ${CLEARML_DEFAULT_SERVE_SUFFIX:-serve} CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} 
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-} From f51bf2e081bfa412fe61a1f4863d438d9e1c5469 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Thu, 27 Feb 2025 23:13:47 +0300 Subject: [PATCH 06/22] revert some old changes --- .../serving/model_request_processor.py | 2 +- clearml_serving/serving/preprocess_service.py | 351 ++++++++++++++++-- 2 files changed, 330 insertions(+), 23 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 44eaf9e..0f6bfa8 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1241,7 +1241,7 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod if processor.is_process_async \ else processor.chat_completion(preprocessed, state, stats_collect_fn) else: - raise ValueError(f"wrong url_type: expected 'completions' and 'chat/completions', got {url_type}") + raise ValueError(f"wrong url_type: expected 'completions' and 'chat/completions', got {serve_type}") # noinspection PyUnresolvedReferences return_value = await processor.postprocess(processed, state, stats_collect_fn) \ if processor.is_postprocess_async \ diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index e065144..d29f5f8 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -19,7 +19,7 @@ class BasePreprocessRequest(object): __preprocessing_lookup = {} __preprocessing_modules = set() _grpc_env_conf_prefix = "CLEARML_GRPC_" - _default_serving_base_url = "http://127.0.0.1:8080/clearml/" + _default_serving_base_url = "http://127.0.0.1:8080/serve/" _server_config = {} # externally configured by the serving inference service _timeout = None # timeout in seconds for the entire request, set in __init__ is_preprocess_async = False @@ -292,7 +292,7 @@ def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): self._grpc_stub = {} - async def chat_completion( + async def process( self, data: Any, state: dict, @@ -428,28 +428,74 @@ async def chat_completion( return output_results[0] if index == 1 else output_results -@BasePreprocessRequest.register_engine("custom") -class CustomPreprocessRequest(BasePreprocessRequest): +@BasePreprocessRequest.register_engine("sklearn", modules=["joblib", "sklearn"]) +class SKLearnPreprocessRequest(BasePreprocessRequest): def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): - super(CustomPreprocessRequest, self).__init__( + super(SKLearnPreprocessRequest, self).__init__( model_endpoint=model_endpoint, task=task) + if self._model is None: + # get model + import joblib # noqa + self._model = joblib.load(filename=self._get_local_model_file()) - def completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual processing function. 
- We run the process in this context + We run the model in this context """ - if self._preprocess is not None and hasattr(self._preprocess, 'completion'): - return self._preprocess.completion(data, state, collect_custom_statistics_fn) - return None - - def chat_completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + return self._model.predict(data) + + +@BasePreprocessRequest.register_engine("xgboost", modules=["xgboost"]) +class XGBoostPreprocessRequest(BasePreprocessRequest): + def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): + super(XGBoostPreprocessRequest, self).__init__( + model_endpoint=model_endpoint, task=task) + if self._model is None: + # get model + import xgboost # noqa + self._model = xgboost.Booster() + self._model.load_model(self._get_local_model_file()) + + def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + """ + The actual processing function. + We run the model in this context + """ + return self._model.predict(data) + + +@BasePreprocessRequest.register_engine("lightgbm", modules=["lightgbm"]) +class LightGBMPreprocessRequest(BasePreprocessRequest): + def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): + super(LightGBMPreprocessRequest, self).__init__( + model_endpoint=model_endpoint, task=task) + if self._model is None: + # get model + import lightgbm # noqa + self._model = lightgbm.Booster(model_file=self._get_local_model_file()) + + def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + """ + The actual processing function. + We run the model in this context + """ + return self._model.predict(data) + + +@BasePreprocessRequest.register_engine("custom") +class CustomPreprocessRequest(BasePreprocessRequest): + def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): + super(CustomPreprocessRequest, self).__init__( + model_endpoint=model_endpoint, task=task) + + def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual processing function. We run the process in this context """ - if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'): - return self._preprocess.chat_completion(data, state, collect_custom_statistics_fn) + if self._preprocess is not None and hasattr(self._preprocess, 'process'): + return self._preprocess.process(data, state, collect_custom_statistics_fn) return None @@ -530,23 +576,284 @@ async def postprocess( return await self._preprocess.postprocess(data, state, collect_custom_statistics_fn) return data - async def completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + async def process(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual processing function. 
We run the process in this context """ - if self._preprocess is not None and hasattr(self._preprocess, 'completion'): - return await self._preprocess.completion(data, state, collect_custom_statistics_fn) + if self._preprocess is not None and hasattr(self._preprocess, 'process'): + return await self._preprocess.process(data, state, collect_custom_statistics_fn) return None - - async def chat_completion(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + + @staticmethod + async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]: + endpoint = "{}/{}".format(endpoint.strip("/"), version.strip("/")) if version else endpoint.strip("/") + base_url = BasePreprocessRequest.get_server_config().get("base_serving_url") + base_url = (base_url or BasePreprocessRequest._default_serving_base_url).strip("/") + url = "{}/{}".format(base_url, endpoint.strip("/")) + return_value = await CustomAsyncPreprocessRequest.asyncio_to_thread( + request_post, url, json=data, timeout=BasePreprocessRequest._timeout) + if not return_value.ok: + return None + return return_value.json() + + +@BasePreprocessRequest.register_engine("vllm") +class VllmPreprocessRequest(BasePreprocessRequest): + import prometheus_client + + from typing import Any, Union, Optional, Callable + + from fastapi.responses import JSONResponse, StreamingResponse + + from vllm.engine.arg_utils import AsyncEngineArgs + from vllm.engine.async_llm_engine import AsyncLLMEngine + from vllm.entrypoints.logger import RequestLogger + # yapf conflicts with isort for this block + # yapf: disable + from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionRequest, + ErrorResponse + ) + + # yapf: enable + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding + from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization + from vllm.logger import init_logger + from vllm.usage.usage_lib import UsageContext + from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath + + logger = init_logger(__name__) + + REMOVE_WEB_ADDITIONAL_PROMPTS = True + + if VllmPreprocessRequest.asyncio_to_thread is None: + from asyncio import to_thread as asyncio_to_thread + VllmPreprocessRequest.asyncio_to_thread = asyncio_to_thread + + def remove_extra_system_prompts(messages: list) -> list: + """ + Removes all 'system' prompts except the last one. + + :param messages: List of message dicts with 'role' and 'content'. + :return: Modified list of messages with only the last 'system' prompt preserved. 
+        """
+        # Filter only the system messages
+        system_messages_indices = []
+        for i, msg in enumerate(messages):
+            if msg["role"] == "system":
+                system_messages_indices.append(i)
+            else:
+                break
+
+        # If there is more than one system message, remove all of them except the last one
+        if len(system_messages_indices) > 1:
+            last_system_index = system_messages_indices[-1]
+            # Remove all system messages except the last one
+            messages = [msg for i, msg in enumerate(messages) if msg["role"] != "system" or i == last_system_index]
+
+        return messages
+
+    class CustomRequest:
+        def __init__(self, headers: Optional[dict] = None):
+            self.headers = headers
+
+        async def is_disconnected(self):
+            return False
+
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        super(VllmPreprocessRequest, self).__init__(
+            model_endpoint=model_endpoint, task=task)
+
+        def is_port_in_use(port: int) -> bool:
+            import socket
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                return s.connect_ex(('localhost', port)) == 0
+        if not is_port_in_use(8000):
+            prometheus_client.start_http_server(8000)
+
+        vllm_engine_config = {
+            "model":f"{local_file_name}/model",
+            "tokenizer":f"{local_file_name}/tokenizer",
+            "disable_log_requests": True,
+            "disable_log_stats": False,
+            "gpu_memory_utilization": 0.9,
+            "quantization": None,
+            "enforce_eager": True,
+            "served_model_name": "ai_operator_hyp22v4"
+        }
+        vllm_model_config = {
+            "lora_modules": None, # [LoRAModulePath(name=a, path=b)]
+            "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)]
+            "response_role": "assistant",
+            "chat_template": None,
+            "return_tokens_as_token_ids": False,
+            "max_log_len": None
+        }
+
+        self.engine_args = AsyncEngineArgs(**vllm_engine_config)
+        self.async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
+
+
+        model_config = self.async_engine_client.engine.get_model_config()
+
+        request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"])
+
+        self.openai_serving_chat = OpenAIServingChat(
+            self.async_engine_client,
+            model_config,
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            response_role=vllm_model_config["response_role"],
+            lora_modules=vllm_model_config["lora_modules"],
+            prompt_adapters=vllm_model_config["prompt_adapters"],
+            request_logger=request_logger,
+            chat_template=vllm_model_config["chat_template"],
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
+        )
+        self.openai_serving_completion = OpenAIServingCompletion(
+            self.async_engine_client,
+            model_config,
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            lora_modules=vllm_model_config["lora_modules"],
+            prompt_adapters=vllm_model_config["prompt_adapters"],
+            request_logger=request_logger,
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
+        )
+        self.openai_serving_embedding = OpenAIServingEmbedding(
+            self.async_engine_client,
+            model_config,
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            request_logger=request_logger
+        )
+        self.openai_serving_tokenization = OpenAIServingTokenization(
+            self.async_engine_client,
+            model_config,
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            lora_modules=vllm_model_config["lora_modules"],
+            request_logger=request_logger,
+            chat_template=vllm_model_config["chat_template"]
+        )
+        # override `send_request` method with the async version
+        self._preprocess.__class__.send_request = 
VllmPreprocessRequest._preprocess_send_request + + async def preprocess( + self, + request: dict, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None, + ) -> Optional[Any]: + """ + Raise exception to report an error + Return value will be passed to serving engine + + :param request: dictionary as recieved from the RestAPI + :param state: Use state dict to store data passed to the post-processing function call. + Usage example: + >>> def preprocess(..., state): + state['preprocess_aux_data'] = [1,2,3] + >>> def postprocess(..., state): + print(state['preprocess_aux_data']) + :param collect_custom_statistics_fn: Optional, allows to send a custom set of key/values + to the statictics collector servicd + + Usage example: + >>> print(request) + {"x0": 1, "x1": 2} + >>> collect_custom_statistics_fn({"x0": 1, "x1": 2}) + + :return: Object to be passed directly to the model inference + """ + if self._preprocess is not None and hasattr(self._preprocess, 'preprocess'): + return await self._preprocess.preprocess(request, state, collect_custom_statistics_fn) + return request + + async def postprocess( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Optional[dict]: + """ + Raise exception to report an error + Return value will be passed to serving engine + + :param data: object as recieved from the inference model function + :param state: Use state dict to store data passed to the post-processing function call. + Usage example: + >>> def preprocess(..., state): + state['preprocess_aux_data'] = [1,2,3] + >>> def postprocess(..., state): + print(state['preprocess_aux_data']) + :param collect_custom_statistics_fn: Optional, allows to send a custom set of key/values + to the statictics collector servicd + + Usage example: + >>> collect_custom_statistics_fn({"y": 1}) + + :return: Dictionary passed directly as the returned result of the RestAPI + """ + if self._preprocess is not None and hasattr(self._preprocess, 'postprocess'): + return await self._preprocess.postprocess(data, state, collect_custom_statistics_fn) + return data + + + async def completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual processing function. We run the process in this context """ - if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'): - return await self._preprocess.chat_completion(data, state, collect_custom_statistics_fn) - return None + if REMOVE_WEB_ADDITIONAL_PROMPTS: + if "messages" in body: + body["messages"] = remove_extra_system_prompts(body["messages"]) + + raw_request = CustomRequest( + headers = { + "traceparent": None, + "tracestate": None + } + ) + request = CompletionRequest(**body) + logger.info(f"Received chat completion request: {request}") + generator = await self.openai_serving_completion.create_completion( + request=request, + raw_request=raw_request + ) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), status_code=generator.code) + if request.stream: + return StreamingResponse(content=generator, media_type="text/event-stream") + else: + return JSONResponse(content=generator.model_dump()) + + + async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + """ + The actual processing function. 
+ We run the process in this context + """ + # if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'): + # return await self._preprocess.chat_completion(data, state, collect_custom_statistics_fn) + # return None + if REMOVE_WEB_ADDITIONAL_PROMPTS: + if "messages" in body: + body["messages"] = remove_extra_system_prompts(body["messages"]) + + request = ChatCompletionRequest(**body) + logger.info(f"Received chat completion request: {request}") + generator = await self.openai_serving_chat.create_chat_completion( + request=request, raw_request=None + ) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), status_code=generator.code) + if request.stream: + return StreamingResponse(content=generator, media_type="text/event-stream") + else: + assert isinstance(generator, ChatCompletionResponse) + return JSONResponse(content=generator.model_dump()) @staticmethod async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]: From 32d72bcd1c165d2835b69d89a66cd804509a7dcd Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Fri, 28 Feb 2025 22:36:14 +0300 Subject: [PATCH 07/22] add vllm example --- .../serving/model_request_processor.py | 2 +- clearml_serving/serving/preprocess_service.py | 69 +---------------- examples/vllm/preprocess.py | 74 +++++++++++++++++++ examples/vllm/readme.md | 51 +++++++++++++ 4 files changed, 128 insertions(+), 68 deletions(-) create mode 100644 examples/vllm/preprocess.py create mode 100644 examples/vllm/readme.md diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 0f6bfa8..c952ddc 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1241,7 +1241,7 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod if processor.is_process_async \ else processor.chat_completion(preprocessed, state, stats_collect_fn) else: - raise ValueError(f"wrong url_type: expected 'completions' and 'chat/completions', got {serve_type}") + raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}") # noinspection PyUnresolvedReferences return_value = await processor.postprocess(processed, state, stats_collect_fn) \ if processor.is_postprocess_async \ diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index d29f5f8..b9b6cda 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -675,68 +675,6 @@ def is_port_in_use(port: int) -> bool: return s.connect_ex(('localhost', port)) == 0 if not is_port_in_use(8000): prometheus_client.start_http_server(8000) - - vllm_engine_config = { - "model":f"{local_file_name}/model", - "tokenizer":f"{local_file_name}/tokenizer", - "disable_log_requests": True, - "disable_log_stats": False, - "gpu_memory_utilization": 0.9, - "quantization": None, - "enforce_eager": True, - "served_model_name": "ai_operator_hyp22v4" - } - vllm_model_config = { - "lora_modules": None, # [LoRAModulePath(name=a, path=b)] - "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)] - "response_role": "assistant", - "chat_template": None, - "return_tokens_as_token_ids": False, - "max_log_len": None - } - - self.engine_args = AsyncEngineArgs(**vllm_engine_config) - self.async_engine_client = 
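
The request processor recognises exactly three route suffixes and rejects anything else, as the updated error message above spells out. A minimal sketch of that dispatch with stand-in handler callables rather than the real processor methods:

```python
# Sketch of the serve_type dispatch described above; handler names are illustrative.
import asyncio
from typing import Any, Awaitable, Callable, Dict


async def dispatch(serve_type: str, payload: Any,
                   handlers: Dict[str, Callable[[Any], Awaitable[Any]]]) -> Any:
    # only these three route suffixes are recognised
    if serve_type not in ("process", "completions", "chat/completions"):
        raise ValueError(
            f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}"
        )
    return await handlers[serve_type](payload)


async def _echo(name: str, payload: Any) -> Any:
    return {"handler": name, "payload": payload}


async def _demo() -> None:
    handlers = {
        "process": lambda p: _echo("process", p),
        "completions": lambda p: _echo("completions", p),
        "chat/completions": lambda p: _echo("chat", p),
    }
    print(await dispatch("chat/completions", {"model": "test_vllm"}, handlers))


if __name__ == "__main__":
    asyncio.run(_demo())
```
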
AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - - - model_config = self.async_engine_client.engine.get_model_config() - - request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"]) - - self.openai_serving_chat = OpenAIServingChat( - self.async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - response_role=vllm_model_config["response_role"], - lora_modules=vllm_model_config["lora_modules"], - prompt_adapters=vllm_model_config["prompt_adapters"], - request_logger=request_logger, - chat_template=vllm_model_config["chat_template"], - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] - ) - self.openai_serving_completion = OpenAIServingCompletion( - self.async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - lora_modules=vllm_model_config["lora_modules"], - prompt_adapters=vllm_model_config["prompt_adapters"], - request_logger=request_logger, - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] - ) - self.openai_serving_embedding = OpenAIServingEmbedding( - self.async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - request_logger=request_logger - ) - self.openai_serving_tokenization = OpenAIServingTokenization( - self.async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - lora_modules=vllm_model_config["lora_modules"], - request_logger=request_logger, - chat_template=vllm_model_config["chat_template"] - ) # override `send_request` method with the async version self._preprocess.__class__.send_request = VllmPreprocessRequest._preprocess_send_request @@ -818,7 +756,7 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn ) request = CompletionRequest(**body) logger.info(f"Received chat completion request: {request}") - generator = await self.openai_serving_completion.create_completion( + generator = await self._model["openai_serving_completion"].create_completion( request=request, raw_request=raw_request ) @@ -835,16 +773,13 @@ async def chat_completions(self, data: Any, state: dict, collect_custom_statisti The actual processing function. We run the process in this context """ - # if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'): - # return await self._preprocess.chat_completion(data, state, collect_custom_statistics_fn) - # return None if REMOVE_WEB_ADDITIONAL_PROMPTS: if "messages" in body: body["messages"] = remove_extra_system_prompts(body["messages"]) request = ChatCompletionRequest(**body) logger.info(f"Received chat completion request: {request}") - generator = await self.openai_serving_chat.create_chat_completion( + generator = await self._model["self.openai_serving_chat"].create_chat_completion( request=request, raw_request=None ) if isinstance(generator, ErrorResponse): diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py new file mode 100644 index 0000000..f54b390 --- /dev/null +++ b/examples/vllm/preprocess.py @@ -0,0 +1,74 @@ +"""Hugginface preprocessing module for ClearML Serving.""" +from typing import Any + + +# Notice Preprocess class Must be named "Preprocess" +class Preprocess: + """Processing class will be run by the ClearML inference services before and after each request.""" + + def __init__(self): + """Set internal state, this will be called only once. (i.e. 
not per request).""" + self.model_endpoint = None + + def load(self, local_file_name: str) -> Optional[Any]: # noqa + vllm_engine_config = { + "model":f"{local_file_name}/model", + "tokenizer":f"{local_file_name}/tokenizer", + "disable_log_requests": True, + "disable_log_stats": False, + "gpu_memory_utilization": 0.9, + "quantization": None, + "enforce_eager": True, + "served_model_name": "ai_operator_hyp22v4" + } + vllm_model_config = { + "lora_modules": None, # [LoRAModulePath(name=a, path=b)] + "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)] + "response_role": "assistant", + "chat_template": None, + "return_tokens_as_token_ids": False, + "max_log_len": None + } + self._model = {} + self._model["engine_args"] = AsyncEngineArgs(**vllm_engine_config) + self._model["async_engine_client"] = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) + + + self._model["model_config"] = self.async_engine_client.engine.get_model_config() + + self._model["request_logger"] = RequestLogger(max_log_len=vllm_model_config["max_log_len"]) + + self._model["self.openai_serving_chat"] = OpenAIServingChat( + self.async_engine_client, + model_config, + served_model_names=[vllm_engine_config["served_model_name"]], + response_role=vllm_model_config["response_role"], + lora_modules=vllm_model_config["lora_modules"], + prompt_adapters=vllm_model_config["prompt_adapters"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + ) + self._model["openai_serving_completion"] = OpenAIServingCompletion( + self.async_engine_client, + model_config, + served_model_names=[vllm_engine_config["served_model_name"]], + lora_modules=vllm_model_config["lora_modules"], + prompt_adapters=vllm_model_config["prompt_adapters"], + request_logger=request_logger, + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + ) + self._model["self.openai_serving_embedding"] = OpenAIServingEmbedding( + self.async_engine_client, + model_config, + served_model_names=[vllm_engine_config["served_model_name"]], + request_logger=request_logger + ) + self._model["self.openai_serving_tokenization"] = OpenAIServingTokenization( + self.async_engine_client, + model_config, + served_model_names=[vllm_engine_config["served_model_name"]], + lora_modules=vllm_model_config["lora_modules"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"] + ) diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md new file mode 100644 index 0000000..fd03eb9 --- /dev/null +++ b/examples/vllm/readme.md @@ -0,0 +1,51 @@ +# Deploy vLLM model + +## setting up the serving service + +1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) +2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.5.4"` +3. 
Create model endpoint: +`clearml-serving --id model add --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples"` + +Or auto update + +`clearml-serving --id model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2` + +Or add Canary endpoint + +`clearml-serving --id model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm` + +4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. + +Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` + +5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): + +```python + +import openai +openai.api_key = "dummy" +openai.api_base = f"http://serving.apps.okd.mts.ai/clearml/v1" + + +r0 = await openai.ChatCompletion.acreate( + model=vllm_endpoint, + messages=[{"role": "system", "content": ""}, {"role": "user", "content": "Hi there, goodman!"}], + temperature=1.0, + max_tokens=1024, + top_p=1.0, + request_timeout=10000, +) + +print(f"ChatCompletion: {r0['choices'][0]['message']}") + +r1 = await openai.Completion.acreate( + model=vllm_endpoint, + prompt="Hi there, goodman!", + temperature=1.0, + max_tokens=256, +) + +print(f"Completion: \n {r1['choices'][0]['text']}") + +``` From 428be766427aa21c9c510796b3e2c328d3d5bc70 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Sun, 9 Mar 2025 01:46:05 +0300 Subject: [PATCH 08/22] major vllm engine update --- README.md | 4 + .../engines/triton/triton_helper.py | 2 +- clearml_serving/serving/main.py | 22 +-- clearml_serving/serving/preprocess_service.py | 153 ++++++++---------- clearml_serving/serving/requirements.txt | 2 + clearml_serving/statistics/Dockerfile | 2 +- clearml_serving/version.py | 2 +- docker/docker-compose-gpu.yml | 146 +++++++++++++++++ docker/prometheus.yml | 7 + examples/vllm/preprocess.py | 68 +++++--- examples/vllm/readme.md | 24 ++- 11 files changed, 305 insertions(+), 127 deletions(-) create mode 100644 docker/docker-compose-gpu.yml diff --git a/README.md b/README.md index 7147f40..63f9016 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,10 @@ cat docker/example.env ```bash cd docker && docker-compose --env-file example.env -f docker-compose.yml up ``` +If running on a GPU instance, use gpu docker-compose file +```bash +cd docker && docker-compose --env-file example.env -f docker-compose-gpu.yml up +``` If you need Triton support (keras/pytorch/onnx etc.), use the triton docker-compose file ```bash cd docker && docker-compose --env-file example.env -f docker-compose-triton.yml up diff --git a/clearml_serving/engines/triton/triton_helper.py b/clearml_serving/engines/triton/triton_helper.py index 19fd241..b4921c5 100644 --- a/clearml_serving/engines/triton/triton_helper.py +++ b/clearml_serving/engines/triton/triton_helper.py @@ -561,7 +561,7 @@ def main(): setattr(args, args_var, type(t)(v) if t is not None else v) # noinspection PyProtectedMember - serving_task = ModelRequestProcessor._get_control_plane_task(task_id=args.inference_task_id) + serving_task = ModelRequestProcessor._get_control_plane_task(task_id=args.serving_id) task = Task.init( project_name=args.project or serving_task.get_project_name() 
or "serving", diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 1950265..f158181 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -110,17 +110,17 @@ async def cuda_exception_handler(request, exc): return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) -def process_with_exceptions( +async def process_with_exceptions( base_url: str, version: Optional[str], - request: Union[bytes, Dict[Any, Any]], + request_body: Union[bytes, Dict[Any, Any]], serve_type: str ): try: return_value = await processor.process_request( base_url=base_url, version=version, - request_body=request, + request_body=request_body, serve_type=serve_type ) except EndpointNotFoundException as ex: @@ -128,21 +128,21 @@ def process_with_exceptions( except (EndpointModelLoadException, EndpointBackendEngineException) as ex: session_logger.report_text( "[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) ) ) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ServingInitializationException as ex: session_logger.report_text( "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) ) ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ValueError as ex: session_logger.report_text( "[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) ) ) if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): @@ -152,7 +152,7 @@ def process_with_exceptions( except AioRpcError as ex: if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors: session_logger.report_text( - "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request) + "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request_body) ) elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors: session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code())) @@ -162,7 +162,7 @@ def process_with_exceptions( except Exception as ex: session_logger.report_text( "[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) ) ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) @@ -170,7 +170,7 @@ def process_with_exceptions( router = APIRouter( - prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", + prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}", tags=["models"], responses={404: {"description": "Model Serving Endpoint Not found"}}, route_class=GzipRoute, # mark-out to remove support for GZip content encoding @@ -185,7 +185,7 @@ async def base_serve_model( version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None ): - return_value = process_with_exceptions( + return_value = await process_with_exceptions( base_url=model_id, version=version, request_body=request, @@ -200,7 +200,7 @@ async def openai_serve_model( endpoint_type: str, request: Union[bytes, Dict[Any, Any]] = None ): - return_value = process_with_exceptions( + return_value = await process_with_exceptions( base_url=request.get("model", None), version=None, request_body=request, diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index b9b6cda..45946c7 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -14,6 +14,19 @@ from .endpoints import ModelEndpoint +# try: +# import prometheus_client +# from fastapi.responses import JSONResponse, StreamingResponse +# from vllm.entrypoints.openai.protocol import ( +# ChatCompletionRequest, +# ChatCompletionResponse, +# CompletionRequest, +# ErrorResponse +# ) +# from vllm.logger import init_logger +# except ImportError: +# print("WARNING: install vllm==0.5.4 and prometheus_client==0.21.1 to serve vllm engine") + class BasePreprocessRequest(object): __preprocessing_lookup = {} @@ -598,65 +611,11 @@ async def _preprocess_send_request(_, endpoint: str, version: str = None, data: return return_value.json() -@BasePreprocessRequest.register_engine("vllm") +@BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) class VllmPreprocessRequest(BasePreprocessRequest): - import prometheus_client - - from typing import Any, Union, Optional, Callable - - from fastapi.responses import JSONResponse, StreamingResponse - - from vllm.engine.arg_utils import AsyncEngineArgs - from vllm.engine.async_llm_engine import AsyncLLMEngine - from vllm.entrypoints.logger import RequestLogger - # yapf conflicts with isort for this block - # yapf: disable - from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, - ChatCompletionResponse, - CompletionRequest, - 
ErrorResponse - ) - - # yapf: enable - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding - from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization - from vllm.logger import init_logger - from vllm.usage.usage_lib import UsageContext - from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath - - logger = init_logger(__name__) - - REMOVE_WEB_ADDITIONAL_PROMPTS = True - - if VllmPreprocessRequest.asyncio_to_thread is None: - from asyncio import to_thread as asyncio_to_thread - VllmPreprocessRequest.asyncio_to_thread = asyncio_to_thread - - def remove_extra_system_prompts(messages: list) -> list: - """ - Removes all 'system' prompts except the last one. - - :param messages: List of message dicts with 'role' and 'content'. - :return: Modified list of messages with only the last 'system' prompt preserved. - """ - # Фильтруем только системные сообщения - system_messages_indices = [] - for i, msg in enumerate(messages): - if msg["role"] == "system": - system_messages_indices.append(i) - else: - break - - # Если есть больше одного системного сообщения, удалим все, кроме последнего - if len(system_messages_indices) > 1: - last_system_index = system_messages_indices[-1] - # Удаляем все системные сообщения, кроме последнего - messages = [msg for i, msg in enumerate(messages) if msg["role"] != "system" or i == last_system_index] - - return messages + asyncio_to_thread = None + _vllm = None + _fastapi = None class CustomRequest: def __init__(self, headers: Optional[dict] = None): @@ -669,12 +628,39 @@ def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): super(VllmPreprocessRequest, self).__init__( model_endpoint=model_endpoint, task=task) - def is_port_in_use(port: int) -> bool: - import socket - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return s.connect_ex(('localhost', port)) == 0 - if not is_port_in_use(8000): - prometheus_client.start_http_server(8000) + # load vLLM Modules + if self._vllm is None: + from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionRequest, + ErrorResponse + ) + self._vllm = {} + self._vllm["chat_completion_request"] = ChatCompletionRequest + self._vllm["chat_completion_response"] = ChatCompletionResponse + self._vllm["completion_request"] = CompletionRequest + self._vllm["error_response"] = ErrorResponse + + if self._fastapi is None: + from fastapi.responses import JSONResponse, StreamingResponse + self._fastapi = {} + self._fastapi["json_response"] = JSONResponse + self._fastapi["streaming_response"] = StreamingResponse + + from vllm.logger import init_logger + self.logger = init_logger(__name__) + + if VllmPreprocessRequest.asyncio_to_thread is None: + from asyncio import to_thread as asyncio_to_thread + VllmPreprocessRequest.asyncio_to_thread = asyncio_to_thread + + import socket + import prometheus_client + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if not s.connect_ex(('localhost', 8000)) == 0: + prometheus_client.start_http_server(8000) + # override `send_request` method with the async version self._preprocess.__class__.send_request = VllmPreprocessRequest._preprocess_send_request @@ -738,15 +724,11 @@ async def postprocess( return await self._preprocess.postprocess(data, state, collect_custom_statistics_fn) return data - async def 
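
The constructor above starts the `prometheus_client` exporter only when nothing is already listening on port 8000, so several workers in one container do not fight over the same port. A standalone sketch of that guard (the helper name is illustrative):

```python
# Guarded metrics-exporter start: probe the port first, bind it only once per container.
import socket

import prometheus_client


def start_metrics_exporter_once(port: int = 8000) -> bool:
    """Start the prometheus_client HTTP exporter unless the port is already taken."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        already_bound = s.connect_ex(("localhost", port)) == 0
    if not already_bound:
        prometheus_client.start_http_server(port)
    return not already_bound


if __name__ == "__main__":
    print("exporter started:", start_metrics_exporter_once())
```

`start_http_server` serves `/metrics` from a background thread, so the call returns immediately and the serving process continues as usual.
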
completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: """ The actual processing function. We run the process in this context """ - if REMOVE_WEB_ADDITIONAL_PROMPTS: - if "messages" in body: - body["messages"] = remove_extra_system_prompts(body["messages"]) raw_request = CustomRequest( headers = { @@ -754,18 +736,18 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn "tracestate": None } ) - request = CompletionRequest(**body) - logger.info(f"Received chat completion request: {request}") + request = self._vllm["completion_request"](**data) + self.logger.info(f"Received chat completion request: {request}") generator = await self._model["openai_serving_completion"].create_completion( request=request, raw_request=raw_request ) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), status_code=generator.code) + if isinstance(generator, self._vllm["error_response"]): + return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) if request.stream: - return StreamingResponse(content=generator, media_type="text/event-stream") + return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") else: - return JSONResponse(content=generator.model_dump()) + return self._fastapi["json_response"](content=generator.model_dump()) async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: @@ -773,30 +755,27 @@ async def chat_completions(self, data: Any, state: dict, collect_custom_statisti The actual processing function. We run the process in this context """ - if REMOVE_WEB_ADDITIONAL_PROMPTS: - if "messages" in body: - body["messages"] = remove_extra_system_prompts(body["messages"]) - request = ChatCompletionRequest(**body) - logger.info(f"Received chat completion request: {request}") - generator = await self._model["self.openai_serving_chat"].create_chat_completion( + request = self._vllm["chat_completion_request"](**data) + self.logger.info(f"Received chat completion request: {request}") + generator = await self._model["openai_serving_chat"].create_chat_completion( request=request, raw_request=None ) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), status_code=generator.code) + if isinstance(generator, self._vllm["error_response"]): + return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) if request.stream: - return StreamingResponse(content=generator, media_type="text/event-stream") + return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") else: - assert isinstance(generator, ChatCompletionResponse) - return JSONResponse(content=generator.model_dump()) + assert isinstance(generator, self._vllm["chat_completion_response"]) + return self._fastapi["json_response"](content=generator.model_dump()) @staticmethod async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]: - endpoint = "{}/{}".format(endpoint.strip("/"), version.strip("/")) if version else endpoint.strip("/") + endpoint = "/openai/v1/{}".format(endpoint.strip("/")) base_url = BasePreprocessRequest.get_server_config().get("base_serving_url") base_url = (base_url or BasePreprocessRequest._default_serving_base_url).strip("/") url = "{}/{}".format(base_url, endpoint.strip("/")) - return_value = await 
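
`_preprocess_send_request` above builds an `/openai/v1/...` URL and pushes the blocking `requests.post` call onto a worker thread so the event loop is never blocked. A minimal sketch of the same pattern; the base URL, endpoint and payload below are placeholders, not values taken from the patch:

```python
# Blocking HTTP call moved off the event loop with asyncio.to_thread (Python 3.9+).
import asyncio
from typing import Optional

import requests

BASE_URL = "http://127.0.0.1:8080/serve"  # assumption: default local serving address


async def post_json(endpoint: str, data: dict, timeout: float = 60.0) -> Optional[dict]:
    url = "{}/openai/v1/{}".format(BASE_URL.rstrip("/"), endpoint.strip("/"))
    response = await asyncio.to_thread(requests.post, url, json=data, timeout=timeout)
    return response.json() if response.ok else None


if __name__ == "__main__":
    payload = {"model": "test_vllm", "prompt": "Hi there, goodman!"}
    print(asyncio.run(post_json("completions", payload)))
```
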
CustomAsyncPreprocessRequest.asyncio_to_thread( + return_value = await VllmPreprocessRequest.asyncio_to_thread( request_post, url, json=data, timeout=BasePreprocessRequest._timeout) if not return_value.ok: return None diff --git a/clearml_serving/serving/requirements.txt b/clearml_serving/serving/requirements.txt index da12834..366b19c 100644 --- a/clearml_serving/serving/requirements.txt +++ b/clearml_serving/serving/requirements.txt @@ -18,3 +18,5 @@ lightgbm>=3.3.2,<3.4 requests>=2.31.0 kafka-python>=2.0.2,<2.1 lz4>=4.0.0,<5 +prometheus_client==0.21.1 +vllm==0.5.4 diff --git a/clearml_serving/statistics/Dockerfile b/clearml_serving/statistics/Dockerfile index a55ccc2..fa01199 100644 --- a/clearml_serving/statistics/Dockerfile +++ b/clearml_serving/statistics/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.11-bullseye ENV LC_ALL=C.UTF-8 # install base package -RUN pip3 install --no-cache-dir clearml-serving +# RUN pip3 install --no-cache-dir clearml-serving # get latest execution code from the git repository # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git diff --git a/clearml_serving/version.py b/clearml_serving/version.py index e398332..5b8f37a 100644 --- a/clearml_serving/version.py +++ b/clearml_serving/version.py @@ -1 +1 @@ -__version__ = '1.3.2' +__version__ = '1.3.5' diff --git a/docker/docker-compose-gpu.yml b/docker/docker-compose-gpu.yml new file mode 100644 index 0000000..0009fa8 --- /dev/null +++ b/docker/docker-compose-gpu.yml @@ -0,0 +1,146 @@ +version: "3" + +services: + zookeeper: + image: bitnami/zookeeper:3.7.0 + container_name: clearml-serving-zookeeper + # ports: + # - "2181:2181" + environment: + - ALLOW_ANONYMOUS_LOGIN=yes + networks: + - clearml-serving-backend + + kafka: + image: bitnami/kafka:3.1.1 + container_name: clearml-serving-kafka + # ports: + # - "9092:9092" + environment: + - KAFKA_BROKER_ID=1 + - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 + - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 + - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181 + - ALLOW_PLAINTEXT_LISTENER=yes + - KAFKA_CREATE_TOPICS="topic_test:1:1" + depends_on: + - zookeeper + networks: + - clearml-serving-backend + + prometheus: + image: prom/prometheus:v2.34.0 + container_name: clearml-serving-prometheus + volumes: + - ./prometheus.yml:/prometheus.yml + command: + - '--config.file=/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=200h' + - '--web.enable-lifecycle' + restart: unless-stopped + # ports: + # - "9090:9090" + depends_on: + - clearml-serving-statistics + networks: + - clearml-serving-backend + + alertmanager: + image: prom/alertmanager:v0.23.0 + container_name: clearml-serving-alertmanager + restart: unless-stopped + # ports: + # - "9093:9093" + depends_on: + - prometheus + - grafana + networks: + - clearml-serving-backend + + grafana: + image: grafana/grafana:8.4.4-ubuntu + container_name: clearml-serving-grafana + volumes: + - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml' + restart: unless-stopped + ports: + - "3000:3000" + depends_on: + - prometheus + networks: + - clearml-serving-backend + + + clearml-serving-inference: + image: clearml-serving-inference:latest + container_name: clearml-serving-inference + restart: unless-stopped + # optimize perforamnce + security_opt: + - seccomp:unconfined + ports: + - "8080:8080" + 
environment: + CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} + CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} + CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} + CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} + CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} + CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} + CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} + CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} + CLEARML_DEFAULT_SERVE_SUFFIX: ${CLEARML_DEFAULT_SERVE_SUFFIX:-serve} + CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} + CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} + CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-} + CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} + CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} + CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} + AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} + AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} + AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-} + GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} + AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} + AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} + depends_on: + - kafka + networks: + - clearml-serving-backend + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] + + clearml-serving-statistics: + image: allegroai/clearml-serving-statistics:latest + container_name: clearml-serving-statistics + restart: unless-stopped + # optimize perforamnce + security_opt: + - seccomp:unconfined + # ports: + # - "9999:9999" + environment: + CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} + CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} + CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} + CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} + CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} + CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} + CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} + CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} + depends_on: + - kafka + networks: + - clearml-serving-backend + + +networks: + clearml-serving-backend: + driver: bridge diff --git a/docker/prometheus.yml b/docker/prometheus.yml index 469e220..b7aa51e 100644 --- a/docker/prometheus.yml +++ b/docker/prometheus.yml @@ -20,3 +20,10 @@ scrape_configs: static_configs: - targets: ['clearml-serving-statistics:9999'] + + - job_name: 'vllm' + + scrape_interval: 5s + + static_configs: + - targets: ['clearml-serving-inference'] \ No newline at end of file diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py index f54b390..a4191d6 100644 --- a/examples/vllm/preprocess.py +++ b/examples/vllm/preprocess.py @@ -1,8 +1,16 @@ """Hugginface preprocessing module for ClearML Serving.""" -from typing import Any +from typing import Any, Optional +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding +from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization +from 
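
The `deploy.resources.reservations.devices` block above hands GPU 0 to the inference container. A quick, hypothetical sanity check that the device is actually visible from inside that container (assuming `torch` is importable there, which it will be once `vllm` is installed):

```python
# Assumption: run inside the clearml-serving-inference container, where torch is
# pulled in as a vllm dependency.
import torch


def report_gpu() -> None:
    if not torch.cuda.is_available():
        print("No CUDA device visible - check the compose 'devices' reservation and the NVIDIA runtime")
        return
    for idx in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(idx)
        print(f"cuda:{idx} {props.name} {props.total_memory / 1024 ** 3:.1f} GiB")


if __name__ == "__main__":
    report_gpu()
```

Something like `docker compose exec clearml-serving-inference python gpu_check.py` (service name as defined above) is enough to run it.
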
vllm.usage.usage_lib import UsageContext +from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath -# Notice Preprocess class Must be named "Preprocess" class Preprocess: """Processing class will be run by the ClearML inference services before and after each request.""" @@ -12,14 +20,15 @@ def __init__(self): def load(self, local_file_name: str) -> Optional[Any]: # noqa vllm_engine_config = { - "model":f"{local_file_name}/model", - "tokenizer":f"{local_file_name}/tokenizer", + "model": local_file_name, + "tokenizer": local_file_name, "disable_log_requests": True, "disable_log_stats": False, "gpu_memory_utilization": 0.9, "quantization": None, "enforce_eager": True, - "served_model_name": "ai_operator_hyp22v4" + "served_model_name": "test_vllm", + "dtype": "float16" } vllm_model_config = { "lora_modules": None, # [LoRAModulePath(name=a, path=b)] @@ -30,16 +39,12 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa "max_log_len": None } self._model = {} - self._model["engine_args"] = AsyncEngineArgs(**vllm_engine_config) - self._model["async_engine_client"] = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - - - self._model["model_config"] = self.async_engine_client.engine.get_model_config() - - self._model["request_logger"] = RequestLogger(max_log_len=vllm_model_config["max_log_len"]) - - self._model["self.openai_serving_chat"] = OpenAIServingChat( - self.async_engine_client, + engine_args = AsyncEngineArgs(**vllm_engine_config) + async_engine_client = AsyncLLMEngine.from_engine_args(engine_args, usage_context=UsageContext.OPENAI_API_SERVER) + model_config = async_engine_client.engine.get_model_config() + request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"]) + self._model["openai_serving_chat"] = OpenAIServingChat( + async_engine_client, model_config, served_model_names=[vllm_engine_config["served_model_name"]], response_role=vllm_model_config["response_role"], @@ -50,7 +55,7 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] ) self._model["openai_serving_completion"] = OpenAIServingCompletion( - self.async_engine_client, + async_engine_client, model_config, served_model_names=[vllm_engine_config["served_model_name"]], lora_modules=vllm_model_config["lora_modules"], @@ -58,17 +63,40 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa request_logger=request_logger, return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] ) - self._model["self.openai_serving_embedding"] = OpenAIServingEmbedding( - self.async_engine_client, + self._model["openai_serving_embedding"] = OpenAIServingEmbedding( + async_engine_client, model_config, served_model_names=[vllm_engine_config["served_model_name"]], request_logger=request_logger ) - self._model["self.openai_serving_tokenization"] = OpenAIServingTokenization( - self.async_engine_client, + self._model["openai_serving_tokenization"] = OpenAIServingTokenization( + async_engine_client, model_config, served_model_names=[vllm_engine_config["served_model_name"]], lora_modules=vllm_model_config["lora_modules"], request_logger=request_logger, chat_template=vllm_model_config["chat_template"] ) + return self._model + + def remove_extra_system_prompts(self, messages: List) -> List: + system_messages_indices = [] + for i, msg in enumerate(messages): + if msg["role"] == "system": + system_messages_indices.append(i) + else: + break + if 
len(system_messages_indices) > 1: + last_system_index = system_messages_indices[-1] + messages = [msg for i, msg in enumerate(messages) if msg["role"] != "system" or i == last_system_index] + return messages + + def preprocess( + self, + body: Union[bytes, dict], + state: dict, + collect_custom_statistics_fn: Optional[Callable[[dict], None]], + ) -> Any: # noqa + if "messages" in body: + body["messages"] = self.remove_extra_system_prompts(body["messages"]) + return body diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md index fd03eb9..8abb6d3 100644 --- a/examples/vllm/readme.md +++ b/examples/vllm/readme.md @@ -3,21 +3,22 @@ ## setting up the serving service 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) + 2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.5.4"` 3. Create model endpoint: -`clearml-serving --id model add --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples"` +`clearml-serving --id model add --model-id --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples"` -Or auto update + Or auto update -`clearml-serving --id model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2` + `clearml-serving --id model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2` -Or add Canary endpoint + Or add Canary endpoint -`clearml-serving --id model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm` + `clearml-serving --id model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm` 4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. -Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` + Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): @@ -49,3 +50,14 @@ r1 = await openai.Completion.acreate( print(f"Completion: \n {r1['choices'][0]['text']}") ``` +NOTE! + +If you want to use send_request method, keep in mind that you have to pass "completions" or "chat/completions" in entrypoint (and pass model as a part of "data" parameter) and use it for non-streaming models: + +```python +prompt = "Hi there, goodman!" 
+result = self.send_request(endpoint="chat/completions", version=None, data={"model": "test_vllm", "messages": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": prompt}]}) +answer = result.choises[0].message.content +``` +OR +If you want to use send_request method, use openai client instead \ No newline at end of file From cadd48f6723e89ab92fbe02458f4341f13130502 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Sun, 9 Mar 2025 15:12:05 +0300 Subject: [PATCH 09/22] add openai_serving and openai_serving_models --- clearml_serving/serving/main.py | 23 +++-- .../serving/model_request_processor.py | 4 +- clearml_serving/serving/preprocess_service.py | 48 +++++----- clearml_serving/serving/requirements.txt | 2 +- docker/prometheus.yml | 2 +- examples/vllm/preprocess.py | 87 ++++++++++++------- 6 files changed, 99 insertions(+), 67 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index f158181..e7a94d8 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -4,11 +4,13 @@ import gzip import asyncio -from fastapi import FastAPI, Request, Response, APIRouter, HTTPException +from fastapi import FastAPI, Request, Response, APIRouter, HTTPException, Depends from fastapi.routing import APIRoute from fastapi.responses import PlainTextResponse from grpc.aio import AioRpcError +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest + from starlette.background import BackgroundTask from typing import Optional, Dict, Any, Callable, Union @@ -194,16 +196,27 @@ async def base_serve_model( return return_value -@router.post("/openai/v1/{endpoint_type:path}") -@router.post("/openai/v1/{endpoint_type:path}/") +async def validate_json_request(raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + media_type = content_type.split(";", maxsplit=1)[0] + if media_type != "application/json": + raise HTTPException( + status_code=HTTPStatus.UNSUPPORTED_MEDIA_TYPE, + detail="Unsupported Media Type: Only 'application/json' is allowed" + ) + +@router.post("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) +@router.post("/openai/v1/{endpoint_type:path}/", dependencies=[Depends(validate_json_request)]) async def openai_serve_model( endpoint_type: str, - request: Union[bytes, Dict[Any, Any]] = None + request: Union[CompletionRequest, ChatCompletionRequest], + raw_request: Request ): + combined_request = {"request": request, "raw_request": raw_request} return_value = await process_with_exceptions( base_url=request.get("model", None), version=None, - request_body=request, + request_body=combined_request, serve_type=endpoint_type ) return return_value diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index c952ddc..eaa4b49 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1234,12 +1234,12 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod # noinspection PyUnresolvedReferences processed = await processor.completions(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ - else processor.completion(preprocessed, state, stats_collect_fn) + else processor.completions(preprocessed, state, stats_collect_fn) elif serve_type == "chat/completions": # noinspection PyUnresolvedReferences processed = await 
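
The `/openai/v1/{endpoint_type}` routes above parse the body as a vLLM `CompletionRequest` or `ChatCompletionRequest`, and `validate_json_request` only admits `application/json`. A hedged sketch of calling them with the `openai>=1.0` client; the base URL and model name are assumptions based on the defaults shown in this series and will differ per deployment:

```python
# Assumptions: the service listens on http://127.0.0.1:8080 with the default "serve"
# suffix, and the endpoint was registered as "test_vllm"; adjust both for your setup.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/serve/openai/v1", api_key="dummy")

chat = client.chat.completions.create(
    model="test_vllm",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hi there, goodman!"},
    ],
    max_tokens=256,
)
print(chat.choices[0].message.content)

completion = client.completions.create(
    model="test_vllm",
    prompt="Hi there, goodman!",
    max_tokens=128,
)
print(completion.choices[0].text)
```

Because of the content-type dependency above, a non-JSON body is rejected with 415 before it ever reaches the model.
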
processor.chat_completions(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ - else processor.chat_completion(preprocessed, state, stats_collect_fn) + else processor.chat_completions(preprocessed, state, stats_collect_fn) else: raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}") # noinspection PyUnresolvedReferences diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 45946c7..bbf8a1f 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -613,6 +613,9 @@ async def _preprocess_send_request(_, endpoint: str, version: str = None, data: @BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) class VllmPreprocessRequest(BasePreprocessRequest): + is_preprocess_async = True + is_process_async = True + is_postprocess_async = True asyncio_to_thread = None _vllm = None _fastapi = None @@ -729,25 +732,18 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn The actual processing function. We run the process in this context """ - - raw_request = CustomRequest( - headers = { - "traceparent": None, - "tracestate": None - } - ) - request = self._vllm["completion_request"](**data) - self.logger.info(f"Received chat completion request: {request}") - generator = await self._model["openai_serving_completion"].create_completion( - request=request, - raw_request=raw_request - ) + request, raw_request = data["request"], data["raw_request"] + handler = self._model["openai_serving_completion"] + if handler is None: + return self._model["openai_serving"].create_error_response(message="The model does not support Completions API") + # request = self._vllm["completion_request"](**data) + # self.logger.info(f"Received chat completion request: {request}") + generator = await handler.create_completion(request=request, raw_request=raw_request) if isinstance(generator, self._vllm["error_response"]): return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - if request.stream: - return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") - else: + elif isinstance(generator, self._vllm["chat_completion_response"]): return self._fastapi["json_response"](content=generator.model_dump()) + return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: @@ -755,19 +751,19 @@ async def chat_completions(self, data: Any, state: dict, collect_custom_statisti The actual processing function. 
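
`completions` above resolves a handler from `self._model`, returns an error response when the handler is missing, and otherwise converts the result into a JSON or streaming response. A generic sketch of that response-adaptation pattern with stand-in pydantic models (assuming pydantic v2, consistent with the `model_dump()` calls above); the class names are illustrative, not the vLLM types:

```python
# Error object -> JSON with its status code; finished response -> plain JSON;
# anything else is treated as a token stream of server-sent events.
from typing import Any, AsyncIterator, Union

from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel


class ErrorResult(BaseModel):
    message: str
    code: int = 500


class FinishedResult(BaseModel):
    text: str


def to_response(result: Union[ErrorResult, FinishedResult, AsyncIterator[str]]) -> Any:
    if isinstance(result, ErrorResult):
        return JSONResponse(content=result.model_dump(), status_code=result.code)
    if isinstance(result, FinishedResult):
        return JSONResponse(content=result.model_dump())
    # otherwise assume an async generator of server-sent events
    return StreamingResponse(content=result, media_type="text/event-stream")
```
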
We run the process in this context """ - - request = self._vllm["chat_completion_request"](**data) - self.logger.info(f"Received chat completion request: {request}") - generator = await self._model["openai_serving_chat"].create_chat_completion( - request=request, raw_request=None - ) + request, raw_request = data["request"], data["raw_request"] + handler = self._model["openai_serving_chat"] # analog of chat(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 + if handler is None: + return self._model["openai_serving"].create_error_response(message="The model does not support Chat Completions API") + # request = self._vllm["chat_completion_request"](**data) + # self.logger.info(f"Received chat completion request: {request}") + generator = await handler.create_chat_completion(request=request, raw_request=raw_request) if isinstance(generator, self._vllm["error_response"]): return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - if request.stream: - return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") - else: - assert isinstance(generator, self._vllm["chat_completion_response"]) + elif isinstance(generator, self._vllm["chat_completion_response"]): return self._fastapi["json_response"](content=generator.model_dump()) + return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") + @staticmethod async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]: diff --git a/clearml_serving/serving/requirements.txt b/clearml_serving/serving/requirements.txt index 366b19c..922f8e3 100644 --- a/clearml_serving/serving/requirements.txt +++ b/clearml_serving/serving/requirements.txt @@ -19,4 +19,4 @@ requests>=2.31.0 kafka-python>=2.0.2,<2.1 lz4>=4.0.0,<5 prometheus_client==0.21.1 -vllm==0.5.4 +vllm==0.7.3 diff --git a/docker/prometheus.yml b/docker/prometheus.yml index b7aa51e..da47e83 100644 --- a/docker/prometheus.yml +++ b/docker/prometheus.yml @@ -26,4 +26,4 @@ scrape_configs: scrape_interval: 5s static_configs: - - targets: ['clearml-serving-inference'] \ No newline at end of file + - targets: ['clearml-serving-inference:8000'] \ No newline at end of file diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py index a4191d6..fc8b3aa 100644 --- a/examples/vllm/preprocess.py +++ b/examples/vllm/preprocess.py @@ -1,8 +1,10 @@ """Hugginface preprocessing module for ClearML Serving.""" -from typing import Any, Optional +from typing import Any, Optional, List, Callable, Union from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding @@ -19,7 +21,13 @@ def __init__(self): self.model_endpoint = None def load(self, local_file_name: str) -> Optional[Any]: # noqa - vllm_engine_config = { + + @dataclass + class BaseModelPath: + name: str + model_path: str + + self.vllm_engine_config = { "model": local_file_name, "tokenizer": local_file_name, "disable_log_requests": True, @@ -28,9 +36,10 @@ def load(self, local_file_name: str) 
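
With the scrape job above pointed at `clearml-serving-inference:8000`, the exporter started by the serving code becomes visible to Prometheus. A quick check that the metrics endpoint answers; the hostname only resolves on the docker-compose network, so use a published port when testing from the host:

```python
# Fetch the first few exposition lines from the metrics endpoint scraped by the
# 'vllm' Prometheus job; URL is an assumption based on the compose service name.
import requests

METRICS_URL = "http://clearml-serving-inference:8000/metrics"


def check_metrics(url: str = METRICS_URL) -> None:
    resp = requests.get(url, timeout=5)
    resp.raise_for_status()
    for line in resp.text.splitlines()[:10]:
        print(line)


if __name__ == "__main__":
    check_metrics()
```
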
-> Optional[Any]: # noqa "quantization": None, "enforce_eager": True, "served_model_name": "test_vllm", - "dtype": "float16" + "dtype": "float16", + "max_model_len": 8192 } - vllm_model_config = { + self.vllm_model_config = { "lora_modules": None, # [LoRAModulePath(name=a, path=b)] "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)] "response_role": "assistant", @@ -39,43 +48,57 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa "max_log_len": None } self._model = {} - engine_args = AsyncEngineArgs(**vllm_engine_config) - async_engine_client = AsyncLLMEngine.from_engine_args(engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - model_config = async_engine_client.engine.get_model_config() - request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"]) - self._model["openai_serving_chat"] = OpenAIServingChat( + self.engine_args = AsyncEngineArgs(**self.vllm_engine_config) + async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) + self.model_config = async_engine_client.engine.get_model_config() + request_logger = RequestLogger(max_log_len=self.vllm_model_config["max_log_len"]) + self._model["openai_serving_models"] = OpenAIServingModels( + async_engine_client, + self.model_config, + [BaseModelPath(name=self.vllm_engine_config["served_model_name"], model_path=self.vllm_engine_config["model"])], + lora_modules=self.vllm_model_config["lora_modules"], + prompt_adapters=self.vllm_model_config["prompt_adapters"], + ) + self._model["openai_serving"] = OpenAIServing( async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - response_role=vllm_model_config["response_role"], - lora_modules=vllm_model_config["lora_modules"], - prompt_adapters=vllm_model_config["prompt_adapters"], + self.model_config, + self._model["openai_serving_models"], request_logger=request_logger, - chat_template=vllm_model_config["chat_template"], - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] ) + self._model["openai_serving_chat"] = OpenAIServingChat( + async_engine_client, + self.model_config, + served_model_names=[self.vllm_engine_config["served_model_name"]], + response_role=self.vllm_model_config["response_role"], + lora_modules=self.vllm_model_config["lora_modules"], + prompt_adapters=self.vllm_model_config["prompt_adapters"], + request_logger=request_logger, + chat_template=self.vllm_model_config["chat_template"], + return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] + ) if self.model_config.runner_type == "generate" else None self._model["openai_serving_completion"] = OpenAIServingCompletion( async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - lora_modules=vllm_model_config["lora_modules"], - prompt_adapters=vllm_model_config["prompt_adapters"], + self.model_config, + served_model_names=[self.vllm_engine_config["served_model_name"]], + lora_modules=self.vllm_model_config["lora_modules"], + prompt_adapters=self.vllm_model_config["prompt_adapters"], request_logger=request_logger, - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] - ) + return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] + ) if self.model_config.runner_type == "generate" else None self._model["openai_serving_embedding"] = OpenAIServingEmbedding( 
async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], + self.model_config, + served_model_names=[self.vllm_engine_config["served_model_name"]], request_logger=request_logger - ) + ) if self.model_config.task == "embed" else None self._model["openai_serving_tokenization"] = OpenAIServingTokenization( async_engine_client, - model_config, - served_model_names=[vllm_engine_config["served_model_name"]], - lora_modules=vllm_model_config["lora_modules"], + self.model_config, + served_model_names=[self.vllm_engine_config["served_model_name"]], + lora_modules=self.vllm_model_config["lora_modules"], request_logger=request_logger, - chat_template=vllm_model_config["chat_template"] + chat_template=self.vllm_model_config["chat_template"] ) return self._model @@ -91,12 +114,12 @@ def remove_extra_system_prompts(self, messages: List) -> List: messages = [msg for i, msg in enumerate(messages) if msg["role"] != "system" or i == last_system_index] return messages - def preprocess( + async def preprocess( self, body: Union[bytes, dict], state: dict, collect_custom_statistics_fn: Optional[Callable[[dict], None]], ) -> Any: # noqa - if "messages" in body: - body["messages"] = self.remove_extra_system_prompts(body["messages"]) + if "messages" in body["request"]: + body["request"]["messages"] = self.remove_extra_system_prompts(body["request"]["messages"]) return body From 77e1f95dbd2919491ffffe949934cb148a69cca8 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Sun, 9 Mar 2025 22:53:44 +0300 Subject: [PATCH 10/22] fix response import --- clearml_serving/serving/main.py | 2 +- clearml_serving/serving/preprocess_service.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index e7a94d8..4683838 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -214,7 +214,7 @@ async def openai_serve_model( ): combined_request = {"request": request, "raw_request": raw_request} return_value = await process_with_exceptions( - base_url=request.get("model", None), + base_url=request.model, version=None, request_body=combined_request, serve_type=endpoint_type diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index bbf8a1f..594bb07 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -637,12 +637,14 @@ def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, + CompletionResponse, ErrorResponse ) self._vllm = {} self._vllm["chat_completion_request"] = ChatCompletionRequest self._vllm["chat_completion_response"] = ChatCompletionResponse self._vllm["completion_request"] = CompletionRequest + self._vllm["completion_response"] = CompletionResponse self._vllm["error_response"] = ErrorResponse if self._fastapi is None: @@ -741,7 +743,7 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn generator = await handler.create_completion(request=request, raw_request=raw_request) if isinstance(generator, self._vllm["error_response"]): return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - elif isinstance(generator, self._vllm["chat_completion_response"]): + elif isinstance(generator, self._vllm["completion_response"]): return self._fastapi["json_response"](content=generator.model_dump()) return 
self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") From 1c591f2d153f5b56284d13ba7ae39087e0c64fd3 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Mon, 10 Mar 2025 00:21:24 +0300 Subject: [PATCH 11/22] fix openai testing --- clearml_serving/serving/preprocess_service.py | 8 ++- examples/vllm/preprocess.py | 45 ++++++++++------- examples/vllm/readme.md | 49 +++++++------------ examples/vllm/test_openai_api.py | 32 ++++++++++++ 4 files changed, 78 insertions(+), 56 deletions(-) create mode 100644 examples/vllm/test_openai_api.py diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 594bb07..0e7d15a 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -735,11 +735,10 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn We run the process in this context """ request, raw_request = data["request"], data["raw_request"] + # analog of completion(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 handler = self._model["openai_serving_completion"] if handler is None: return self._model["openai_serving"].create_error_response(message="The model does not support Completions API") - # request = self._vllm["completion_request"](**data) - # self.logger.info(f"Received chat completion request: {request}") generator = await handler.create_completion(request=request, raw_request=raw_request) if isinstance(generator, self._vllm["error_response"]): return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) @@ -754,11 +753,10 @@ async def chat_completions(self, data: Any, state: dict, collect_custom_statisti We run the process in this context """ request, raw_request = data["request"], data["raw_request"] - handler = self._model["openai_serving_chat"] # analog of chat(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 + # analog of chat(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 + handler = self._model["openai_serving_chat"] if handler is None: return self._model["openai_serving"].create_error_response(message="The model does not support Chat Completions API") - # request = self._vllm["chat_completion_request"](**data) - # self.logger.info(f"Received chat completion request: {request}") generator = await handler.create_chat_completion(request=request, raw_request=raw_request) if isinstance(generator, self._vllm["error_response"]): return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py index fc8b3aa..b2c6ce0 100644 --- a/examples/vllm/preprocess.py +++ b/examples/vllm/preprocess.py @@ -4,13 +4,13 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.serving_engine import OpenAIServing -from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.serving_models import OpenAIServingModels, LoRAModulePath, PromptAdapterPath, BaseModelPath from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from 
vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.usage.usage_lib import UsageContext -from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath class Preprocess: @@ -22,11 +22,6 @@ def __init__(self): def load(self, local_file_name: str) -> Optional[Any]: # noqa - @dataclass - class BaseModelPath: - name: str - model_path: str - self.vllm_engine_config = { "model": local_file_name, "tokenizer": local_file_name, @@ -47,6 +42,14 @@ class BaseModelPath: "return_tokens_as_token_ids": False, "max_log_len": None } + self.chat_settings = { + "enable_reasoning": False, + "reasoning_parser": None, + "enable_auto_tools": False, + "tool_parser": None, + "enable_prompt_tokens_details": False, + "chat_template_content_format": "auto" + } self._model = {} self.engine_args = AsyncEngineArgs(**self.vllm_engine_config) async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) @@ -69,36 +72,40 @@ class BaseModelPath: self._model["openai_serving_chat"] = OpenAIServingChat( async_engine_client, self.model_config, - served_model_names=[self.vllm_engine_config["served_model_name"]], + self._model["openai_serving_models"], response_role=self.vllm_model_config["response_role"], - lora_modules=self.vllm_model_config["lora_modules"], - prompt_adapters=self.vllm_model_config["prompt_adapters"], request_logger=request_logger, chat_template=self.vllm_model_config["chat_template"], - return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] + chat_template_content_format=self.chat_settings["chat_template_content_format"], + return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"], + enable_reasoning=self.chat_settings["enable_reasoning"], + reasoning_parser=self.chat_settings["reasoning_parser"], + enable_auto_tools=self.chat_settings["enable_auto_tools"], + tool_parser=self.chat_settings["tool_parser"], + enable_prompt_tokens_details=self.chat_settings["enable_prompt_tokens_details"] ) if self.model_config.runner_type == "generate" else None self._model["openai_serving_completion"] = OpenAIServingCompletion( async_engine_client, self.model_config, - served_model_names=[self.vllm_engine_config["served_model_name"]], - lora_modules=self.vllm_model_config["lora_modules"], - prompt_adapters=self.vllm_model_config["prompt_adapters"], + self._model["openai_serving_models"], request_logger=request_logger, return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] ) if self.model_config.runner_type == "generate" else None self._model["openai_serving_embedding"] = OpenAIServingEmbedding( async_engine_client, self.model_config, - served_model_names=[self.vllm_engine_config["served_model_name"]], - request_logger=request_logger + self._model["openai_serving_models"], + request_logger=request_logger, + chat_template=self.vllm_model_config["chat_template"], + chat_template_content_format=self.chat_settings["chat_template_content_format"] ) if self.model_config.task == "embed" else None self._model["openai_serving_tokenization"] = OpenAIServingTokenization( async_engine_client, self.model_config, - served_model_names=[self.vllm_engine_config["served_model_name"]], - lora_modules=self.vllm_model_config["lora_modules"], + self._model["openai_serving_models"], request_logger=request_logger, - chat_template=self.vllm_model_config["chat_template"] + 
chat_template=self.vllm_model_config["chat_template"], + chat_template_content_format=self.chat_settings["chat_template_content_format"] ) return self._model diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md index 8abb6d3..33668cf 100644 --- a/examples/vllm/readme.md +++ b/examples/vllm/readme.md @@ -4,52 +4,37 @@ 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) -2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.5.4"` +2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.7.3,prometheus_client==0.21.1"` 3. Create model endpoint: -`clearml-serving --id model add --model-id --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples"` + ``` + clearml-serving --id model add --model-id --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" + ``` Or auto update - `clearml-serving --id model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2` + ``` + clearml-serving --id model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2 + ``` Or add Canary endpoint - `clearml-serving --id model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm` + ``` + clearml-serving --id model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm + ``` 4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. - Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` + Or you can run the clearml-serving container independently: + ``` + docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest + ``` 5. 
Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): -```python - -import openai -openai.api_key = "dummy" -openai.api_base = f"http://serving.apps.okd.mts.ai/clearml/v1" - - -r0 = await openai.ChatCompletion.acreate( - model=vllm_endpoint, - messages=[{"role": "system", "content": ""}, {"role": "user", "content": "Hi there, goodman!"}], - temperature=1.0, - max_tokens=1024, - top_p=1.0, - request_timeout=10000, -) + ```bash + python examples/vllm/test_openai_app.py + ``` -print(f"ChatCompletion: {r0['choices'][0]['message']}") - -r1 = await openai.Completion.acreate( - model=vllm_endpoint, - prompt="Hi there, goodman!", - temperature=1.0, - max_tokens=256, -) - -print(f"Completion: \n {r1['choices'][0]['text']}") - -``` NOTE! If you want to use send_request method, keep in mind that you have to pass "completions" or "chat/completions" in entrypoint (and pass model as a part of "data" parameter) and use it for non-streaming models: diff --git a/examples/vllm/test_openai_api.py b/examples/vllm/test_openai_api.py new file mode 100644 index 0000000..4290427 --- /dev/null +++ b/examples/vllm/test_openai_api.py @@ -0,0 +1,32 @@ +from openai import OpenAI + +def main(model_name: str = "test_vllm"): + client = OpenAI(api_key="-") + client.base_url = "http://127.0.0.1:8080/serve/openai/v1" + + chat_response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": ""}, + {"role": "user", "content": "Hi there, goodman!"} + ], + temperature=1.0, + max_tokens=1024, + top_p=1.0 + ) + + print(f"ChatCompletion: {chat_response.choices[0].message}") + + comp_response = client.completions.create( + model=model_name, + prompt="Hi there, goodman!", + temperature=1.0, + max_tokens=256 + ) + + print(f"Completion: \n {comp_response.choices[0].text}") + + return None + +if __name__ == '__main__': + main() \ No newline at end of file From 9441ae8473a078627e8b8acb073416565d4b1911 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Mon, 10 Mar 2025 23:52:14 +0300 Subject: [PATCH 12/22] move engine init in separate class --- clearml_serving/serving/preprocess_service.py | 261 +++++++++++++----- docker/docker-compose-gpu.yml | 1 + examples/vllm/preprocess.py | 160 ++++++----- examples/vllm/test_openai_api.py | 2 +- 4 files changed, 273 insertions(+), 151 deletions(-) diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 0e7d15a..09e5ded 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -1,4 +1,5 @@ import os +import json import sys import threading import traceback @@ -14,19 +15,12 @@ from .endpoints import ModelEndpoint -# try: -# import prometheus_client -# from fastapi.responses import JSONResponse, StreamingResponse -# from vllm.entrypoints.openai.protocol import ( -# ChatCompletionRequest, -# ChatCompletionResponse, -# CompletionRequest, -# ErrorResponse -# ) -# from vllm.logger import init_logger -# except ImportError: -# print("WARNING: install vllm==0.5.4 and prometheus_client==0.21.1 to serve vllm engine") - +class Singleton(object): + _instance = None + def __new__(class_, *args, **kwargs): + if not isinstance(class_._instance, class_): + class_._instance = object.__new__(class_, *args, **kwargs) + return class_._instance class BasePreprocessRequest(object): __preprocessing_lookup = {} @@ -611,41 +605,21 @@ async def _preprocess_send_request(_, endpoint: str, 
version: str = None, data: return return_value.json() -@BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) -class VllmPreprocessRequest(BasePreprocessRequest): - is_preprocess_async = True - is_process_async = True - is_postprocess_async = True - asyncio_to_thread = None +class VllmEngine(Singleton): + _model = None _vllm = None _fastapi = None + is_already_loaded = False - class CustomRequest: - def __init__(self, headers: Optional[dict] = None): - self.headers = headers - - async def is_disconnected(self): - return False - - def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): - super(VllmPreprocessRequest, self).__init__( - model_endpoint=model_endpoint, task=task) + def __init__(self): # load vLLM Modules if self._vllm is None: - from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, - ChatCompletionResponse, - CompletionRequest, - CompletionResponse, - ErrorResponse - ) + from vllm import entrypoints, engine, usage self._vllm = {} - self._vllm["chat_completion_request"] = ChatCompletionRequest - self._vllm["chat_completion_response"] = ChatCompletionResponse - self._vllm["completion_request"] = CompletionRequest - self._vllm["completion_response"] = CompletionResponse - self._vllm["error_response"] = ErrorResponse + self._vllm["entrypoints"] = entrypoints + self._vllm["engine"] = engine + self._vllm["usage"] = usage if self._fastapi is None: from fastapi.responses import JSONResponse, StreamingResponse @@ -656,16 +630,189 @@ def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): from vllm.logger import init_logger self.logger = init_logger(__name__) - if VllmPreprocessRequest.asyncio_to_thread is None: - from asyncio import to_thread as asyncio_to_thread - VllmPreprocessRequest.asyncio_to_thread = asyncio_to_thread - import socket import prometheus_client with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: if not s.connect_ex(('localhost', 8000)) == 0: prometheus_client.start_http_server(8000) + def load_engine( + self, + name: str, + model_path: str, + vllm_model_config: dict, + chat_settings: dict + ): + if self.is_already_loaded: + self.add_models(name=name, model_path=model_path) + return None + + vllm_engine_config = json.loads(os.environ.get("VLLM_ENGINE_ARGS")) + engine_args = self._vllm["engine"].arg_utils.AsyncEngineArgs(**vllm_engine_config) + async_engine_client = self._vllm["engine"].async_llm_engine.AsyncLLMEngine.from_engine_args( + engine_args, + usage_context=self._vllm["usage"].usage_lib.UsageContext.OPENAI_API_SERVER + ) + model_config = async_engine_client.engine.get_model_config() + request_logger = self._vllm["entrypoints"].logger.RequestLogger( + max_log_len=vllm_model_config["max_log_len"] + ) + self._model["openai_serving_models"] = self._vllm[ + "entrypoints" + ].openai.serving_models.OpenAIServingModels( + async_engine_client, + model_config, + [ + self._vllm["entrypoints"].openai.serving_models.BaseModelPath( + name=name, + model_path=model_path + ) + ], + lora_modules=svllm_model_config["lora_modules"], + prompt_adapters=vllm_model_config["prompt_adapters"], + ) + await self._model["openai_serving_models"].init_static_loras() + self._model["openai_serving"] = self._vllm[ + "entrypoints" + ].openai.serving_engine.OpenAIServing( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + ) + self._model["openai_serving_chat"] = self._vllm[ + 
"entrypoints" + ].openai.serving_chat.OpenAIServingChat( + async_engine_client, + model_config, + self._model["openai_serving_models"], + response_role=vllm_model_config["response_role"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + chat_template_content_format=chat_settings["chat_template_content_format"], + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"], + enable_reasoning=chat_settings["enable_reasoning"], + reasoning_parser=chat_settings["reasoning_parser"], + enable_auto_tools=chat_settings["enable_auto_tools"], + tool_parser=chat_settings["tool_parser"], + enable_prompt_tokens_details=chat_settings["enable_prompt_tokens_details"] + ) if model_config.runner_type == "generate" else None + self._model["openai_serving_completion"] = self._vllm[ + "entrypoints" + ].openai.serving_completion.OpenAIServingCompletion( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + ) if model_config.runner_type == "generate" else None + self._model["openai_serving_embedding"] = self._vllm[ + "entrypoints" + ].openai.serving_embedding.OpenAIServingEmbedding( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + chat_template_content_format=chat_settings["chat_template_content_format"] + ) if model_config.task == "embed" else None + self._model["openai_serving_tokenization"] = self._vllm[ + "entrypoints" + ].openai.serving_tokenization.OpenAIServingTokenization( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + chat_template_content_format=chat_settings["chat_template_content_format"] + ) + self.logger.info("vLLM Engine was successfully initialized") + self.is_already_loaded = True + return None + + def add_models(self, name: str, model_path: str): + self._model["openai_serving_models"].base_model_paths.append( + self._vllm["entrypoints"].openai.serving_models.BaseModelPath( + name=name, + model_path=model_path + ) + ) + self.logger.info("Model {} was added to vllm engine".format(name)) + return None + + async def completions( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Any: + """ + The actual processing function. 
+ We run the process in this context + """ + request, raw_request = data["request"], data["raw_request"] + # analog of completion(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 + handler = self._model["openai_serving_completion"] + if handler is None: + return self._model["openai_serving"].create_error_response( + message="The model does not support Completions API" + ) + generator = await handler.create_completion(request=request, raw_request=raw_request) + if isinstance(generator, self._vllm["entrypoints"].openai.protocol.ErrorResponse): + return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) + elif isinstance(generator, self._vllm["entrypoints"].openai.protocol.CompletionResponse): + return self._fastapi["json_response"](content=generator.model_dump()) + return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") + + + async def chat_completions( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Any: + """ + The actual processing function. + We run the process in this context + """ + request, raw_request = data["request"], data["raw_request"] + # analog of chat(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 + handler = self._model["openai_serving_chat"] + if handler is None: + return self._model["openai_serving"].create_error_response( + message="The model does not support Chat Completions API" + ) + generator = await handler.create_chat_completion(request=request, raw_request=raw_request) + if isinstance(generator, self._vllm["entrypoints"].openai.protocol.ErrorResponse): + return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) + elif isinstance(generator, self._vllm["entrypoints"].openai.protocol.ChatCompletionResponse): + return self._fastapi["json_response"](content=generator.model_dump()) + return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") + + +@BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) +class VllmPreprocessRequest(BasePreprocessRequest): + is_preprocess_async = True + is_process_async = True + is_postprocess_async = True + asyncio_to_thread = None + _vllm_engine = None + + def __init__(self, model_endpoint: ModelEndpoint, task: Task = None): + super(VllmPreprocessRequest, self).__init__( + model_endpoint=model_endpoint, task=task) + self._vllm_engine = VllmEngine() + self._vllm_engine.load_engine( + name=model_endpoint.serving_url, + model_path=self._get_local_model_file(), + **self._model + ) + + if VllmPreprocessRequest.asyncio_to_thread is None: + from asyncio import to_thread as asyncio_to_thread + VllmPreprocessRequest.asyncio_to_thread = asyncio_to_thread + # override `send_request` method with the async version self._preprocess.__class__.send_request = VllmPreprocessRequest._preprocess_send_request @@ -734,17 +881,7 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn The actual processing function. 
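+        Delegates the call to the shared VllmEngine singleton.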
We run the process in this context """ - request, raw_request = data["request"], data["raw_request"] - # analog of completion(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 - handler = self._model["openai_serving_completion"] - if handler is None: - return self._model["openai_serving"].create_error_response(message="The model does not support Completions API") - generator = await handler.create_completion(request=request, raw_request=raw_request) - if isinstance(generator, self._vllm["error_response"]): - return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - elif isinstance(generator, self._vllm["completion_response"]): - return self._fastapi["json_response"](content=generator.model_dump()) - return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") + return self._vllm_engine.completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: @@ -752,17 +889,7 @@ async def chat_completions(self, data: Any, state: dict, collect_custom_statisti The actual processing function. We run the process in this context """ - request, raw_request = data["request"], data["raw_request"] - # analog of chat(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 - handler = self._model["openai_serving_chat"] - if handler is None: - return self._model["openai_serving"].create_error_response(message="The model does not support Chat Completions API") - generator = await handler.create_chat_completion(request=request, raw_request=raw_request) - if isinstance(generator, self._vllm["error_response"]): - return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - elif isinstance(generator, self._vllm["chat_completion_response"]): - return self._fastapi["json_response"](content=generator.model_dump()) - return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") + return self._vllm_engine.chat_completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) @staticmethod diff --git a/docker/docker-compose-gpu.yml b/docker/docker-compose-gpu.yml index 0009fa8..221f6d0 100644 --- a/docker/docker-compose-gpu.yml +++ b/docker/docker-compose-gpu.yml @@ -105,6 +105,7 @@ services: GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} + VLLM_ENGINE_ARGS: ${VLLM_ENGINE_ARGS:-'{"disable_log_requests":true,"disable_log_stats":false,"gpu_memory_utilization":0.95,"quantization":null,"enforce_eager":true}'} depends_on: - kafka networks: diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py index b2c6ce0..aa0f13a 100644 --- a/examples/vllm/preprocess.py +++ b/examples/vllm/preprocess.py @@ -1,16 +1,5 @@ """Hugginface preprocessing module for ClearML Serving.""" from typing import Any, Optional, List, Callable, Union -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.serving_engine import OpenAIServing -from vllm.entrypoints.openai.serving_models import OpenAIServingModels, LoRAModulePath, PromptAdapterPath, BaseModelPath -from 
vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion -from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization -from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption -from vllm.usage.usage_lib import UsageContext class Preprocess: @@ -22,19 +11,19 @@ def __init__(self): def load(self, local_file_name: str) -> Optional[Any]: # noqa - self.vllm_engine_config = { - "model": local_file_name, - "tokenizer": local_file_name, - "disable_log_requests": True, - "disable_log_stats": False, - "gpu_memory_utilization": 0.9, - "quantization": None, - "enforce_eager": True, - "served_model_name": "test_vllm", - "dtype": "float16", - "max_model_len": 8192 - } - self.vllm_model_config = { + # vllm_engine_config = { + # "model": local_file_name, + # "tokenizer": local_file_name, + # "disable_log_requests": True, + # "disable_log_stats": False, + # "gpu_memory_utilization": 0.9, + # "quantization": None, + # "enforce_eager": True, + # "served_model_name": "test_vllm", + # "dtype": "float16", + # "max_model_len": 8192 + # } + vllm_model_config = { "lora_modules": None, # [LoRAModulePath(name=a, path=b)] "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)] "response_role": "assistant", @@ -42,7 +31,7 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa "return_tokens_as_token_ids": False, "max_log_len": None } - self.chat_settings = { + chat_settings = { "enable_reasoning": False, "reasoning_parser": None, "enable_auto_tools": False, @@ -50,64 +39,69 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa "enable_prompt_tokens_details": False, "chat_template_content_format": "auto" } - self._model = {} - self.engine_args = AsyncEngineArgs(**self.vllm_engine_config) - async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - self.model_config = async_engine_client.engine.get_model_config() - request_logger = RequestLogger(max_log_len=self.vllm_model_config["max_log_len"]) - self._model["openai_serving_models"] = OpenAIServingModels( - async_engine_client, - self.model_config, - [BaseModelPath(name=self.vllm_engine_config["served_model_name"], model_path=self.vllm_engine_config["model"])], - lora_modules=self.vllm_model_config["lora_modules"], - prompt_adapters=self.vllm_model_config["prompt_adapters"], - ) - self._model["openai_serving"] = OpenAIServing( - async_engine_client, - self.model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] - ) - self._model["openai_serving_chat"] = OpenAIServingChat( - async_engine_client, - self.model_config, - self._model["openai_serving_models"], - response_role=self.vllm_model_config["response_role"], - request_logger=request_logger, - chat_template=self.vllm_model_config["chat_template"], - chat_template_content_format=self.chat_settings["chat_template_content_format"], - return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"], - enable_reasoning=self.chat_settings["enable_reasoning"], - reasoning_parser=self.chat_settings["reasoning_parser"], - enable_auto_tools=self.chat_settings["enable_auto_tools"], - tool_parser=self.chat_settings["tool_parser"], - enable_prompt_tokens_details=self.chat_settings["enable_prompt_tokens_details"] - ) if 
self.model_config.runner_type == "generate" else None - self._model["openai_serving_completion"] = OpenAIServingCompletion( - async_engine_client, - self.model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] - ) if self.model_config.runner_type == "generate" else None - self._model["openai_serving_embedding"] = OpenAIServingEmbedding( - async_engine_client, - self.model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - chat_template=self.vllm_model_config["chat_template"], - chat_template_content_format=self.chat_settings["chat_template_content_format"] - ) if self.model_config.task == "embed" else None - self._model["openai_serving_tokenization"] = OpenAIServingTokenization( - async_engine_client, - self.model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - chat_template=self.vllm_model_config["chat_template"], - chat_template_content_format=self.chat_settings["chat_template_content_format"] - ) - return self._model + # self._model = {} + # engine_args = AsyncEngineArgs(**self.vllm_engine_config) + # async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) + # model_config = async_engine_client.engine.get_model_config() + # request_logger = RequestLogger(max_log_len=self.vllm_model_config["max_log_len"]) + # self._model["openai_serving_models"] = OpenAIServingModels( + # async_engine_client, + # self.model_config, + # [BaseModelPath(name=self.vllm_engine_config["served_model_name"], model_path=self.vllm_engine_config["model"])], + # lora_modules=self.vllm_model_config["lora_modules"], + # prompt_adapters=self.vllm_model_config["prompt_adapters"], + # ) + # self._model["openai_serving"] = OpenAIServing( + # async_engine_client, + # self.model_config, + # self._model["openai_serving_models"], + # request_logger=request_logger, + # return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] + # ) + # self._model["openai_serving_chat"] = OpenAIServingChat( + # async_engine_client, + # self.model_config, + # self._model["openai_serving_models"], + # response_role=self.vllm_model_config["response_role"], + # request_logger=request_logger, + # chat_template=self.vllm_model_config["chat_template"], + # chat_template_content_format=self.chat_settings["chat_template_content_format"], + # return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"], + # enable_reasoning=self.chat_settings["enable_reasoning"], + # reasoning_parser=self.chat_settings["reasoning_parser"], + # enable_auto_tools=self.chat_settings["enable_auto_tools"], + # tool_parser=self.chat_settings["tool_parser"], + # enable_prompt_tokens_details=self.chat_settings["enable_prompt_tokens_details"] + # ) if self.model_config.runner_type == "generate" else None + # self._model["openai_serving_completion"] = OpenAIServingCompletion( + # async_engine_client, + # self.model_config, + # self._model["openai_serving_models"], + # request_logger=request_logger, + # return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] + # ) if self.model_config.runner_type == "generate" else None + # self._model["openai_serving_embedding"] = OpenAIServingEmbedding( + # async_engine_client, + # self.model_config, + # self._model["openai_serving_models"], + # request_logger=request_logger, + # chat_template=self.vllm_model_config["chat_template"], + # 
chat_template_content_format=self.chat_settings["chat_template_content_format"] + # ) if self.model_config.task == "embed" else None + # self._model["openai_serving_tokenization"] = OpenAIServingTokenization( + # async_engine_client, + # self.model_config, + # self._model["openai_serving_models"], + # request_logger=request_logger, + # chat_template=self.vllm_model_config["chat_template"], + # chat_template_content_format=self.chat_settings["chat_template_content_format"] + # ) + # return self._model + return { + # "vllm_engine_config": vllm_engine_config, + "vllm_model_config": vllm_model_config, + "chat_settings": chat_settings + } def remove_extra_system_prompts(self, messages: List) -> List: system_messages_indices = [] diff --git a/examples/vllm/test_openai_api.py b/examples/vllm/test_openai_api.py index 4290427..55d7b27 100644 --- a/examples/vllm/test_openai_api.py +++ b/examples/vllm/test_openai_api.py @@ -15,7 +15,7 @@ def main(model_name: str = "test_vllm"): top_p=1.0 ) - print(f"ChatCompletion: {chat_response.choices[0].message}") + print(f"ChatCompletion: {chat_response.choices[0].message.content}") comp_response = client.completions.create( model=model_name, From 9bb0dbb1821d369dc9e5d47bc186946d12c19a74 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Tue, 11 Mar 2025 11:45:52 +0300 Subject: [PATCH 13/22] fix imports --- clearml_serving/serving/preprocess_service.py | 195 ++++++++++-------- docker/docker-compose-gpu.yml | 2 +- examples/vllm/preprocess.py | 71 ------- 3 files changed, 114 insertions(+), 154 deletions(-) diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 09e5ded..2d584c9 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -615,11 +615,38 @@ def __init__(self): # load vLLM Modules if self._vllm is None: - from vllm import entrypoints, engine, usage - self._vllm = {} - self._vllm["entrypoints"] = entrypoints - self._vllm["engine"] = engine - self._vllm["usage"] = usage + # from vllm import entrypoints, engine, usage + from vllm.engine.arg_utils import AsyncEngineArgs + from vllm.engine.async_llm_engine import AsyncLLMEngine + from vllm.entrypoints.logger import RequestLogger + from vllm.entrypoints.openai.serving_engine import OpenAIServing + from vllm.entrypoints.openai.serving_models import OpenAIServingModels, LoRAModulePath, PromptAdapterPath, BaseModelPath + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding + from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization + from vllm.entrypoints.openai.protocol import ChatCompletionResponse, CompletionResponse, ErrorResponse + from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption + from vllm.usage.usage_lib import UsageContext + self._vllm = { + "AsyncEngineArgs": AsyncEngineArgs, + "AsyncLLMEngine": AsyncLLMEngine, + "RequestLogger": RequestLogger, + "OpenAIServing": OpenAIServing, + "OpenAIServingModels": OpenAIServingModels, + "LoRAModulePath": LoRAModulePath, + "PromptAdapterPath": PromptAdapterPath, + "BaseModelPath": BaseModelPath, + "OpenAIServingChat": OpenAIServingChat, + "OpenAIServingCompletion": OpenAIServingCompletion, + "OpenAIServingEmbedding": OpenAIServingEmbedding, + "OpenAIServingTokenization": OpenAIServingTokenization, + "ChatCompletionResponse": 
ChatCompletionResponse, + "CompletionResponse": CompletionResponse, + "ErrorResponse": ErrorResponse, + "ChatTemplateContentFormatOption": ChatTemplateContentFormatOption, + "UsageContext": UsageContext + } if self._fastapi is None: from fastapi.responses import JSONResponse, StreamingResponse @@ -647,85 +674,75 @@ def load_engine( self.add_models(name=name, model_path=model_path) return None - vllm_engine_config = json.loads(os.environ.get("VLLM_ENGINE_ARGS")) - engine_args = self._vllm["engine"].arg_utils.AsyncEngineArgs(**vllm_engine_config) - async_engine_client = self._vllm["engine"].async_llm_engine.AsyncLLMEngine.from_engine_args( + vllm_engine_config = json.loads(os.environ.get("VLLM_ENGINE_ARGS").replace("'", "")) + vllm_engine_config["model"] = model_path + vllm_engine_config["served_model_name"] = name + engine_args = self._vllm["AsyncEngineArgs"](**vllm_engine_config) + async_engine_client = self._vllm["AsyncLLMEngine"].from_engine_args( engine_args, - usage_context=self._vllm["usage"].usage_lib.UsageContext.OPENAI_API_SERVER + usage_context=self._vllm["UsageContext"].OPENAI_API_SERVER ) model_config = async_engine_client.engine.get_model_config() - request_logger = self._vllm["entrypoints"].logger.RequestLogger( + request_logger = self._vllm["RequestLogger"]( max_log_len=vllm_model_config["max_log_len"] ) - self._model["openai_serving_models"] = self._vllm[ - "entrypoints" - ].openai.serving_models.OpenAIServingModels( - async_engine_client, - model_config, - [ - self._vllm["entrypoints"].openai.serving_models.BaseModelPath( - name=name, - model_path=model_path - ) - ], - lora_modules=svllm_model_config["lora_modules"], - prompt_adapters=vllm_model_config["prompt_adapters"], + self._model["openai_serving_models"] = self._vllm["OpenAIServingModels"]( + async_engine_client, + model_config, + [ + self._vllm["BaseModelPath"]( + name=name, + model_path=model_path + ) + ], + lora_modules=vllm_model_config["lora_modules"], + prompt_adapters=vllm_model_config["prompt_adapters"], ) - await self._model["openai_serving_models"].init_static_loras() - self._model["openai_serving"] = self._vllm[ - "entrypoints" - ].openai.serving_engine.OpenAIServing( - async_engine_client, - model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + # await self._model["openai_serving_models"].init_static_loras() + self._model["openai_serving"] = self._vllm["OpenAIServing"]( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] ) - self._model["openai_serving_chat"] = self._vllm[ - "entrypoints" - ].openai.serving_chat.OpenAIServingChat( - async_engine_client, - model_config, - self._model["openai_serving_models"], - response_role=vllm_model_config["response_role"], - request_logger=request_logger, - chat_template=vllm_model_config["chat_template"], - chat_template_content_format=chat_settings["chat_template_content_format"], - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"], - enable_reasoning=chat_settings["enable_reasoning"], - reasoning_parser=chat_settings["reasoning_parser"], - enable_auto_tools=chat_settings["enable_auto_tools"], - tool_parser=chat_settings["tool_parser"], - enable_prompt_tokens_details=chat_settings["enable_prompt_tokens_details"] + self._model["openai_serving_chat"] = self._vllm["OpenAIServingChat"]( + 
async_engine_client, + model_config, + self._model["openai_serving_models"], + response_role=vllm_model_config["response_role"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + chat_template_content_format=chat_settings["chat_template_content_format"], + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"], + enable_reasoning=chat_settings["enable_reasoning"], + reasoning_parser=chat_settings["reasoning_parser"], + enable_auto_tools=chat_settings["enable_auto_tools"], + tool_parser=chat_settings["tool_parser"], + enable_prompt_tokens_details=chat_settings["enable_prompt_tokens_details"] ) if model_config.runner_type == "generate" else None - self._model["openai_serving_completion"] = self._vllm[ - "entrypoints" - ].openai.serving_completion.OpenAIServingCompletion( - async_engine_client, - model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] + self._model["openai_serving_completion"] = self._vllm["OpenAIServingCompletion"]( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] ) if model_config.runner_type == "generate" else None - self._model["openai_serving_embedding"] = self._vllm[ - "entrypoints" - ].openai.serving_embedding.OpenAIServingEmbedding( - async_engine_client, - model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - chat_template=vllm_model_config["chat_template"], - chat_template_content_format=chat_settings["chat_template_content_format"] + self._model["openai_serving_embedding"] = self._vllm["OpenAIServingEmbedding"]( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + chat_template_content_format=chat_settings["chat_template_content_format"] ) if model_config.task == "embed" else None - self._model["openai_serving_tokenization"] = self._vllm[ - "entrypoints" - ].openai.serving_tokenization.OpenAIServingTokenization( - async_engine_client, - model_config, - self._model["openai_serving_models"], - request_logger=request_logger, - chat_template=vllm_model_config["chat_template"], - chat_template_content_format=chat_settings["chat_template_content_format"] + self._model["openai_serving_tokenization"] = self._vllm["OpenAIServingTokenization"]( + async_engine_client, + model_config, + self._model["openai_serving_models"], + request_logger=request_logger, + chat_template=vllm_model_config["chat_template"], + chat_template_content_format=chat_settings["chat_template_content_format"] ) self.logger.info("vLLM Engine was successfully initialized") self.is_already_loaded = True @@ -733,7 +750,7 @@ def load_engine( def add_models(self, name: str, model_path: str): self._model["openai_serving_models"].base_model_paths.append( - self._vllm["entrypoints"].openai.serving_models.BaseModelPath( + self._vllm["BaseModelPath"]( name=name, model_path=model_path ) @@ -759,13 +776,12 @@ async def completions( message="The model does not support Completions API" ) generator = await handler.create_completion(request=request, raw_request=raw_request) - if isinstance(generator, self._vllm["entrypoints"].openai.protocol.ErrorResponse): + if isinstance(generator, self._vllm["ErrorResponse"]): return 
self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - elif isinstance(generator, self._vllm["entrypoints"].openai.protocol.CompletionResponse): + elif isinstance(generator, self._vllm["CompletionResponse"]): return self._fastapi["json_response"](content=generator.model_dump()) return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") - async def chat_completions( self, data: Any, @@ -784,12 +800,20 @@ async def chat_completions( message="The model does not support Chat Completions API" ) generator = await handler.create_chat_completion(request=request, raw_request=raw_request) - if isinstance(generator, self._vllm["entrypoints"].openai.protocol.ErrorResponse): + if isinstance(generator, self._vllm["ErrorResponse"]): return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code) - elif isinstance(generator, self._vllm["entrypoints"].openai.protocol.ChatCompletionResponse): + elif isinstance(generator, self._vllm["ChatCompletionResponse"]): return self._fastapi["json_response"](content=generator.model_dump()) return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream") + async def models( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Any: + pass + @BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) class VllmPreprocessRequest(BasePreprocessRequest): @@ -881,7 +905,7 @@ async def completions(self, data: Any, state: dict, collect_custom_statistics_fn The actual processing function. We run the process in this context """ - return self._vllm_engine.completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) + return await self._vllm_engine.completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: @@ -889,9 +913,16 @@ async def chat_completions(self, data: Any, state: dict, collect_custom_statisti The actual processing function. We run the process in this context """ - return self._vllm_engine.chat_completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) + return await self._vllm_engine.chat_completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) + async def models(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + """ + The actual processing function. 
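+        Expects data to carry the "request" and "raw_request" objects packed by the
+        OpenAI-compatible route; the call is awaited and served by the shared VllmEngine.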
+ We run the process in this context + """ + return self._vllm_engine.models(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) + @staticmethod async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]: endpoint = "/openai/v1/{}".format(endpoint.strip("/")) diff --git a/docker/docker-compose-gpu.yml b/docker/docker-compose-gpu.yml index 221f6d0..dbb063b 100644 --- a/docker/docker-compose-gpu.yml +++ b/docker/docker-compose-gpu.yml @@ -105,7 +105,7 @@ services: GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} - VLLM_ENGINE_ARGS: ${VLLM_ENGINE_ARGS:-'{"disable_log_requests":true,"disable_log_stats":false,"gpu_memory_utilization":0.95,"quantization":null,"enforce_eager":true}'} + VLLM_ENGINE_ARGS: ${VLLM_ENGINE_ARGS:-'{"disable_log_requests":true,"disable_log_stats":false,"gpu_memory_utilization":0.95,"enforce_eager":true}'} depends_on: - kafka networks: diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py index aa0f13a..001dd58 100644 --- a/examples/vllm/preprocess.py +++ b/examples/vllm/preprocess.py @@ -11,18 +11,6 @@ def __init__(self): def load(self, local_file_name: str) -> Optional[Any]: # noqa - # vllm_engine_config = { - # "model": local_file_name, - # "tokenizer": local_file_name, - # "disable_log_requests": True, - # "disable_log_stats": False, - # "gpu_memory_utilization": 0.9, - # "quantization": None, - # "enforce_eager": True, - # "served_model_name": "test_vllm", - # "dtype": "float16", - # "max_model_len": 8192 - # } vllm_model_config = { "lora_modules": None, # [LoRAModulePath(name=a, path=b)] "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)] @@ -39,66 +27,7 @@ def load(self, local_file_name: str) -> Optional[Any]: # noqa "enable_prompt_tokens_details": False, "chat_template_content_format": "auto" } - # self._model = {} - # engine_args = AsyncEngineArgs(**self.vllm_engine_config) - # async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - # model_config = async_engine_client.engine.get_model_config() - # request_logger = RequestLogger(max_log_len=self.vllm_model_config["max_log_len"]) - # self._model["openai_serving_models"] = OpenAIServingModels( - # async_engine_client, - # self.model_config, - # [BaseModelPath(name=self.vllm_engine_config["served_model_name"], model_path=self.vllm_engine_config["model"])], - # lora_modules=self.vllm_model_config["lora_modules"], - # prompt_adapters=self.vllm_model_config["prompt_adapters"], - # ) - # self._model["openai_serving"] = OpenAIServing( - # async_engine_client, - # self.model_config, - # self._model["openai_serving_models"], - # request_logger=request_logger, - # return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] - # ) - # self._model["openai_serving_chat"] = OpenAIServingChat( - # async_engine_client, - # self.model_config, - # self._model["openai_serving_models"], - # response_role=self.vllm_model_config["response_role"], - # request_logger=request_logger, - # chat_template=self.vllm_model_config["chat_template"], - # chat_template_content_format=self.chat_settings["chat_template_content_format"], - # return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"], - # enable_reasoning=self.chat_settings["enable_reasoning"], - # reasoning_parser=self.chat_settings["reasoning_parser"], - # 
enable_auto_tools=self.chat_settings["enable_auto_tools"], - # tool_parser=self.chat_settings["tool_parser"], - # enable_prompt_tokens_details=self.chat_settings["enable_prompt_tokens_details"] - # ) if self.model_config.runner_type == "generate" else None - # self._model["openai_serving_completion"] = OpenAIServingCompletion( - # async_engine_client, - # self.model_config, - # self._model["openai_serving_models"], - # request_logger=request_logger, - # return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"] - # ) if self.model_config.runner_type == "generate" else None - # self._model["openai_serving_embedding"] = OpenAIServingEmbedding( - # async_engine_client, - # self.model_config, - # self._model["openai_serving_models"], - # request_logger=request_logger, - # chat_template=self.vllm_model_config["chat_template"], - # chat_template_content_format=self.chat_settings["chat_template_content_format"] - # ) if self.model_config.task == "embed" else None - # self._model["openai_serving_tokenization"] = OpenAIServingTokenization( - # async_engine_client, - # self.model_config, - # self._model["openai_serving_models"], - # request_logger=request_logger, - # chat_template=self.vllm_model_config["chat_template"], - # chat_template_content_format=self.chat_settings["chat_template_content_format"] - # ) - # return self._model return { - # "vllm_engine_config": vllm_engine_config, "vllm_model_config": vllm_model_config, "chat_settings": chat_settings } From fedfcdadeb7da3e4b76d691a150b02f624993618 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Tue, 11 Mar 2025 22:42:59 +0300 Subject: [PATCH 14/22] add getattr for process methods --- .../serving/model_request_processor.py | 39 ++++--- clearml_serving/serving/preprocess_service.py | 103 ++++++++++++------ examples/vllm/preprocess.py | 1 - 3 files changed, 89 insertions(+), 54 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index eaa4b49..0cf9084 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1225,23 +1225,28 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod preprocessed = await processor.preprocess(body, state, stats_collect_fn) \ if processor.is_preprocess_async \ else processor.preprocess(body, state, stats_collect_fn) - if serve_type == "process": - # noinspection PyUnresolvedReferences - processed = await processor.process(preprocessed, state, stats_collect_fn) \ - if processor.is_process_async \ - else processor.process(preprocessed, state, stats_collect_fn) - elif serve_type == "completions": - # noinspection PyUnresolvedReferences - processed = await processor.completions(preprocessed, state, stats_collect_fn) \ - if processor.is_process_async \ - else processor.completions(preprocessed, state, stats_collect_fn) - elif serve_type == "chat/completions": - # noinspection PyUnresolvedReferences - processed = await processor.chat_completions(preprocessed, state, stats_collect_fn) \ - if processor.is_process_async \ - else processor.chat_completions(preprocessed, state, stats_collect_fn) - else: - raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}") + processed_func = getattr(processor, serve_type.replace("/", "_")) + # noinspection PyUnresolvedReferences + processed = await processed_func(preprocessed, state, stats_collect_fn) \ + if processor.is_process_async \ + 
else processed_func(preprocessed, state, stats_collect_fn) + # if serve_type == "process": + # # noinspection PyUnresolvedReferences + # processed = await processor.process(preprocessed, state, stats_collect_fn) \ + # if processor.is_process_async \ + # else processor.process(preprocessed, state, stats_collect_fn) + # elif serve_type == "completions": + # # noinspection PyUnresolvedReferences + # processed = await processor.completions(preprocessed, state, stats_collect_fn) \ + # if processor.is_process_async \ + # else processor.completions(preprocessed, state, stats_collect_fn) + # elif serve_type == "chat/completions": + # # noinspection PyUnresolvedReferences + # processed = await processor.chat_completions(preprocessed, state, stats_collect_fn) \ + # if processor.is_process_async \ + # else processor.chat_completions(preprocessed, state, stats_collect_fn) + # else: + # raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}") # noinspection PyUnresolvedReferences return_value = await processor.postprocess(processed, state, stats_collect_fn) \ if processor.is_postprocess_async \ diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 2d584c9..ed9f270 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -606,26 +606,32 @@ async def _preprocess_send_request(_, endpoint: str, version: str = None, data: class VllmEngine(Singleton): - _model = None _vllm = None _fastapi = None is_already_loaded = False - def __init__(self): - + def __init__(self) -> None: # load vLLM Modules if self._vllm is None: - # from vllm import entrypoints, engine, usage from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.serving_engine import OpenAIServing - from vllm.entrypoints.openai.serving_models import OpenAIServingModels, LoRAModulePath, PromptAdapterPath, BaseModelPath + from vllm.entrypoints.openai.serving_models import ( + OpenAIServingModels, + LoRAModulePath, + PromptAdapterPath, + BaseModelPath + ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization - from vllm.entrypoints.openai.protocol import ChatCompletionResponse, CompletionResponse, ErrorResponse + from vllm.entrypoints.openai.protocol import ( + ChatCompletionResponse, + CompletionResponse, + ErrorResponse + ) from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.usage.usage_lib import UsageContext self._vllm = { @@ -669,7 +675,7 @@ def load_engine( model_path: str, vllm_model_config: dict, chat_settings: dict - ): + ) -> None: if self.is_already_loaded: self.add_models(name=name, model_path=model_path) return None @@ -686,7 +692,7 @@ def load_engine( request_logger = self._vllm["RequestLogger"]( max_log_len=vllm_model_config["max_log_len"] ) - self._model["openai_serving_models"] = self._vllm["OpenAIServingModels"]( + self.openai_serving_models = self._vllm["OpenAIServingModels"]( async_engine_client, model_config, [ @@ -698,18 +704,18 @@ def load_engine( lora_modules=vllm_model_config["lora_modules"], prompt_adapters=vllm_model_config["prompt_adapters"], ) - # await 
self._model["openai_serving_models"].init_static_loras() - self._model["openai_serving"] = self._vllm["OpenAIServing"]( + # await self.openai_serving_models.init_static_loras() + self.openai_serving = self._vllm["OpenAIServing"]( async_engine_client, model_config, - self._model["openai_serving_models"], + self.openai_serving_models, request_logger=request_logger, return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] ) - self._model["openai_serving_chat"] = self._vllm["OpenAIServingChat"]( + self.openai_serving_chat = self._vllm["OpenAIServingChat"]( async_engine_client, model_config, - self._model["openai_serving_models"], + self.openai_serving_models, response_role=vllm_model_config["response_role"], request_logger=request_logger, chat_template=vllm_model_config["chat_template"], @@ -721,25 +727,25 @@ def load_engine( tool_parser=chat_settings["tool_parser"], enable_prompt_tokens_details=chat_settings["enable_prompt_tokens_details"] ) if model_config.runner_type == "generate" else None - self._model["openai_serving_completion"] = self._vllm["OpenAIServingCompletion"]( + self.openai_serving_completion = self._vllm["OpenAIServingCompletion"]( async_engine_client, model_config, - self._model["openai_serving_models"], + self.openai_serving_models, request_logger=request_logger, return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"] ) if model_config.runner_type == "generate" else None - self._model["openai_serving_embedding"] = self._vllm["OpenAIServingEmbedding"]( + self.openai_serving_embedding = self._vllm["OpenAIServingEmbedding"]( async_engine_client, model_config, - self._model["openai_serving_models"], + self.openai_serving_models, request_logger=request_logger, chat_template=vllm_model_config["chat_template"], chat_template_content_format=chat_settings["chat_template_content_format"] ) if model_config.task == "embed" else None - self._model["openai_serving_tokenization"] = self._vllm["OpenAIServingTokenization"]( + self.openai_serving_tokenization = self._vllm["OpenAIServingTokenization"]( async_engine_client, model_config, - self._model["openai_serving_models"], + self.openai_serving_models, request_logger=request_logger, chat_template=vllm_model_config["chat_template"], chat_template_content_format=chat_settings["chat_template_content_format"] @@ -748,8 +754,8 @@ def load_engine( self.is_already_loaded = True return None - def add_models(self, name: str, model_path: str): - self._model["openai_serving_models"].base_model_paths.append( + def add_models(self, name: str, model_path: str) -> None: + self.openai_serving_models.base_model_paths.append( self._vllm["BaseModelPath"]( name=name, model_path=model_path @@ -769,10 +775,9 @@ async def completions( We run the process in this context """ request, raw_request = data["request"], data["raw_request"] - # analog of completion(raw_request) in https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 - handler = self._model["openai_serving_completion"] + handler = self.openai_serving_completion if handler is None: - return self._model["openai_serving"].create_error_response( + return self.openai_serving.create_error_response( message="The model does not support Completions API" ) generator = await handler.create_completion(request=request, raw_request=raw_request) @@ -793,10 +798,9 @@ async def chat_completions( We run the process in this context """ request, raw_request = data["request"], data["raw_request"] - # analog of chat(raw_request) in 
https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/openai/api_server.py#L405 - handler = self._model["openai_serving_chat"] + handler = self.openai_serving_chat if handler is None: - return self._model["openai_serving"].create_error_response( + return self.openai_serving.create_error_response( message="The model does not support Chat Completions API" ) generator = await handler.create_chat_completion(request=request, raw_request=raw_request) @@ -812,7 +816,9 @@ async def models( state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None ) -> Any: - pass + request, raw_request = data["request"], data["raw_request"] + models_ = await self.openai_serving_models.show_available_models() + return JSONResponse(content=models_.model_dump()) @BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) @@ -900,28 +906,53 @@ async def postprocess( return await self._preprocess.postprocess(data, state, collect_custom_statistics_fn) return data - async def completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + async def completions( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Any: """ The actual processing function. We run the process in this context """ - return await self._vllm_engine.completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) - + return await self._vllm_engine.completions( + data=data, + state=state, + collect_custom_statistics_fn=collect_custom_statistics_fn + ) - async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + async def chat_completions( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Any: """ The actual processing function. We run the process in this context """ - return await self._vllm_engine.chat_completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) - + return await self._vllm_engine.chat_completions( + data=data, + state=state, + collect_custom_statistics_fn=collect_custom_statistics_fn + ) - async def models(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any: + async def models( + self, + data: Any, + state: dict, + collect_custom_statistics_fn: Callable[[dict], None] = None + ) -> Any: """ The actual processing function. 
We run the process in this context """ - return self._vllm_engine.models(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn) + return await self._vllm_engine.models( + data=data, + state=state, + collect_custom_statistics_fn=collect_custom_statistics_fn + ) @staticmethod async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]: diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py index 001dd58..87ca1be 100644 --- a/examples/vllm/preprocess.py +++ b/examples/vllm/preprocess.py @@ -10,7 +10,6 @@ def __init__(self): self.model_endpoint = None def load(self, local_file_name: str) -> Optional[Any]: # noqa - vllm_model_config = { "lora_modules": None, # [LoRAModulePath(name=a, path=b)] "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)] From 25e294059650c01e8c0aa56829407b6f812397fb Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Tue, 11 Mar 2025 22:44:32 +0300 Subject: [PATCH 15/22] fix jsonresponse --- clearml_serving/serving/preprocess_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index ed9f270..b236833 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -818,7 +818,7 @@ async def models( ) -> Any: request, raw_request = data["request"], data["raw_request"] models_ = await self.openai_serving_models.show_available_models() - return JSONResponse(content=models_.model_dump()) + return self._fastapi["json_response"](content=models_.model_dump()) @BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"]) From 8ecb51f1db10175fd03aea0daf64d012b2d13ac9 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Wed, 12 Mar 2025 01:09:50 +0300 Subject: [PATCH 16/22] add models endpoint --- clearml_serving/serving/main.py | 2 +- .../serving/model_request_processor.py | 17 ----------------- clearml_serving/serving/preprocess_service.py | 8 ++++++++ examples/vllm/readme.md | 10 +++++++++- examples/vllm/test_openai_api.py | 12 +++++++----- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 4683838..b2540fe 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -206,7 +206,7 @@ async def validate_json_request(raw_request: Request): ) @router.post("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) -@router.post("/openai/v1/{endpoint_type:path}/", dependencies=[Depends(validate_json_request)]) +@router.get("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) async def openai_serve_model( endpoint_type: str, request: Union[CompletionRequest, ChatCompletionRequest], diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 0cf9084..11b6fd4 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1230,23 +1230,6 @@ async def _process_request(self, processor: BasePreprocessRequest, url: str, bod processed = await processed_func(preprocessed, state, stats_collect_fn) \ if processor.is_process_async \ else processed_func(preprocessed, state, stats_collect_fn) - # if serve_type == "process": - # # noinspection PyUnresolvedReferences - # processed = await processor.process(preprocessed, state, 
stats_collect_fn) \ - # if processor.is_process_async \ - # else processor.process(preprocessed, state, stats_collect_fn) - # elif serve_type == "completions": - # # noinspection PyUnresolvedReferences - # processed = await processor.completions(preprocessed, state, stats_collect_fn) \ - # if processor.is_process_async \ - # else processor.completions(preprocessed, state, stats_collect_fn) - # elif serve_type == "chat/completions": - # # noinspection PyUnresolvedReferences - # processed = await processor.chat_completions(preprocessed, state, stats_collect_fn) \ - # if processor.is_process_async \ - # else processor.chat_completions(preprocessed, state, stats_collect_fn) - # else: - # raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}") # noinspection PyUnresolvedReferences return_value = await processor.postprocess(processed, state, stats_collect_fn) \ if processor.is_postprocess_async \ diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index b236833..c0271d7 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -763,6 +763,14 @@ def add_models(self, name: str, model_path: str) -> None: ) self.logger.info("Model {} was added to vllm engine".format(name)) return None + + def remove_model(self, name: str) -> None: + self.openai_serving_models.base_model_paths = [ + model for model in self.openai_serving_models.base_model_paths + if model.name != name + ] + self.logger.info("Model {} was removed from vllm engine".format(name)) + return None async def completions( self, diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md index 33668cf..f85c645 100644 --- a/examples/vllm/readme.md +++ b/examples/vllm/readme.md @@ -32,9 +32,17 @@ 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): ```bash - python examples/vllm/test_openai_app.py + python examples/vllm/test_openai_api.py ``` + **Available routes**: + + + /v1/completions + + /v1/chat/completions + + /v1/models + + see [test_openai_api.py](test_openai_api.py) for more information. + NOTE! 
If you want to use send_request method, keep in mind that you have to pass "completions" or "chat/completions" in entrypoint (and pass model as a part of "data" parameter) and use it for non-streaming models: diff --git a/examples/vllm/test_openai_api.py b/examples/vllm/test_openai_api.py index 55d7b27..a6b908b 100644 --- a/examples/vllm/test_openai_api.py +++ b/examples/vllm/test_openai_api.py @@ -1,6 +1,6 @@ from openai import OpenAI -def main(model_name: str = "test_vllm"): +def main(model_name: str): client = OpenAI(api_key="-") client.base_url = "http://127.0.0.1:8080/serve/openai/v1" @@ -14,8 +14,7 @@ def main(model_name: str = "test_vllm"): max_tokens=1024, top_p=1.0 ) - - print(f"ChatCompletion: {chat_response.choices[0].message.content}") + print(f"ChatCompletion: \n\n {chat_response.choices[0].message.content}") comp_response = client.completions.create( model=model_name, @@ -23,10 +22,13 @@ def main(model_name: str = "test_vllm"): temperature=1.0, max_tokens=256 ) + print(f"\n\n Completion: \n\n {comp_response.choices[0].text}") - print(f"Completion: \n {comp_response.choices[0].text}") + fake_body = {"stream": False, "model": model_name, "prompt": "test"} + print(f"Models:\n") + print('\n\n'.join(map(str, client.models.list(extra_body=fake_body).data))) return None if __name__ == '__main__': - main() \ No newline at end of file + main(model_name="test_vllm") \ No newline at end of file From 10f887d449369a6a6b8b6dc5f9643997a96c914e Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Wed, 12 Mar 2025 02:20:19 +0300 Subject: [PATCH 17/22] add some sugar --- examples/vllm/test_openai_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/vllm/test_openai_api.py b/examples/vllm/test_openai_api.py index a6b908b..c3b7d76 100644 --- a/examples/vllm/test_openai_api.py +++ b/examples/vllm/test_openai_api.py @@ -14,7 +14,7 @@ def main(model_name: str): max_tokens=1024, top_p=1.0 ) - print(f"ChatCompletion: \n\n {chat_response.choices[0].message.content}") + print(f"\n\nChatCompletion:\n\n {chat_response.choices[0].message.content}") comp_response = client.completions.create( model=model_name, @@ -22,10 +22,10 @@ def main(model_name: str): temperature=1.0, max_tokens=256 ) - print(f"\n\n Completion: \n\n {comp_response.choices[0].text}") + print(f"\n\nCompletion:\n\n {comp_response.choices[0].text}") fake_body = {"stream": False, "model": model_name, "prompt": "test"} - print(f"Models:\n") + print(f"\n\nModels:\n") print('\n\n'.join(map(str, client.models.list(extra_body=fake_body).data))) return None From a2817e38dae833382618d06d2abebce0b22a6791 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Wed, 12 Mar 2025 14:24:00 +0300 Subject: [PATCH 18/22] small changes for pr --- clearml_serving/serving/Dockerfile | 2 +- clearml_serving/serving/main.py | 16 +++++++++------- clearml_serving/statistics/Dockerfile | 2 +- docker/prometheus.yml | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/clearml_serving/serving/Dockerfile b/clearml_serving/serving/Dockerfile index a2d6a47..bd817ea 100644 --- a/clearml_serving/serving/Dockerfile +++ b/clearml_serving/serving/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.11-bullseye ENV LC_ALL=C.UTF-8 # install base package -# RUN pip3 install --no-cache-dir clearml-serving +RUN pip3 install --no-cache-dir clearml-serving # get latest execution code from the git repository # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git diff --git a/clearml_serving/serving/main.py 
b/clearml_serving/serving/main.py index b2540fe..e092093 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -9,6 +9,8 @@ from fastapi.responses import PlainTextResponse from grpc.aio import AioRpcError +from http import HTTPStatus + from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest from starlette.background import BackgroundTask @@ -115,14 +117,14 @@ async def cuda_exception_handler(request, exc): async def process_with_exceptions( base_url: str, version: Optional[str], - request_body: Union[bytes, Dict[Any, Any]], + request: Union[bytes, Dict[Any, Any]], serve_type: str ): try: return_value = await processor.process_request( base_url=base_url, version=version, - request_body=request_body, + request_body=request, serve_type=serve_type ) except EndpointNotFoundException as ex: @@ -130,21 +132,21 @@ async def process_with_exceptions( except (EndpointModelLoadException, EndpointBackendEngineException) as ex: session_logger.report_text( "[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) ) ) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ServingInitializationException as ex: session_logger.report_text( "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( - instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) ) ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ValueError as ex: session_logger.report_text( "[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) ) ) if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): @@ -154,7 +156,7 @@ async def process_with_exceptions( except AioRpcError as ex: if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors: session_logger.report_text( - "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request_body) + "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request) ) elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors: session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code())) @@ -164,7 +166,7 @@ async def process_with_exceptions( except Exception as ex: session_logger.report_text( "[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request_body, "".join(traceback.format_exc()) + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) ) ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) diff --git a/clearml_serving/statistics/Dockerfile b/clearml_serving/statistics/Dockerfile index fa01199..a55ccc2 100644 --- a/clearml_serving/statistics/Dockerfile +++ b/clearml_serving/statistics/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.11-bullseye ENV LC_ALL=C.UTF-8 # install base package -# RUN pip3 install --no-cache-dir clearml-serving +RUN pip3 install --no-cache-dir clearml-serving # get latest execution code from the git repository # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git diff --git a/docker/prometheus.yml b/docker/prometheus.yml index da47e83..d93387b 100644 --- a/docker/prometheus.yml +++ b/docker/prometheus.yml @@ -26,4 +26,4 @@ scrape_configs: scrape_interval: 5s static_configs: - - targets: ['clearml-serving-inference:8000'] \ No newline at end of file + - targets: ['clearml-serving-inference:8000'] From 42d87380741d545e887e7b4486b3104253e3fb51 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Wed, 12 Mar 2025 14:28:11 +0300 Subject: [PATCH 19/22] add empty string --- examples/vllm/test_openai_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/vllm/test_openai_api.py b/examples/vllm/test_openai_api.py index c3b7d76..527f353 100644 --- a/examples/vllm/test_openai_api.py +++ b/examples/vllm/test_openai_api.py @@ -31,4 +31,4 @@ def main(model_name: str): return None if __name__ == '__main__': - main(model_name="test_vllm") \ No newline at end of file + main(model_name="test_vllm") From db3d4539eef03098baec7edf5653d644fa30cb42 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Wed, 12 Mar 2025 15:46:43 +0300 Subject: [PATCH 20/22] update readme --- examples/vllm/readme.md | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md index f85c645..3670a1c 100644 --- a/examples/vllm/readme.md +++ b/examples/vllm/readme.md @@ -4,29 +4,18 @@ 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) -2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.7.3,prometheus_client==0.21.1"` -3. 
Create model endpoint: - ``` - clearml-serving --id model add --model-id --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" - ``` - - Or auto update - - ``` - clearml-serving --id model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2 - ``` - - Or add Canary endpoint +2. Add vLLM engine parameters in `VLLM_ENGINE_ARGS` variable as it was done in [this file](/docker/docker-compose-gpu.yml#L108). Make sure to add any required additional packages (for your custom model) to the [requirements.txt](/clearml_serving/serving/requirements.txt) or [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.7.3 prometheus_client==0.21.1"` +3. Create model endpoint: ``` - clearml-serving --id model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm + clearml-serving --id model add --model-id --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" ``` 4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. Or you can run the clearml-serving container independently: ``` - docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest + docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving-inference:latest ``` 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): From 34c4a9df9138fddbae2983abb2d393c2c6c4f9ea Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Thu, 20 Mar 2025 02:26:54 +0300 Subject: [PATCH 21/22] update readme and fix docker-compose-gpu.yml --- docker/docker-compose-gpu.yml | 2 +- examples/vllm/readme.md | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/docker/docker-compose-gpu.yml b/docker/docker-compose-gpu.yml index dbb063b..bfeead7 100644 --- a/docker/docker-compose-gpu.yml +++ b/docker/docker-compose-gpu.yml @@ -75,7 +75,7 @@ services: clearml-serving-inference: - image: clearml-serving-inference:latest + image: allegroai/clearml-serving-inference:latest container_name: clearml-serving-inference restart: unless-stopped # optimize perforamnce diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md index 3670a1c..48b93be 100644 --- a/examples/vllm/readme.md +++ b/examples/vllm/readme.md @@ -11,12 +11,7 @@ clearml-serving --id model add --model-id --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" ``` -4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. - - Or you can run the clearml-serving container independently: - ``` - docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving-inference:latest - ``` +4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. To run docker-compose, see [docker-compose instructions](/README.md#nail_care-initial-setup), p. 
8 (and use [docker-compose-gpu.yml](/docker/docker-compose-gpu.yml) file for vllm on gpu and [docker-compose.yml](/docker/docker-compose.yml) otherwise) 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): @@ -32,6 +27,8 @@ see [test_openai_api.py](test_openai_api.py) for more information. +6. Check metrics using grafana (You have to select Prometheus as data source, all of vLLM metrics have "vllm:" prefix). For more information, see [Model monitoring and performance metrics](/README.md#bar_chart-model-monitoring-and-performance-metrics-bell) + NOTE! If you want to use send_request method, keep in mind that you have to pass "completions" or "chat/completions" in entrypoint (and pass model as a part of "data" parameter) and use it for non-streaming models: From 1c3bcc75324fc5b8de9db381bb2a1a2316227afc Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 Date: Thu, 20 Mar 2025 15:21:40 +0300 Subject: [PATCH 22/22] fix request_body --- clearml_serving/serving/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index e092093..8d44306 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -192,7 +192,7 @@ async def base_serve_model( return_value = await process_with_exceptions( base_url=model_id, version=version, - request_body=request, + request=request, serve_type="process" ) return return_value @@ -218,7 +218,7 @@ async def openai_serve_model( return_value = await process_with_exceptions( base_url=request.model, version=None, - request_body=combined_request, + request=combined_request, serve_type=endpoint_type ) return return_value
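
For reference, the OpenAI-compatible route wired up in the patches above can also be exercised without the `openai` client. The snippet below is a minimal sketch and not part of the patch series; it assumes the inference container is reachable at `127.0.0.1:8080` and that an endpoint named `test_vllm` has been registered, matching the example readme and `test_openai_api.py` in the diffs.

```python
import requests

# Minimal sketch: POST to the OpenAI-compatible chat/completions route exposed by
# the serving container. Host, port and the "test_vllm" endpoint name are taken
# from the vLLM example in this patch series and may differ in your deployment.
base_url = "http://127.0.0.1:8080/serve/openai/v1"

payload = {
    "model": "test_vllm",  # the clearml-serving endpoint name acts as the model id
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,
    "stream": False,  # keep the response as a single JSON document
}

response = requests.post(f"{base_url}/chat/completions", json=payload, timeout=120)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```

Keeping `"stream": False` returns one JSON body, which is the same non-streaming usage the example test script exercises against this route.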