1 change: 1 addition & 0 deletions .github/workflows/unit-test.yml
@@ -64,6 +64,7 @@ jobs:
python3 -m pip install /root/packages/cu118/flash_attn-*.whl
python3 -m pip install -r requirements_cuda.txt -r requirements/test.txt
python3 -m pip install -e .
+ python3 -m pip install -U 'numpy<2.0'
- name: Check env
run: |
python3 -m pip list
39 changes: 35 additions & 4 deletions CMakeLists.txt
@@ -44,6 +44,8 @@ if (BUILD_TEST)
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
GIT_TAG v3.8.0
GIT_SHALLOW ON
+ GIT_PROGRESS TRUE
+ USES_TERMINAL_DOWNLOAD TRUE
EXCLUDE_FROM_ALL
)
FetchContent_MakeAvailable(Catch2)
@@ -53,8 +55,10 @@ endif()
FetchContent_Declare(
repo-cutlass
GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
- GIT_TAG v3.9.2
- GIT_SHALLOW ON
+ GIT_TAG v3.9.2
+ GIT_SHALLOW ON
+ GIT_PROGRESS TRUE
+ USES_TERMINAL_DOWNLOAD TRUE
EXCLUDE_FROM_ALL
)

@@ -66,13 +70,38 @@ FetchContent_MakeAvailable(repo-cutlass)
FetchContent_Declare(
yaml-cpp
GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git
- GIT_TAG 0.8.0
+ GIT_TAG 0.8.0
+ GIT_PROGRESS TRUE
+ USES_TERMINAL_DOWNLOAD TRUE
PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/yaml-cpp_cmake_policy.patch
- UPDATE_DISCONNECTED 1
+ UPDATE_DISCONNECTED 1
)
set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp")
FetchContent_MakeAvailable(yaml-cpp)

+ FetchContent_Declare(
+   xgrammar
+   GIT_REPOSITORY https://github.com/mlc-ai/xgrammar.git
+   GIT_TAG v0.1.25
+   GIT_SUBMODULES "3rdparty/dlpack"
+   GIT_PROGRESS TRUE
+   USES_TERMINAL_DOWNLOAD TRUE
+   UPDATE_DISCONNECTED 1
+ )
+
+ FetchContent_GetProperties(xgrammar)
+ if(NOT xgrammar_POPULATED)
+   # Fetch the content using previously declared details
+   FetchContent_Populate(xgrammar)
+
+   file(WRITE ${xgrammar_SOURCE_DIR}/config.cmake "set(XGRAMMAR_BUILD_PYTHON_BINDINGS OFF)\n")
+   if(NOT MSVC)
+     file(APPEND ${xgrammar_SOURCE_DIR}/config.cmake "set(CMAKE_CXX_FLAGS \"-Wno-error\")\n")
+   endif()
+
+   # Bring the populated content into the build
+   add_subdirectory(${xgrammar_SOURCE_DIR} ${xgrammar_BINARY_DIR})
+ endif()

# the environment variable
# ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0
@@ -266,7 +295,9 @@ add_subdirectory(src)
if (BUILD_PY_FFI)
if (CALL_FROM_SETUP_PY)
install(TARGETS _turbomind DESTINATION ${CMAKE_INSTALL_PREFIX})
+ install(TARGETS _xgrammar DESTINATION ${CMAKE_INSTALL_PREFIX})
else()
install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
+ install(TARGETS _xgrammar DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
endif()
endif ()
3 changes: 2 additions & 1 deletion debug.sh
@@ -1,4 +1,4 @@
- #!/bin/sh
+ #!/bin/bash -e

builder="-G Ninja"

@@ -15,4 +15,5 @@ cmake ${builder} .. \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DPYTHON_EXECUTABLE=$(which python3) \
+ -DFETCHCONTENT_QUIET=OFF \
-DBUILD_TEST=ON
4 changes: 3 additions & 1 deletion docs/en/advance/structed_output.md
@@ -1,6 +1,8 @@
# Structured output

- Currently, only the Pytorch backend has this capability. Therefore, whether you are using the pipeline or the api_server, please specify the use of the Pytorch backend.
+ Structured output, also known as guided decoding, forces the model to generate text that exactly matches a user-supplied JSON schema, grammar, or regex.
+ Both the PyTorch and Turbomind backends now support structured (schema-constrained) generation.
+ Below are examples for the pipeline API and the API server.

## pipeline

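For reference, the pipeline usage this doc change describes goes through `GenerationConfig.response_format`. Below is a minimal sketch, not part of this PR; the model path and prompt are placeholders, and the schema mirrors the `json_schema` example quoted elsewhere in this diff.

```python
# Hedged sketch: structured output through the pipeline API.
# Model path and prompt are placeholders.
from lmdeploy import GenerationConfig, pipeline

response_format = {
    'type': 'json_schema',
    'json_schema': {
        'name': 'test',
        'schema': {
            'properties': {'name': {'type': 'string'}},
            'required': ['name'],
            'type': 'object',
        },
    },
}

pipe = pipeline('internlm/internlm2_5-7b-chat')  # placeholder model
gen_config = GenerationConfig(response_format=response_format)
print(pipe(['Introduce yourself as a JSON object.'], gen_config=gen_config))
```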
4 changes: 3 additions & 1 deletion docs/zh_cn/advance/structed_output.md
@@ -1,6 +1,8 @@
# 结构化输出

- 目前只有 Pytorch 后端具有该能力。所以无论是使用 pipline 还是使用 api_server,请指定使用 pytorch 后端。
+ 结构化输出(也称为引导解码)会强制模型生成与用户提供的 JSON 模式、语法或正则表达式完全匹配的文本。
+ 当前,PyTorch 与 Turbomind 两个后端均已支持这种(受模式约束的)结构化生成。
+ 以下分别为 pipeline API 和 API 服务的使用示例。

## pipeline

3 changes: 2 additions & 1 deletion generate.sh
@@ -14,4 +14,5 @@ cmake ${builder} .. \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
- -DUSE_NVTX=ON
+ -DUSE_NVTX=ON \
+ -DFETCHCONTENT_QUIET=OFF
2 changes: 1 addition & 1 deletion lmdeploy/messages.py
@@ -63,7 +63,7 @@ class GenerationConfig:
around special tokens. The behavior of Fast tokenizers is to have
this to False. This is setup to True in slow tokenizers.
logprobs (int): Number of log probabilities to return per output token.
- response_format (Dict): Only pytorch backend support formatting
+ response_format (Dict): Generate responses according to given formatting.
response. Examples:
{
"type": "json_schema",
20 changes: 9 additions & 11 deletions lmdeploy/serve/openai/api_server.py
@@ -129,17 +129,17 @@ def create_error_response(status: HTTPStatus, message: str, error_type='invalid_
async def check_request(request) -> Optional[JSONResponse]:
"""Check if a request is valid."""
if hasattr(request, 'model') and request.model not in get_model_list():
- return create_error_response(HTTPStatus.NOT_FOUND, f'The model `{request.model}` does not exist.')
+ return create_error_response(HTTPStatus.NOT_FOUND, f'The model {request.model!r} does not exist.')
if hasattr(request, 'n') and request.n <= 0:
- return create_error_response(HTTPStatus.BAD_REQUEST, f'The n `{request.n}` must be a positive int.')
+ return create_error_response(HTTPStatus.BAD_REQUEST, f'The n {request.n!r} must be a positive int.')
if hasattr(request, 'top_p') and not (request.top_p > 0 and request.top_p <= 1):
- return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p `{request.top_p}` must be in (0, 1].')
+ return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p {request.top_p!r} must be in (0, 1].')
if hasattr(request, 'top_k') and request.top_k < 0:
return create_error_response(HTTPStatus.BAD_REQUEST,
- f'The top_k `{request.top_k}` cannot be a negative integer.')
+ f'The top_k {request.top_k!r} cannot be a negative integer.')
if hasattr(request, 'temperature') and not (request.temperature <= 2 and request.temperature >= 0):
return create_error_response(HTTPStatus.BAD_REQUEST,
- f'The temperature `{request.temperature}` must be in [0, 2]')
+ f'The temperature {request.temperature!r} must be in [0, 2]')
return


@@ -315,8 +315,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
1.0 means no penalty
- stop (str | List[str] | None): To stop generating further
tokens. Only accept stop words that's encoded to one token idex.
- - response_format (Dict | None): Only pytorch backend support formatting
-   response. Examples: `{"type": "json_schema", "json_schema": {"name":
+ - response_format (Dict | None): To generate response according to given
+   schema. Examples: `{"type": "json_schema", "json_schema": {"name":
"test","schema": {"properties": {"name": {"type": "string"}},
"required": ["name"], "type": "object"}}}`
or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`
@@ -365,7 +365,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
if error_check_ret is not None:
return error_check_ret
if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
- return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
+ return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id!r} is occupied.')

model_name = request.model
adapter_name = None
@@ -385,8 +385,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
gen_logprobs = request.top_logprobs
response_format = None
if request.response_format and request.response_format.type != 'text':
- if VariableInterface.async_engine.backend != 'pytorch':
-   return create_error_response(HTTPStatus.BAD_REQUEST, 'only pytorch backend can use response_format now')
response_format = request.response_format.model_dump()

if request.logit_bias is not None:
@@ -717,7 +715,7 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None
if error_check_ret is not None:
return error_check_ret
if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
- return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
+ return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id!r} is occupied.')

model_name = request.model
adapter_name = None
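With the pytorch-only guard removed above, `response_format` can be exercised against either backend through the API server. A hedged sketch with the OpenAI client, not part of this PR; base_url, api_key, and the prompt are placeholders.

```python
# Hedged sketch: send a schema-constrained request to a running api_server.
from openai import OpenAI

client = OpenAI(api_key='none', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id  # pick the served model
resp = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'Return a JSON object with a "name" field.'}],
    response_format={
        'type': 'json_schema',
        'json_schema': {
            'name': 'test',
            'schema': {
                'properties': {'name': {'type': 'string'}},
                'required': ['name'],
                'type': 'object',
            },
        },
    },
)
print(resp.choices[0].message.content)
```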
4 changes: 3 additions & 1 deletion lmdeploy/serve/openai/protocol.py
@@ -5,7 +5,7 @@
from typing import Any, Dict, List, Literal, Optional, Union

import shortuuid
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, ConfigDict, Field


class ErrorResponse(BaseModel):
@@ -90,10 +90,12 @@ class JsonSchema(BaseModel):
name: str
# description is not used since it depends on model
description: Optional[str] = None
+ # `schema` is a reserved field in Pydantic BaseModel
# use alias since pydantic does not support the OpenAI key `schema`
json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema', examples=[None])
# strict is not used
strict: Optional[bool] = False
+ model_config = ConfigDict(serialize_by_alias=True)


class ResponseFormat(BaseModel):
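Why the alias plus `serialize_by_alias` matters: `schema` collides with a reserved `BaseModel` attribute, so the field is stored as `json_schema` and must round-trip back to the OpenAI key `schema` on serialization. A minimal sketch, not part of this PR, assuming a pydantic version that provides `serialize_by_alias` (2.11+):

```python
# Hedged sketch: alias round-trip as used by JsonSchema above.
from typing import Any, Dict, Optional

from pydantic import BaseModel, ConfigDict, Field


class JsonSchemaDemo(BaseModel):
    name: str
    # populated via the alias, stored under a non-reserved field name
    json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema')
    model_config = ConfigDict(serialize_by_alias=True)


s = JsonSchemaDemo(name='test', schema={'type': 'object'})
print(s.model_dump())  # {'name': 'test', 'schema': {'type': 'object'}}
```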
6 changes: 3 additions & 3 deletions lmdeploy/serve/proxy/proxy.py
@@ -312,7 +312,7 @@ async def check_request_model(self, model_name) -> Optional[JSONResponse]:
"""Check if a request is valid."""
if model_name in self.model_list:
return
- ret = create_error_response(HTTPStatus.NOT_FOUND, f'The model `{model_name}` does not exist.')
+ ret = create_error_response(HTTPStatus.NOT_FOUND, f'The model {model_name!r} does not exist.')
return ret

def handle_unavailable_model(self, model_name):
@@ -538,8 +538,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
1.0 means no penalty
- stop (str | List[str] | None): To stop generating further
tokens. Only accept stop words that's encoded to one token idex.
- - response_format (Dict | None): Only pytorch backend support formatting
-   response. Examples: `{"type": "json_schema", "json_schema": {"name":
+ - response_format (Dict | None): To generate response according to given
+   schema. Examples: `{"type": "json_schema", "json_schema": {"name":
"test","schema": {"properties": {"name": {"type": "string"}},
"required": ["name"], "type": "object"}}}`
or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`