
Commit c92c761

WorldExplored, vnadathur, gemini-code-assist[bot], and noooop authored and committed
[Frontend] Added chat-style multimodal support to /classify. (vllm-project#27516)
Signed-off-by: WorldExplored <[email protected]>
Signed-off-by: Srreyansh Sethi <[email protected]>
Signed-off-by: vnadathur <[email protected]>
Signed-off-by: wang.yuqi <[email protected]>
Co-authored-by: vnadathur <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: vnadathur <[email protected]>
Co-authored-by: wang.yuqi <[email protected]>
Co-authored-by: wang.yuqi <[email protected]>
Signed-off-by: George D. Torres <[email protected]>
1 parent f42888b commit c92c761

File tree

6 files changed (+318 -27 lines changed)

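This commit lets /classify accept chat-style "messages" (including multimodal content parts) alongside the existing plain "input" field. For orientation, a client-side request could look roughly like the following sketch; it assumes a server built from this commit is already running on localhost:8000 with a classification-capable vision-language model, and the video URL is a placeholder (the new tests below exercise the real flow):

import requests

# Chat-style /classify request: "messages" replaces the plain "input" field
# and may carry multimodal content parts such as a video URL.
payload = {
    "model": "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please classify this video."},
                {"type": "video_url", "video_url": {"url": "https://example.com/clip.mp4"}},
            ],
        }
    ],
}

resp = requests.post("http://localhost:8000/classify", json=payload)
resp.raise_for_status()
# Each result carries per-class probabilities, as asserted in the tests below.
print(resp.json()["data"][0]["probs"])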

tests/entrypoints/pooling/openai/test_classification.py

Lines changed: 10 additions & 0 deletions
@@ -46,6 +46,16 @@ def test_single_input_classification(server: RemoteOpenAIServer, model_name: str
     assert hasattr(output.data[0], "probs")
 
 
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
+    response = requests.post(
+        server.url_for("classify"),
+        json={"model": model_name, "input": "hello", "add_special_tokens": False},
+    )
+    response.raise_for_status()
+    ClassificationResponse.model_validate(response.json())
+
+
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
     input_texts = [
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+import requests
+
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.openai.protocol import ClassificationResponse
+
+VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
+MAXIMUM_VIDEOS = 1
+TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
+
+HF_OVERRIDES = {
+    "text_config": {
+        "architectures": ["Qwen2_5_VLForSequenceClassification"],
+    },
+}
+
+
+@pytest.fixture(scope="module")
+def server_vlm_classify():
+    args = [
+        "--runner",
+        "pooling",
+        "--max-model-len",
+        "5000",
+        "--enforce-eager",
+        "--limit-mm-per-prompt",
+        json.dumps({"video": MAXIMUM_VIDEOS}),
+    ]
+
+    with RemoteOpenAIServer(
+        VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
+    ) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
+def test_classify_accepts_chat_text_only(
+    server_vlm_classify: RemoteOpenAIServer, model_name: str
+) -> None:
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Please classify this text request."},
+            ],
+        }
+    ]
+
+    response = requests.post(
+        server_vlm_classify.url_for("classify"),
+        json={"model": model_name, "messages": messages},
+    )
+    response.raise_for_status()
+
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == model_name
+    assert len(output.data) == 1
+    assert len(output.data[0].probs) == 2
+    assert output.usage.prompt_tokens == 22
+
+
+@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
+def test_classify_accepts_chat_video_url(
+    server_vlm_classify: RemoteOpenAIServer, model_name: str
+) -> None:
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Please classify this video."},
+                {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
+            ],
+        }
+    ]
+
+    response = requests.post(
+        server_vlm_classify.url_for("classify"),
+        json={"model": model_name, "messages": messages},
+    )
+    response.raise_for_status()
+
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == model_name
+    assert len(output.data) == 1
+    assert len(output.data[0].probs) == 2
+    assert output.usage.prompt_tokens == 4807

vllm/entrypoints/openai/api_server.py

Lines changed: 3 additions & 0 deletions
@@ -1784,6 +1784,9 @@ async def init_app_state(
                 engine_client,
                 state.openai_serving_models,
                 request_logger=request_logger,
+                chat_template=resolved_chat_template,
+                chat_template_content_format=args.chat_template_content_format,
+                trust_request_chat_template=args.trust_request_chat_template,
                 log_error_stack=args.log_error_stack,
             )
             if "classify" in supported_tasks

vllm/entrypoints/openai/protocol.py

Lines changed: 113 additions & 3 deletions
@@ -2000,10 +2000,10 @@ class ScoreResponse(OpenAIBaseModel):
     usage: UsageInfo
 
 
-class ClassificationRequest(OpenAIBaseModel):
+class ClassificationCompletionRequest(OpenAIBaseModel):
     model: str | None = None
     input: list[str] | str
-    truncate_prompt_tokens: int | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
     user: str | None = None
 
     # --8<-- [start:classification-extra-params]
@@ -2015,7 +2015,21 @@ class ClassificationRequest(OpenAIBaseModel):
             "if the served model does not use priority scheduling."
         ),
     )
-
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
     softmax: bool | None = Field(
         default=None,
         description="softmax will be deprecated, please use use_activation instead.",
@@ -2040,6 +2054,102 @@ def to_pooling_params(self):
         )
 
 
+class ClassificationChatRequest(OpenAIBaseModel):
+    model: str | None = None
+    messages: list[ChatCompletionMessageParam]
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    user: str | None = None
+
+    # --8<-- [start:chat-classification-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax will be deprecated, please use use_activation instead.",
+    )
+
+    activation: bool | None = Field(
+        default=None,
+        description="activation will be deprecated, please use use_activation instead.",
+    )
+
+    use_activation: bool | None = Field(
+        default=None,
+        description="Whether to use activation for classification outputs. "
+        "Default is True.",
+    )
+    # --8<-- [end:chat-classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            use_activation=get_use_activation(self),
+        )
+
+
+ClassificationRequest: TypeAlias = (
+    ClassificationCompletionRequest | ClassificationChatRequest
+)
+
+
 class ClassificationData(OpenAIBaseModel):
     index: int
     label: str | None

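Since ClassificationRequest is now a union, downstream code can no longer assume every request carries an "input" field. A minimal sketch (not the actual vLLM handler) of branching on the two request shapes introduced above:

from vllm.entrypoints.openai.protocol import (
    ClassificationChatRequest,
    ClassificationCompletionRequest,
)


def describe(request) -> str:
    # ClassificationRequest is a TypeAlias, so callers branch on the
    # concrete members of the union.
    if isinstance(request, ClassificationChatRequest):
        return f"chat-style request with {len(request.messages)} message(s)"
    assert isinstance(request, ClassificationCompletionRequest)
    texts = [request.input] if isinstance(request.input, str) else request.input
    return f"completion-style request with {len(texts)} input string(s)"


# Both variants expose the same to_pooling_params() hook used by the server.
params = ClassificationCompletionRequest(input="hello").to_pooling_params()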