@@ -15,7 +15,6 @@
 import json
 import logging
 import os
-import random
 import uuid
 import warnings
 from copy import deepcopy
@@ -26,7 +25,7 @@
 from huggingface_hub.utils import is_torch_available

 from .tools import Tool
-from .utils import _is_package_available, encode_image_base64, make_image_url
+from .utils import _is_package_available, encode_image_base64, make_image_url, parse_json_blob


 if TYPE_CHECKING:
@@ -236,10 +235,34 @@ def get_clean_message_list(
     return output_message_list


+def get_tool_call_chat_message_from_text(text: str, tool_name_key: str, tool_arguments_key: str) -> ChatMessage:
+    tool_call_dictionary, text = parse_json_blob(text)
+    try:
+        tool_name = tool_call_dictionary[tool_name_key]
+    except Exception as e:
+        raise ValueError(
+            f"Key {tool_name_key=} not found in the generated tool call. Got keys: {list(tool_call_dictionary.keys())} instead"
+        ) from e
+    tool_arguments = tool_call_dictionary.get(tool_arguments_key, None)
+    return ChatMessage(
+        role="assistant",
+        content=text,
+        tool_calls=[
+            ChatMessageToolCall(
+                id=str(uuid.uuid4()),
+                type="function",
+                function=ChatMessageToolCallDefinition(name=tool_name, arguments=tool_arguments),
+            )
+        ],
+    )
+
+
 class Model:
-    def __init__(self, **kwargs):
+    def __init__(self, tool_name_key: str = "name", tool_arguments_key: str = "arguments", **kwargs):
         self.last_input_token_count = None
         self.last_output_token_count = None
+        self.tool_name_key = tool_name_key
+        self.tool_arguments_key = tool_arguments_key
         self.kwargs = kwargs

     def _prepare_completion_kwargs(
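
For illustration, a usage sketch of the new helper (not part of the diff; it assumes parse_json_blob returns the parsed JSON object plus the surrounding text, as the unpacking above implies):

    # Hypothetical example: a raw completion that embeds a JSON tool call.
    raw = 'Thought: I will search.\nAction:\n{"name": "web_search", "arguments": {"query": "smolagents"}}'
    msg = get_tool_call_chat_message_from_text(raw, tool_name_key="name", tool_arguments_key="arguments")
    call = msg.tool_calls[0]
    assert call.function.name == "web_search"
    assert call.function.arguments == {"query": "smolagents"}
    # If the blob lacks the "name" key, a ValueError listing the found keys is raised.
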
@@ -465,6 +488,104 @@ def __call__(
         return message


+class VLLMModel(Model):
+    """Model to use [vLLM](https://docs.vllm.ai/) for fast LLM inference and serving.
+
+    Parameters:
+        model_id (`str`):
+            The Hugging Face model ID to be used for inference.
+            This can be a path or model identifier from the Hugging Face model hub.
+    """
+
+    def __init__(self, model_id, **kwargs):
+        if not _is_package_available("vllm"):
+            raise ModuleNotFoundError("Please install 'vllm' extra to use VLLMModel: `pip install 'smolagents[vllm]'`")
+
+        from vllm import LLM
+        from vllm.transformers_utils.tokenizer import get_tokenizer
+
+        super().__init__(**kwargs)
+
+        self.model_id = model_id
+        self.model = LLM(model=model_id)
+        self.tokenizer = get_tokenizer(model_id)
+        self._is_vlm = False  # VLLMModel does not support vision models yet.
+
+    def cleanup(self):
+        import gc
+
+        import torch
+        from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
+
+        destroy_model_parallel()
+        if self.model is not None:
+            # taken from https://github.com/vllm-project/vllm/issues/1908#issuecomment-2076870351
+            del self.model.llm_engine.model_executor.driver_worker
+        self.model = None
+        gc.collect()
+        destroy_distributed_environment()
+        torch.cuda.empty_cache()
+
+    def __call__(
+        self,
+        messages: List[Dict[str, str]],
+        stop_sequences: Optional[List[str]] = None,
+        grammar: Optional[str] = None,
+        tools_to_call_from: Optional[List[Tool]] = None,
+        **kwargs,
+    ) -> ChatMessage:
+        from vllm import SamplingParams
+
+        completion_kwargs = self._prepare_completion_kwargs(
+            messages=messages,
+            flatten_messages_as_text=(not self._is_vlm),
+            stop_sequences=stop_sequences,
+            grammar=grammar,
+            tools_to_call_from=tools_to_call_from,
+            **kwargs,
+        )
+        messages = completion_kwargs.pop("messages")
+        prepared_stop_sequences = completion_kwargs.pop("stop", [])
+        tools = completion_kwargs.pop("tools", None)
+        completion_kwargs.pop("tool_choice", None)
+
+        if tools_to_call_from is not None:
+            prompt = self.tokenizer.apply_chat_template(
+                messages,
+                tools=tools,
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+        else:
+            prompt = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+            )
+
+        sampling_params = SamplingParams(
+            n=kwargs.get("n", 1),
+            temperature=kwargs.get("temperature", 0.0),
+            max_tokens=kwargs.get("max_tokens", 2048),
+            stop=prepared_stop_sequences,
+        )
+
+        out = self.model.generate(
+            prompt,
+            sampling_params=sampling_params,
+        )
+        output = out[0].outputs[0].text
+        self.last_input_token_count = len(out[0].prompt_token_ids)
+        self.last_output_token_count = len(out[0].outputs[0].token_ids)
+        if tools_to_call_from:
+            chat_message = get_tool_call_chat_message_from_text(output, self.tool_name_key, self.tool_arguments_key)
+            chat_message.raw = {"out": out, "completion_kwargs": completion_kwargs}
+            return chat_message
+        else:
+            return ChatMessage(
+                role="assistant", content=output, raw={"out": out, "completion_kwargs": completion_kwargs}
+            )
+
+
 class MLXModel(Model):
     """A class to interact with models loaded using MLX on Apple silicon.

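
For context, a minimal usage sketch of the VLLMModel class added above (assumes a CUDA GPU, the smolagents[vllm] extra, and that the class is importable from the package root per the __all__ change at the end of this diff; the model ID is only an example):

    from smolagents import VLLMModel

    model = VLLMModel(model_id="Qwen/Qwen2.5-7B-Instruct")
    message = model(
        messages=[{"role": "user", "content": "Say hello in one word."}],
        max_tokens=64,
    )
    print(message.content)
    model.cleanup()  # releases GPU memory and tears down vLLM's distributed state
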
@@ -523,27 +644,7 @@ def __init__(
         self.stream_generate = mlx_lm.stream_generate
         self.tool_name_key = tool_name_key
         self.tool_arguments_key = tool_arguments_key
-
-    def _to_message(self, text, tools_to_call_from):
-        if tools_to_call_from:
-            # solution for extracting tool JSON without assuming a specific model output format
-            maybe_json = "{" + text.split("{", 1)[-1][::-1].split("}", 1)[-1][::-1] + "}"
-            parsed_text = json.loads(maybe_json)
-            tool_name = parsed_text.get(self.tool_name_key, None)
-            tool_arguments = parsed_text.get(self.tool_arguments_key, None)
-            if tool_name:
-                return ChatMessage(
-                    role="assistant",
-                    content="",
-                    tool_calls=[
-                        ChatMessageToolCall(
-                            id=str(uuid.uuid4()),
-                            type="function",
-                            function=ChatMessageToolCallDefinition(name=tool_name, arguments=tool_arguments),
-                        )
-                    ],
-                )
-        return ChatMessage(role="assistant", content=text)
+        self._is_vlm = False  # mlx-lm doesn't support vision models

     def __call__(
         self,
@@ -554,7 +655,7 @@ def __call__(
         **kwargs,
     ) -> ChatMessage:
         completion_kwargs = self._prepare_completion_kwargs(
-            flatten_messages_as_text=True,  # mlx-lm doesn't support vision models
+            flatten_messages_as_text=(not self._is_vlm),
             messages=messages,
             stop_sequences=stop_sequences,
             grammar=grammar,
@@ -583,9 +684,19 @@ def __call__(
                 stop_sequence_start = text.rfind(stop_sequence)
                 if stop_sequence_start != -1:
                     text = text[:stop_sequence_start]
-                    return self._to_message(text, tools_to_call_from)
+                    found_stop_sequence = True
+                    break
+            if found_stop_sequence:
+                break

-        return self._to_message(text, tools_to_call_from)
+        if tools_to_call_from:
+            chat_message = get_tool_call_chat_message_from_text(text, self.tool_name_key, self.tool_arguments_key)
+            chat_message.raw = {"out": text, "completion_kwargs": completion_kwargs}
+            return chat_message
+        else:
+            return ChatMessage(
+                role="assistant", content=text, raw={"out": text, "completion_kwargs": completion_kwargs}
+            )


 class TransformersModel(Model):
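
Note that found_stop_sequence is only set inside the loop; the diff context implies it is initialized to False earlier in __call__, outside the lines shown. A standalone sketch of the early-stop pattern:

    # Toy stand-in for the streaming loop above; the token list mimics
    # chunks coming from self.stream_generate(...).
    text = ""
    found_stop_sequence = False
    for token in ["Hel", "lo<", "end", ">tail"]:
        text += token
        for stop_sequence in ["<end>"]:
            stop_sequence_start = text.rfind(stop_sequence)
            if stop_sequence_start != -1:
                text = text[:stop_sequence_start]  # drop the stop marker and anything after it
                found_stop_sequence = True
                break
        if found_stop_sequence:
            break
    print(text)  # prints "Hello"
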
@@ -779,38 +890,14 @@ def __call__(
         if stop_sequences is not None:
             output = remove_stop_sequences(output, stop_sequences)

-        if tools_to_call_from is None:
-            return ChatMessage(
-                role="assistant",
-                content=output,
-                raw={"out": out, "completion_kwargs": completion_kwargs},
-            )
+        if tools_to_call_from:
+            chat_message = get_tool_call_chat_message_from_text(output, self.tool_name_key, self.tool_arguments_key)
+            chat_message.raw = {"out": out, "completion_kwargs": completion_kwargs}
+            return chat_message
         else:
-            if "Action:" in output:
-                output = output.split("Action:", 1)[1].strip()
-            try:
-                start_index = output.index("{")
-                end_index = output.rindex("}")
-                output = output[start_index : end_index + 1]
-            except Exception as e:
-                raise Exception("No json blob found in output!") from e
-
-            try:
-                parsed_output = json.loads(output)
-            except json.JSONDecodeError as e:
-                raise ValueError(f"Tool call '{output}' has an invalid JSON structure: {e}")
-            tool_name = parsed_output.get("name")
-            tool_arguments = parsed_output.get("arguments")
             return ChatMessage(
                 role="assistant",
-                content="",
-                tool_calls=[
-                    ChatMessageToolCall(
-                        id="".join(random.choices("0123456789", k=5)),
-                        type="function",
-                        function=ChatMessageToolCallDefinition(name=tool_name, arguments=tool_arguments),
-                    )
-                ],
+                content=output,
                 raw={"out": out, "completion_kwargs": completion_kwargs},
             )

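
This hunk delegates the old inline extraction (split on "Action:", slice the outermost braces, json.loads) to the shared parse_json_blob helper. A toy reimplementation of the assumed contract, matching how get_tool_call_chat_message_from_text unpacks its two return values; the real smolagents implementation may differ:

    import json

    def parse_json_blob_sketch(text):
        # Hypothetical stand-in: parse the outermost {...} blob and return it
        # together with the text preceding it.
        start, end = text.index("{"), text.rindex("}")
        return json.loads(text[start : end + 1]), text[:start]

    blob, prefix = parse_json_blob_sketch('Action:\n{"name": "final_answer", "arguments": "done"}')
    assert blob["name"] == "final_answer"
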
@@ -1051,6 +1138,7 @@ def create_client(self):
     "HfApiModel",
     "LiteLLMModel",
     "OpenAIServerModel",
+    "VLLMModel",
     "AzureOpenAIServerModel",
     "ChatMessage",
 ]