4 changes: 3 additions & 1 deletion docs/mkdocs/hooks/generate_argparse.py
@@ -65,7 +65,9 @@ def auto_mock(module, attr, max_mocks=50):
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
-FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
+FlexibleArgumentParser = auto_mock(
+    "vllm.utils.argparse_utils", "FlexibleArgumentParser"
+)


class MarkdownFormatter(HelpFormatter):
4 changes: 1 addition & 3 deletions tests/utils.py
@@ -45,9 +45,7 @@
from vllm.model_executor.model_loader import get_model_loader
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import (
-    FlexibleArgumentParser,
-)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GB_bytes
from vllm.utils.network_utils import get_open_port
from vllm.utils.torch_utils import cuda_device_count_stateless
93 changes: 2 additions & 91 deletions tests/utils_/test_utils.py → tests/utils_/test_argparse_utils.py
@@ -4,23 +4,15 @@

import json
import os
-import tempfile
-from pathlib import Path
-from unittest.mock import patch

import pytest
-import torch
import yaml
from transformers import AutoTokenizer

-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product


# Tests for FlexibleArgumentParser
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text


-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-        with set_current_vllm_config(cfg):
-            from vllm.attention import Attention
-
-            ctx = {
-                "layers.0.self_attn": Attention(32, 128, 0.1),
-            }
-            kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-            bind_kv_cache(ctx, kv_cache)
-            assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-            assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]


def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
Comment on lines 251 to 253

critical

The FlexibleArgumentParser.parse_args method has a bug that causes an IndexError if vllm serve is called with --model as the last argument without a value. This should be handled gracefully by argparse (which would raise a SystemExit), but the current pre-processing logic crashes instead.

The crash occurs because the code at vllm/utils/argparse_utils.py:227 accesses args[model_idx + 1] without checking if the index is valid.

To ensure robust error handling and prevent this crash, I recommend adding a test case to expose this bug. This will help in fixing the underlying issue in FlexibleArgumentParser.parse_args and prevent future regressions.

Here is a suggested test case to add to this file:

def test_model_option_no_value(parser_with_config):
    with pytest.raises(SystemExit):
        parser_with_config.parse_args(["serve", "--model"])
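
For context, here is a minimal sketch of the bounds check the comment is asking for, written as a standalone helper rather than as vLLM's actual parse_args pre-processing (the helper name _peek_model_value and its structure are illustrative assumptions, not code from argparse_utils.py):

def _peek_model_value(args):
    # Return the value that follows "--model", or None when no value exists.
    if "--model" not in args:
        return None
    model_idx = args.index("--model")
    if model_idx + 1 >= len(args):
        # "--model" is the last token: return None and let argparse raise
        # SystemExit ("expected one argument") instead of an IndexError.
        return None
    return args[model_idx + 1]

With a guard like this in the pre-processing step, the suggested test above passes through argparse's normal SystemExit path rather than crashing.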

2 changes: 1 addition & 1 deletion tests/utils_/test_serial_utils.py
@@ -14,7 +14,7 @@

@pytest.mark.parametrize("endianness", ENDIANNESS)
@pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
-@torch.inference_mode
+@torch.inference_mode()
def test_encode_and_decode(embed_dtype: str, endianness: str):
    for i in range(10):
        tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
@@ -42,7 +42,7 @@
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv

if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata
@@ -44,7 +44,8 @@
)
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
from vllm.sampling_params import SamplingParams
-from vllm.utils import cdiv, get_kv_cache_torch_dtype
+from vllm.utils import get_kv_cache_torch_dtype
+from vllm.utils.math_utils import cdiv
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.version import __version__ as VLLM_VERSION

3 changes: 2 additions & 1 deletion vllm/entrypoints/anthropic/api_server.py
@@ -51,7 +51,8 @@
    with_cancellation,
)
from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser, set_ulimit
+from vllm.utils import set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import is_valid_ipv6_address
from vllm.version import __version__ as VLLM_VERSION

2 changes: 1 addition & 1 deletion vllm/entrypoints/cli/serve.py
@@ -18,7 +18,7 @@
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_tcp_uri
from vllm.utils.system_utils import decorate_logs, set_process_title
from vllm.v1.engine.core import EngineCoreProc
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -108,7 +108,8 @@
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, FlexibleArgumentParser, set_ulimit
+from vllm.utils import Device, set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import is_valid_ipv6_address
from vllm.utils.system_utils import decorate_logs
from vllm.v1.engine.exceptions import EngineDeadError
2 changes: 1 addition & 1 deletion vllm/lora/punica_wrapper/punica_gpu.py
@@ -13,7 +13,7 @@

from vllm.lora.layers import LoRAMapping
from vllm.triton_utils import HAS_TRITON, triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import (
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/mxfp4.py
@@ -48,9 +48,9 @@
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
-from vllm.utils import round_up
from vllm.utils.flashinfer import has_flashinfer
from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import is_torch_equal_or_newer

logger = init_logger(__name__)
Expand Down