4 changes: 3 additions & 1 deletion docs/mkdocs/hooks/generate_argparse.py
@@ -65,7 +65,9 @@ def auto_mock(module, attr, max_mocks=50):
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
-FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
+FlexibleArgumentParser = auto_mock(
+    "vllm.utils.argparse_utils", "FlexibleArgumentParser"
+)


class MarkdownFormatter(HelpFormatter):
4 changes: 1 addition & 3 deletions tests/utils.py
@@ -45,9 +45,7 @@
from vllm.model_executor.model_loader import get_model_loader
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import (
-    FlexibleArgumentParser,
-)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GB_bytes
from vllm.utils.network_utils import get_open_port
from vllm.utils.torch_utils import cuda_device_count_stateless
93 changes: 2 additions & 91 deletions tests/utils_/test_utils.py → tests/utils_/test_argparse_utils.py
@@ -4,23 +4,15 @@

import json
import os
-import tempfile
-from pathlib import Path
-from unittest.mock import patch

import pytest
-import torch
import yaml
from transformers import AutoTokenizer

-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product


# Tests for FlexibleArgumentParser
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text


-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-        with set_current_vllm_config(cfg):
-            from vllm.attention import Attention
-
-            ctx = {
-                "layers.0.self_attn": Attention(32, 128, 0.1),
-            }
-            kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-            bind_kv_cache(ctx, kv_cache)
-            assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-            assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]


def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
Comment on lines 251 to 253

critical

The FlexibleArgumentParser.parse_args method has a bug that causes an IndexError if vllm serve is called with --model as the last argument without a value. This should be handled gracefully by argparse (which would raise a SystemExit), but the current pre-processing logic crashes instead.

The crash occurs because the code at vllm/utils/argparse_utils.py:227 accesses args[model_idx + 1] without checking if the index is valid.

To ensure robust error handling and prevent this crash, I recommend adding a test case to expose this bug. This will help in fixing the underlying issue in FlexibleArgumentParser.parse_args and prevent future regressions.

Here is a suggested test case to add to this file:

def test_model_option_no_value(parser_with_config):
    with pytest.raises(SystemExit):
        parser_with_config.parse_args(["serve", "--model"])
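
For context, here is a minimal sketch of the bounds check the comment is asking for, written as a standalone helper rather than as vLLM's actual parse_args pre-processing (the helper name _peek_model_value and its structure are illustrative assumptions, not code from argparse_utils.py):

def _peek_model_value(args):
    # Return the value that follows "--model", or None when no value exists.
    if "--model" not in args:
        return None
    model_idx = args.index("--model")
    if model_idx + 1 >= len(args):
        # "--model" is the last token: return None and let argparse raise
        # SystemExit ("expected one argument") instead of an IndexError.
        return None
    return args[model_idx + 1]

With a guard like this in the pre-processing step, the suggested test above passes through argparse's normal SystemExit path rather than crashing.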

2 changes: 1 addition & 1 deletion tests/utils_/test_serial_utils.py
@@ -14,7 +14,7 @@

@pytest.mark.parametrize("endianness", ENDIANNESS)
@pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
-@torch.inference_mode
+@torch.inference_mode()
def test_encode_and_decode(embed_dtype: str, endianness: str):
    for i in range(10):
        tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
@@ -42,7 +42,7 @@
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv

if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata
@@ -44,7 +44,8 @@
)
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
from vllm.sampling_params import SamplingParams
-from vllm.utils import cdiv, get_kv_cache_torch_dtype
+from vllm.utils import get_kv_cache_torch_dtype
+from vllm.utils.math_utils import cdiv
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.version import __version__ as VLLM_VERSION

3 changes: 2 additions & 1 deletion vllm/entrypoints/anthropic/api_server.py
@@ -51,7 +51,8 @@
    with_cancellation,
)
from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser, set_ulimit
+from vllm.utils import set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import is_valid_ipv6_address
from vllm.version import __version__ as VLLM_VERSION

2 changes: 1 addition & 1 deletion vllm/entrypoints/cli/serve.py
@@ -18,7 +18,7 @@
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_tcp_uri
from vllm.utils.system_utils import decorate_logs, set_process_title
from vllm.v1.engine.core import EngineCoreProc
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -108,7 +108,8 @@
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, FlexibleArgumentParser, set_ulimit
+from vllm.utils import Device, set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import is_valid_ipv6_address
from vllm.utils.system_utils import decorate_logs
from vllm.v1.engine.exceptions import EngineDeadError
2 changes: 1 addition & 1 deletion vllm/lora/punica_wrapper/punica_gpu.py
@@ -13,7 +13,7 @@

from vllm.lora.layers import LoRAMapping
from vllm.triton_utils import HAS_TRITON, triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import (
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/mxfp4.py
@@ -48,9 +48,9 @@
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
-from vllm.utils import round_up
from vllm.utils.flashinfer import has_flashinfer
from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import is_torch_equal_or_newer

logger = init_logger(__name__)
Expand Down