Commit efedbfc

add omnivinci

Signed-off-by: 0xrushi <[email protected]>
1 parent e39dc46 commit efedbfc

10 files changed: +329 −18 lines changed
Lines changed: 47 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

from vllm.config.load import LoadConfig
from vllm.model_executor.model_loader.bitsandbytes_loader import (
    BitsAndBytesModelLoader,
)


class _DummyBitsAndBytesLoader(BitsAndBytesModelLoader):
    """Test helper that bypasses any real HF interactions."""

    def __init__(
        self, load_config: LoadConfig, mock_result: tuple[str, list[str], str]
    ):
        super().__init__(load_config)
        self._mock_result = mock_result

    def _get_weight_files(  # type: ignore[override]
        self,
        model_name_or_path: str,
        allowed_patterns: list[str],
        revision: Optional[str] = None,
    ) -> tuple[str, list[str], str]:
        return self._mock_result


def test_bitsandbytes_loader_detects_safetensors_from_files(tmp_path):
    """Even if the allow-pattern looks like *.bin, safetensors files are detected."""

    llm_dir = tmp_path / "llm"
    llm_dir.mkdir()
    safetensor = llm_dir / "model-00001-of-00002.safetensors"
    safetensor.write_bytes(b"test")

    load_config = LoadConfig()
    loader = _DummyBitsAndBytesLoader(
        load_config,
        mock_result=(str(tmp_path), [str(safetensor)], "*.bin"),
    )

    files, use_safetensors = loader._prepare_weights(str(tmp_path), revision=None)

    assert use_safetensors is True
    assert files == [str(safetensor)]
```
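The regression this test guards against: `_get_weight_files` can report a matched pattern of `*.bin` even when every file it found is a safetensors shard, and the old `matched_pattern == "*.safetensors"` comparison in `_prepare_weights` would then route those shards to the pickle loader. A minimal sketch of the old versus new detection (plain Python; the variable names are local to this snippet):

```python
matched_pattern = "*.bin"
hf_weights_files = ["/tmp/llm/model-00001-of-00002.safetensors"]

# Old check: exact string comparison, misses subfolder and mixed patterns.
old_detection = matched_pattern == "*.safetensors"  # False

# New check: suffix match on the pattern, then a guard over the actual files.
new_detection = matched_pattern.endswith(".safetensors") or any(
    f.endswith(".safetensors") for f in hf_weights_files
)  # True

assert (old_detection, new_detection) == (False, True)
```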

tests/model_executor/test_weight_utils.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import json
 import os
 import tempfile
 
@@ -11,6 +12,7 @@
 from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf,
     enable_hf_transfer,
+    filter_duplicate_safetensors_files,
 )
 
 
@@ -61,6 +63,28 @@ def test_download_weights_from_hf():
     )
 
 
+def test_filter_duplicate_safetensors_files_with_subfolder(tmp_path):
+    llm_dir = tmp_path / "llm"
+    llm_dir.mkdir()
+    kept_file = llm_dir / "model-00001-of-00002.safetensors"
+    kept_file.write_bytes(b"0")
+    dropped_file = tmp_path / "other.safetensors"
+    dropped_file.write_bytes(b"0")
+
+    index_path = llm_dir / "model.safetensors.index.json"
+    index_path.write_text(
+        json.dumps({"weight_map": {"w": "model-00001-of-00002.safetensors"}})
+    )
+
+    filtered = filter_duplicate_safetensors_files(
+        [str(kept_file), str(dropped_file)],
+        str(tmp_path),
+        "llm/model.safetensors.index.json",
+    )
+
+    assert filtered == [str(kept_file)]
+
+
 if __name__ == "__main__":
     test_hf_transfer_auto_activation()
     test_download_weights_from_hf()
```
Lines changed: 53 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Any

from vllm.transformers_utils import tokenizer as tokenizer_module
from vllm.transformers_utils.tokenizer import get_tokenizer


class _DummyTokenizer:
    def __init__(self):
        self.all_special_ids: list[int] = []
        self.all_special_tokens: list[str] = []
        self.all_special_tokens_extended: list[str] = []
        self.special_tokens_map: dict[str, str] = {}
        self.vocab_size = 1

    def get_vocab(self) -> dict[str, int]:
        return {"a": 0}

    def __len__(self) -> int:  # pragma: no cover - trivial
        return 1

    def decode(self, *args: Any, **kwargs: Any) -> str:
        return ""

    def encode(self, *args: Any, **kwargs: Any) -> list[int]:
        return []


def test_tokenizer_prefers_llm_subfolder(monkeypatch):
    captured = {}

    def fake_file_exists(repo_id: str, file_name: str, **kwargs: Any) -> bool:
        return file_name == "llm/tokenizer.json"

    def fake_auto_from_pretrained(*args: Any, **kwargs: Any):
        captured["subfolder"] = kwargs.get("subfolder")
        return _DummyTokenizer()

    monkeypatch.setattr(tokenizer_module, "file_exists", fake_file_exists)
    monkeypatch.setattr(
        tokenizer_module.AutoTokenizer,
        "from_pretrained",
        classmethod(
            lambda cls, *args, **kwargs: fake_auto_from_pretrained(*args, **kwargs)
        ),
    )

    tokenizer = get_tokenizer("fake/model")

    assert tokenizer is not None
    assert captured["subfolder"] == "llm"
```
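The tokenizer-side change this test exercises is one of the files not shown in this excerpt. Based on the seams the test monkeypatches (`file_exists` in `vllm.transformers_utils.tokenizer` and `AutoTokenizer.from_pretrained`), the behavior it pins down is roughly the following sketch; `get_tokenizer_sketch` is a hypothetical stand-in, not the shipped code:

```python
from typing import Any

from transformers import AutoTokenizer


def get_tokenizer_sketch(model: str, file_exists, **kwargs: Any):
    # If the repo keeps its tokenizer under llm/ (as nvidia/omnivinci does),
    # forward subfolder="llm" so HF loads llm/tokenizer.json.
    if "subfolder" not in kwargs and file_exists(model, "llm/tokenizer.json"):
        kwargs["subfolder"] = "llm"
    return AutoTokenizer.from_pretrained(model, **kwargs)
```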
Lines changed: 74 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional, Union

from transformers import GenerationConfig, PretrainedConfig

from vllm.transformers_utils import config as config_module
from vllm.transformers_utils.config import HFConfigParser, try_get_generation_config


def test_hf_config_parser_uses_llm_subfolder(monkeypatch):
    parser = HFConfigParser()
    base_config = PretrainedConfig()
    subfolder_config = PretrainedConfig()

    def fake_get_config_dict(
        cls,
        model: Union[str, bytes],
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        **kwargs,
    ):
        return {"llm_cfg": {}}, base_config

    def fake_file_exists(
        model: Union[str, bytes], config_name: str, revision: Optional[str]
    ):
        return config_name == "llm/config.json"

    auto_called = {}

    def fake_auto_from_pretrained(cls, *args, **kwargs):
        auto_called["subfolder"] = kwargs.get("subfolder")
        return subfolder_config

    monkeypatch.setattr(
        PretrainedConfig,
        "get_config_dict",
        classmethod(fake_get_config_dict),
    )
    monkeypatch.setattr(config_module, "file_or_path_exists", fake_file_exists)
    monkeypatch.setattr(
        config_module.AutoConfig,
        "from_pretrained",
        classmethod(fake_auto_from_pretrained),
    )

    returned_dict, returned_config = parser.parse("fake/model", trust_remote_code=False)

    assert returned_dict == {"llm_cfg": {}}
    assert returned_config is subfolder_config
    assert auto_called["subfolder"] == "llm"


def test_try_get_generation_config_llm_subfolder(monkeypatch):
    calls = []

    def fake_from_pretrained(cls, model: str, **kwargs):
        calls.append(kwargs.get("subfolder"))
        if len(calls) == 1:
            raise OSError("missing")
        return GenerationConfig()

    monkeypatch.setattr(
        config_module.GenerationConfig,
        "from_pretrained",
        classmethod(fake_from_pretrained),
    )

    result = try_get_generation_config("fake/model", trust_remote_code=False)

    assert isinstance(result, GenerationConfig)
    assert calls == [None, "llm"]
```
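As with the tokenizer test, the shipped `config.py` diff is not part of this excerpt. The second test encodes a retry contract for `try_get_generation_config`: first call `GenerationConfig.from_pretrained` at the repo root (recorded subfolder `None`), and on `OSError` retry with `subfolder="llm"`. A sketch of that fallback, hedged as an assumed shape:

```python
from typing import Optional

from transformers import GenerationConfig


def try_get_generation_config_sketch(model: str) -> Optional[GenerationConfig]:
    # Try the repo root first, then fall back to the llm/ subfolder.
    for subfolder in (None, "llm"):
        try:
            kwargs = {} if subfolder is None else {"subfolder": subfolder}
            return GenerationConfig.from_pretrained(model, **kwargs)
        except OSError:
            continue
    return None
```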

vllm/model_executor/model_loader/bitsandbytes_loader.py

Lines changed: 40 additions & 16 deletions

```diff
@@ -96,14 +96,27 @@ def _get_weight_files(
         is_local = os.path.isdir(model_name_or_path)
 
         if is_local:
-            for pattern in allowed_patterns:
+            patterns = list(allowed_patterns)
+            # Prefer subfolder patterns if common subfolder exists locally.
+            if os.path.isdir(os.path.join(model_name_or_path, "llm")):
+                patterns = [f"llm/{p}" for p in allowed_patterns] + patterns
+            for pattern in patterns:
                 weight_files = glob.glob(os.path.join(model_name_or_path, pattern))
                 if weight_files:
                     return model_name_or_path, weight_files, pattern
         else:
             hf_api = HfApi()
             repo_files = hf_api.list_repo_files(repo_id=model_name_or_path)
-            for pattern in allowed_patterns:
+            search_patterns = list(allowed_patterns)
+            # Prefer 'llm/' weights when present in the repo.
+            if any(
+                f.startswith("llm/") and f.endswith((".safetensors", ".bin", ".pt"))
+                for f in repo_files
+            ):
+                search_patterns = [
+                    f"llm/{p}" for p in allowed_patterns
+                ] + search_patterns
+            for pattern in search_patterns:
                 matching_files = fnmatch.filter(repo_files, pattern)
                 if matching_files:
                     hf_folder = download_weights_from_hf(
@@ -128,26 +141,35 @@ def _prepare_weights(
 
         allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]
 
+        if getattr(self, "allow_patterns_overrides", None):
+            allowed_patterns = list(self.allow_patterns_overrides)
+
         hf_folder, hf_weights_files, matched_pattern = self._get_weight_files(
             model_name_or_path, allowed_patterns, revision
         )
 
-        use_safetensors = matched_pattern == "*.safetensors"
+        # Detect safetensors robustly (pattern may include subfolder)
+        use_safetensors = matched_pattern.endswith(".safetensors")
+        # Additionally guard by checking actual files
+        if not use_safetensors:
+            use_safetensors = any(f.endswith(".safetensors") for f in hf_weights_files)
         is_local = os.path.isdir(model_name_or_path)
-        index_file = SAFE_WEIGHTS_INDEX_NAME
+        # If weights live under a subfolder (e.g., 'llm/*.safetensors'),
+        # the index file will also live there.
+        if "/" in matched_pattern:
+            folder_prefix = matched_pattern.rsplit("/", 1)[0] + "/"
+        else:
+            folder_prefix = ""
+        index_file = folder_prefix + SAFE_WEIGHTS_INDEX_NAME
+        if use_safetensors and not is_local:
+            # Download index for safetensors to select correct shards.
+            download_safetensors_index_file_from_hf(
+                model_name_or_path,
+                index_file,
+                self.load_config.download_dir,
+                revision,
+            )
         if use_safetensors:
-            # For models like Mistral-7B-Instruct-v0.3
-            # there are both sharded safetensors files and a consolidated
-            # safetensors file. Using both breaks.
-            # Here, we download the `model.safetensors.index.json` and filter
-            # any files not found in the index.
-            if not is_local:
-                download_safetensors_index_file_from_hf(
-                    model_name_or_path,
-                    index_file,
-                    self.load_config.download_dir,
-                    revision,
-                )
             hf_weights_files = filter_duplicate_safetensors_files(
                 hf_weights_files, hf_folder, index_file
             )
@@ -587,6 +609,8 @@ def _initialize_loader_state(
         self._get_bnb_target_modules(model)
         self._classify_module_sharding(model)
 
+        self.allow_patterns_overrides = getattr(model, "allow_patterns_overrides", None)
+
     def _dequantize_dq(self, quant_states: Any):
         """
         When BNB employs Double Quantization, we perform the dequantization of
```
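Taken together, the hunks above implement one precedence rule: if a repo (or local checkout) ships weights under `llm/`, try the subfolder patterns first, and derive the safetensors index path from whichever pattern actually matched. A standalone sketch of that logic (no vLLM imports; the index-file constant is inlined here for illustration):

```python
SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"  # inlined for the sketch


def order_patterns(repo_files: list[str], allowed_patterns: list[str]) -> list[str]:
    """Put 'llm/' variants first when the repo ships weights in that subfolder."""
    has_llm_weights = any(
        f.startswith("llm/") and f.endswith((".safetensors", ".bin", ".pt"))
        for f in repo_files
    )
    if has_llm_weights:
        return [f"llm/{p}" for p in allowed_patterns] + list(allowed_patterns)
    return list(allowed_patterns)


def index_file_for(matched_pattern: str) -> str:
    """The index file lives alongside the shards the pattern matched."""
    prefix = matched_pattern.rsplit("/", 1)[0] + "/" if "/" in matched_pattern else ""
    return prefix + SAFE_WEIGHTS_INDEX_NAME


repo = ["llm/model-00001-of-00002.safetensors", "config.json"]
patterns = order_patterns(repo, ["*.safetensors", "*.bin", "*.pt"])
assert patterns[0] == "llm/*.safetensors"
assert index_file_for(patterns[0]) == "llm/model.safetensors.index.json"
```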

vllm/model_executor/model_loader/weight_utils.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -499,8 +499,12 @@ def filter_duplicate_safetensors_files(
     with open(index_file_name) as f:
         weight_map = json.load(f)["weight_map"]
     weight_files_in_index = set()
+    # If the index file is inside a subfolder (e.g., 'llm/model.safetensors.index.json'),
+    # the shard paths in `weight_map` are relative to that subfolder. Use the
+    # index file's directory as the base for joining shard filenames.
+    base_dir = os.path.dirname(index_file_name)
     for weight_name in weight_map:
-        weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name]))
+        weight_files_in_index.add(os.path.join(base_dir, weight_map[weight_name]))
     # Filter out any fields that are not found in the index file.
     hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index]
     return hf_weights_files
```
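The practical effect of switching the join base from `hf_folder` to the index file's own directory, shown with illustrative paths (the cache directory here is an assumption, not a real layout):

```python
import os

hf_folder = "/cache/nvidia--omnivinci"  # illustrative download dir
index_file_name = os.path.join(hf_folder, "llm", "model.safetensors.index.json")
shard = "model-00001-of-00002.safetensors"  # as it appears in weight_map

# Before: joining against the repo root misses shards stored under llm/.
assert (
    os.path.join(hf_folder, shard)
    == "/cache/nvidia--omnivinci/model-00001-of-00002.safetensors"
)

# After: joining against the index file's directory yields the real path,
# so the downloaded llm/ shards survive the filtering step.
base_dir = os.path.dirname(index_file_name)
assert (
    os.path.join(base_dir, shard)
    == "/cache/nvidia--omnivinci/llm/model-00001-of-00002.safetensors"
)
```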
Lines changed: 23 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""Thin wrapper to support nvidia/omnivinci LLM weights stored under llm/.

This model maps the root architecture (VILAForCausalLM) to the text-only
Qwen2 architecture by reusing vLLM's Qwen2ForCausalLM and ensures the weight
loader searches in the `llm/` subfolder of the repository.
"""

from vllm.config import VllmConfig
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM


class OmniVinciForCausalLM(Qwen2ForCausalLM):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        # direct the default loader to read weights from the llm/ subfolder
        self.allow_patterns_overrides = [
            "llm/*.safetensors",
            "llm/consolidated*.safetensors",
            "llm/*.pt",
        ]
```
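With the wrapper in place and the registry entry below, loading should need nothing special from callers; a hedged usage sketch (untested here, and assuming the `nvidia/omnivinci` checkpoint layout the docstring describes):

```python
from vllm import LLM, SamplingParams

# The registry maps the repo's advertised VILAForCausalLM architecture to
# OmniVinciForCausalLM, which points the loader at the llm/ shards.
llm = LLM(model="nvidia/omnivinci")
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```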

vllm/model_executor/models/registry.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -166,6 +166,10 @@
     "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
     "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
     "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
+    # nvidia/omnivinci root config advertises VILAForCausalLM but the LLM
+    # component is Qwen2 with weights/config under the llm/ subfolder.
+    # Map it to a thin wrapper that reuses Qwen2 implementation.
+    "VILAForCausalLM": ("omnivinci", "OmniVinciForCausalLM"),
 }
 
 _EMBEDDING_MODELS = {
```
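For out-of-tree experiments that cannot edit this registry, vLLM's `ModelRegistry.register_model` accepts the same mapping as a `module:Class` string; a sketch, assuming the wrapper module above is importable as `vllm.model_executor.models.omnivinci`:

```python
from vllm import ModelRegistry

# Runtime equivalent of the in-tree registry entry above.
ModelRegistry.register_model(
    "VILAForCausalLM",
    "vllm.model_executor.models.omnivinci:OmniVinciForCausalLM",
)
```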
