diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 89522dee8b8ad..6d34541a3cecc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -42,11 +42,19 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 -AnyModel = TypeVar("AnyModel", bound="type[Model]") +class ModelType(IntEnum): + TEXT = 1 + VISION = 2 -class Model: - _model_classes: dict[str, type[Model]] = {} +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") + + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.VISION: {}, + } dir_model: Path ftype: gguf.LlamaFileType @@ -75,7 +83,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): - if type(self) is Model: + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is VisionModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -98,11 +108,11 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.get_tensors = get_remote_tensors else: - self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") + self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams + self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -126,11 +136,10 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) @@ -140,9 +149,6 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") - def set_vocab(self): - self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() @@ -230,50 +236,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -419,27 +382,6 @@ def prepare_metadata(self, vocab_only: bool): if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - if not vocab_only: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) - else: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - self.set_type() logger.info("Set meta model") @@ -448,12 +390,12 @@ def prepare_metadata(self, vocab_only: bool): logger.info("Set model parameters") self.set_gguf_parameters() - logger.info("Set model tokenizer") - self.set_vocab() - logger.info("Set model quantization version") self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write_vocab(self): + raise NotImplementedError("write_vocab() must be implemented in subclasses") + def write(self): self.prepare_tensors() self.prepare_metadata(vocab_only=False) @@ -462,15 +404,6 @@ def write(self): self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() - def write_vocab(self): - if len(self.gguf_writer.tensors) != 1: - raise ValueError('Splitting the vocabulary is not supported') - - self.prepare_metadata(vocab_only=True) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - @staticmethod def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: part_names: list[str] = [] @@ -485,30 +418,131 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] @staticmethod def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) + hparams = json.load(f) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + return hparams @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT for name in names: - cls._model_classes[name] = modelcls + cls._model_classes[model_type][name] = modelcls return modelcls return func @classmethod def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: try: - return cls._model_classes[arch] + return cls._model_classes[model_type][arch] except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + +class TextModel(ModelBase): + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): token_text = token.decode(encoding="utf-8") @@ -1024,8 +1058,48 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): +class VisionModel(ModelBase): + model_arch = gguf.MODEL_ARCH.CLIP_VISION + n_text_embd = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: + raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") + + # small hack to correct the number of layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) + self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"]) + assert self.n_embd_text > 0, "n_embd not found in hparams" + + if "vision_config" not in self.hparams: + raise ValueError("vision_config not found in hparams") + # move vision config to the top level + self.hparams = self.hparams["vision_config"] + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text) + self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) + + # vision config + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"])) + + def write_vocab(self): + raise ValueError("VisionModel does not support vocab writing") + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX def set_gguf_parameters(self): @@ -1081,8 +1155,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("BloomForCausalLM", "BloomModel") -class BloomModel(Model): +@ModelBase.register("BloomForCausalLM", "BloomModel") +class BloomModel(TextModel): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): @@ -1138,8 +1212,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("MPTForCausalLM") -class MPTModel(Model): +@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): @@ -1182,8 +1256,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("OrionForCausalLM") -class OrionModel(Model): +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION def set_vocab(self): @@ -1217,8 +1291,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN def set_vocab(self): @@ -1297,8 +1371,8 @@ def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: return weights[r * n_part:r * n_part + r, ...] -@Model.register("XverseForCausalLM") -class XverseModel(Model): +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE def set_vocab(self): @@ -1404,8 +1478,8 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non ) -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON def set_gguf_parameters(self): @@ -1458,8 +1532,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER def set_gguf_parameters(self): @@ -1475,8 +1549,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT def set_vocab(self): @@ -1539,8 +1613,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM def set_vocab(self): @@ -1629,8 +1703,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): +@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1778,23 +1852,13 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Llama4ForConditionalGeneration") +@ModelBase.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 - has_vision: bool = False undo_permute = False - # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config" - # same with llama, but we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] @@ -1829,18 +1893,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("Mistral3ForConditionalGeneration") +@ModelBase.register("Mistral3ForConditionalGeneration") class Mistral3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("language_model.", "") if "multi_modal_projector" in name or "vision_tower" in name: @@ -1848,8 +1904,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @staticmethod @@ -2020,8 +2076,8 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("BitnetForCausalLM") -class BitnetModel(Model): +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET def set_vocab(self): @@ -2061,8 +2117,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("GrokForCausalLM") -class GrokModel(Model): +@ModelBase.register("GrokForCausalLM") +class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): @@ -2114,8 +2170,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): +@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): @@ -2183,8 +2239,8 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: return n_dims > 1 -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): @@ -2238,8 +2294,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 def set_gguf_parameters(self): @@ -2291,8 +2347,8 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non ) -@Model.register("QWenLMHeadModel") -class QwenModel(Model): +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @staticmethod @@ -2333,8 +2389,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): +@ModelBase.register("Qwen2ForCausalLM") +class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 def set_vocab(self): @@ -2352,8 +2408,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLModel(Model): +@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL def set_gguf_parameters(self): @@ -2375,8 +2431,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -2413,8 +2469,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_causal_attention(False) -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE def set_gguf_parameters(self): @@ -2476,18 +2532,18 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Qwen3ForCausalLM") +@ModelBase.register("Qwen3ForCausalLM") class Qwen3Model(Qwen2Model): model_arch = gguf.MODEL_ARCH.QWEN3 -@Model.register("Qwen3MoeForCausalLM") +@ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3MOE -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): @@ -2518,8 +2574,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("PhiForCausalLM") -class Phi2Model(Model): +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): @@ -2542,8 +2598,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): +@ModelBase.register("Phi3ForCausalLM") +class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): @@ -2720,7 +2776,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) -@Model.register("PhiMoEForCausalLM") +@ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): model_arch = gguf.MODEL_ARCH.PHIMOE @@ -2777,8 +2833,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO def set_vocab(self): @@ -2825,8 +2881,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL def set_gguf_parameters(self): @@ -2866,8 +2922,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): @@ -3039,8 +3095,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): @@ -3099,8 +3155,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): @@ -3187,7 +3243,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("RobertaModel") +@ModelBase.register("RobertaModel") class RobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3232,7 +3288,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("NomicBertModel") +@ModelBase.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -3262,7 +3318,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3373,8 +3429,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA def set_vocab(self): @@ -3424,8 +3480,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): @@ -3471,27 +3527,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") -class Gemma3Model(Model): +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 - has_vision: bool = False - - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True - - def write(self): - super().write() - if self.has_vision: - logger.info("NOTE: this script only convert the language model to GGUF") - logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py") def set_vocab(self): self._set_vocab_sentencepiece() @@ -3529,10 +3567,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("language_model."): name = name.replace("language_model.", "") + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ - or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later - # ignore vision tensors - return [] + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + return [] # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -3548,13 +3586,58 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(VisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") + # default values below are taken from HF tranformers code + self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) + self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) + self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + # process vision tensors + name = name.replace("_weight", ".weight") + if "fc1" in name: + name = name.replace("fc1", "fc2") + else: + name = name.replace("fc2", "fc1") + + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): @@ -3626,7 +3709,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("RWKV6Qwen2ForCausalLM") +@ModelBase.register("RWKV6Qwen2ForCausalLM") class RWKV6Qwen2Model(Rwkv6Model): model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 @@ -3680,8 +3763,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data) -@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(Model): +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV7 def set_vocab(self): @@ -3799,7 +3882,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("RwkvHybridForCausalLM") +@ModelBase.register("RwkvHybridForCausalLM") class ARwkv7Model(Rwkv7Model): model_arch = gguf.MODEL_ARCH.ARWKV7 @@ -3842,8 +3925,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(0) -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): @@ -3920,8 +4003,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): model_arch = gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): @@ -3938,8 +4021,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): model_arch = gguf.MODEL_ARCH.COHERE2 def set_gguf_parameters(self): @@ -3956,9 +4039,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO def set_gguf_parameters(self): @@ -3984,13 +4067,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): +@ModelBase.register("Olmo2ForCausalLM") +class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE def set_gguf_parameters(self): @@ -4049,7 +4132,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JinaBertModel", "JinaBertForMaskedLM") +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -4096,8 +4179,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @staticmethod @@ -4171,8 +4254,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (self.map_tensor_name(name), data_torch) -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): @@ -4322,8 +4405,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK def set_vocab(self): @@ -4413,9 +4496,9 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): +@ModelBase.register("DeepseekV2ForCausalLM") +@ModelBase.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): @@ -4541,8 +4624,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PLMForCausalLM") -class PLMModel(Model): +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM def set_vocab(self): @@ -4564,11 +4647,11 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 def __init__(self, *args, **kwargs): @@ -4707,8 +4790,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER def __init__(self, *args, **kwargs): @@ -4846,8 +4929,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("JAISLMHeadModel") -class JaisModel(Model): +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS def __init__(self, *args, **kwargs): @@ -4929,8 +5012,8 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("Glm4ForCausalLM") -class Glm4Model(Model): +@ModelBase.register("Glm4ForCausalLM") +class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 def set_vocab(self): @@ -4945,8 +5028,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM def set_vocab_chatglm3(self): @@ -5100,8 +5183,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): @@ -5141,8 +5224,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): @@ -5210,7 +5293,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) -@Model.register("GraniteForCausalLM") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -5244,7 +5327,7 @@ def set_gguf_parameters(self): logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("GraniteMoeForCausalLM") +@ModelBase.register("GraniteMoeForCausalLM") class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE_MOE @@ -5268,8 +5351,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("BailingMoeForCausalLM") -class BailingMoeModel(Model): +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE def set_vocab(self): @@ -5367,9 +5450,9 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON def set_gguf_parameters(self): @@ -5554,6 +5637,10 @@ def parse_args() -> argparse.Namespace: "--remote", action="store_true", help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", ) + parser.add_argument( + "--mmproj", action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -5584,7 +5671,7 @@ def main() -> None: if args.print_supported_models: logger.error("Supported models:") - Model.print_registered_models() + ModelBase.print_registered_models() sys.exit(0) if args.verbose: @@ -5631,13 +5718,18 @@ def main() -> None: logger.info(f"Loading model: {dir_model.name}") - hparams = Model.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model) + + if args.mmproj: + if "mmproj" not in fname_out.name: + fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] + model_type = ModelType.VISION if args.mmproj else ModelType.TEXT try: - model_class = Model.from_model_architecture(model_architecture) + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bdc991533b4e0..00a6733cbd360 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -24,7 +24,7 @@ import gguf # reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import LazyTorchTensor, Model +from convert_hf_to_gguf import LazyTorchTensor, ModelBase logger = logging.getLogger("lora-to-gguf") @@ -340,11 +340,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model) with torch.inference_mode(): try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 4d7340a56bd0c..180ae9880b124 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -50,7 +50,6 @@ // tensor name constants // -#define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat @@ -66,8 +65,6 @@ #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8fcde2626aa7c..3f24705201d93 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -218,17 +218,37 @@ class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class ClipVision: + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + IMAGE_SIZE = "clip.vision.image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" + FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = "clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + USE_GELU = "clip.use_gelu" + + class Attention: + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + # # recommended mapping of model tensor names for storage in gguf # class GGUFType: - MODEL = "model" - ADAPTER = "adapter" + MODEL = "model" + ADAPTER = "adapter" + CLIP_VISION = "clip-vision" class MODEL_ARCH(IntEnum): + CLIP_VISION = auto() # dummy arch for clip.cpp LLAMA = auto() LLAMA4 = auto() DECI = auto() @@ -297,6 +317,16 @@ class MODEL_ARCH(IntEnum): BAILINGMOE = auto() +class VISION_PROJECTOR_TYPE(IntEnum): + MLP = auto() + LDP = auto() + LDPV2 = auto() + RESAMPLER = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3 = auto() + + class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() @@ -436,9 +466,41 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_DOWN = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", @@ -507,6 +569,16 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BAILINGMOE: "bailingmoe", } +VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", +} + TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", @@ -646,9 +718,72 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.CLIP_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bc75cf513a9f..22066b2868019 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -886,6 +886,150 @@ class TensorNameMap: MODEL_TENSOR.POSNET_ATTN_OUT: ( "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + + ############################################################################# + ## Vision encoder + + MODEL_TENSOR.V_MMPROJ: ( + "multi_modal_projector.linear_{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_FC: ( + "model.connector.modality_projection.proj", # SmolVLM + ), + + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_PEG: ( + "model.mm_projector.peg.peg.{bid}", + ), + + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_V: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM + ), + + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + ), + + MODEL_TENSOR.V_POST_NORM: ( + "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM + ), + + MODEL_TENSOR.V_MM_INP_PROJ: ( + "multi_modal_projector.mm_input_projection", + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "multi_modal_projector.mm_soft_emb_norm", + ), + + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( + "resampler.pos_embed_k", + ), + + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( + "resampler.attn.out_proj", + ), + + MODEL_TENSOR.V_RESMPL_KV: ( + "resampler.kv_proj", + ), + + MODEL_TENSOR.V_RESMPL_POST_NORM: ( + "resampler.ln_post", + ), + + MODEL_TENSOR.V_RESMPL_KV_NORM: ( + "resampler.ln_kv", + ), + + MODEL_TENSOR.V_RESMPL_Q_NORM: ( + "resampler.ln_q", + ), + + MODEL_TENSOR.V_RESMPL_PROJ: ( + "resampler.proj", + ), + + MODEL_TENSOR.V_RESMPL_QUERY: ( + "resampler.query", + ), } # architecture-specific block mappings