Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10611,6 +10611,18 @@ def set_gguf_parameters(self):
logger.info("gguf: (granite) logits_scale = %s", logits_scale)


@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.TEXT)
class GraniteSpeechTextModel(GraniteModel):
    """Text-side conversion for GraniteSpeech: reuses GraniteModel on the wrapped LM."""

    model_arch = gguf.MODEL_ARCH.GRANITE

    # Audio encoder / projector tensors are handled by the MMPROJ conversion class.
    _AUDIO_PREFIXES = ("encoder.", "projector.")
    _LM_PREFIX = "language_model."

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Drop multimodal tensors and strip the LM wrapper prefix before delegating."""
        if name.startswith(self._AUDIO_PREFIXES):
            return
        if name.startswith(self._LM_PREFIX):
            # Unwrap "language_model.*" so GraniteModel's tensor mapping applies.
            name = name[len(self._LM_PREFIX):]
        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
class GraniteMoeModel(GraniteModel):
"""Conversion for IBM's GraniteMoeForCausalLM"""
Expand Down Expand Up @@ -12347,6 +12359,154 @@ def modify_tensors(self, data_torch, name, bid):
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.MMPROJ)
class GraniteSpeechMmprojModel(MmprojModel):
    """Convert the GraniteSpeech audio encoder + Q-Former projector to a GGUF mmproj."""

    # Audio-only multimodal model: no vision tower.
    has_vision_encoder = False
    has_audio_encoder = True

    # Lazily-allocated per-layer staging dicts used to collect the four
    # batch-norm tensors of each encoder block before folding them
    # (see modify_tensors).
    _batch_norm_tensors: list[dict[str, Tensor]] | None = None

    def get_audio_config(self) -> dict[str, Any] | None:
        """Return the audio-encoder sub-config ("encoder_config" in the HF config)."""
        return self.global_config.get("encoder_config")

    def set_gguf_parameters(self):
        """Write audio-encoder hyperparameters to the GGUF header.

        GraniteSpeech uses its own hparam names; alias them onto the generic
        keys that MmprojModel.set_gguf_parameters() expects before delegating.
        """
        assert self.hparams_audio is not None
        a = self.hparams_audio
        # Alias GraniteSpeech-specific names -> generic names read by the base class.
        a["hidden_size"] = a["hidden_dim"]
        a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"]
        a["num_attention_heads"] = a["num_heads"]
        a["num_hidden_layers"] = a["num_layers"]

        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
        self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
        # NOTE(review): eps is hard-coded here and in the batch-norm folding
        # below — presumably matching the HF implementation; confirm upstream.
        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Force F32 storage for convolution weights of the encoder/projector."""
        if "encoder" in name or "projector" in name:
            if ".conv" in name and ".weight" in name:
                return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map GraniteSpeech encoder/projector tensors to GGUF names.

        Performs several rewrites along the way: batch-norm folding, fused-KV
        splitting, squeezing of singleton conv dims, and name remapping for
        tensors not covered by TensorNameMap.
        """
        # Text-model tensors are converted separately by GraniteSpeechTextModel.
        if name.startswith("language_model."):
            return
        # Skip buffers not needed for conversion.
        if "attention_dists" in name:
            return
        if "num_batches_tracked" in name:
            return

        # fold running_mean, running_var and eps into weight and bias for batch_norm
        if "batch_norm" in name and "encoder.layers." in name:
            if self._batch_norm_tensors is None:
                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
            assert bid is not None
            # Stage this tensor; emit nothing until all four statistics
            # (weight, bias, running_mean, running_var) of layer `bid` arrived.
            self._batch_norm_tensors[bid][name] = data_torch
            if len(self._batch_norm_tensors[bid]) < 4:
                return
            prefix = f"encoder.layers.{bid}.conv.batch_norm"
            weight = self._batch_norm_tensors[bid][f"{prefix}.weight"]
            bias = self._batch_norm_tensors[bid][f"{prefix}.bias"]
            running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"]
            running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"]
            eps = 1e-5
            # BN(x) = weight * (x - mean) / sqrt(var + eps) + bias  ==  a * x + b
            a = weight / torch.sqrt(running_var + eps)
            b = bias - running_mean * a
            yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid)
            yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid)
            return

        # Split the fused KV projection into separate K and V tensors
        # (K rows first, then V rows).
        if ".attn.to_kv.weight" in name:
            k_weight, v_weight = data_torch.chunk(2, dim=0)
            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
            return

        # Pointwise convs: drop the trailing length-1 kernel dim so the
        # weight is stored as a plain matrix.
        if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"):
            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
                data_torch = data_torch.squeeze(2)

        # Depthwise conv: drop the singleton middle dim.
        if "depth_conv" in name and name.endswith(".weight"):
            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
                data_torch = data_torch.squeeze(1)

        # Q-Former projector tensors use a dedicated static/regex mapping;
        # unmapped names are dropped.
        if name.startswith("projector."):
            gguf_name = self._map_projector_tensor(name)
            if gguf_name is None:
                return
            yield (gguf_name, data_torch)
            return

        # Encoder-global tensors (no {bid} placeholder): map directly to the
        # GGUF names instead of going through TensorNameMap.
        global_map = {
            "encoder.input_linear.weight": "a.enc_inp_linear.weight",
            "encoder.input_linear.bias": "a.enc_inp_linear.bias",
            "encoder.out.weight": "a.enc_ctc_out.weight",
            "encoder.out.bias": "a.enc_ctc_out.bias",
            "encoder.out_mid.weight": "a.enc_ctc_out_mid.weight",
            "encoder.out_mid.bias": "a.enc_ctc_out_mid.bias",
        }
        if name in global_map:
            yield (global_map[name], data_torch)
            return

        # Per-layer relative positional embedding: mapped directly (no ".weight"
        # suffix in the GGUF name).
        if ".attn.rel_pos_emb.weight" in name:
            assert bid is not None
            yield (f"a.blk.{bid}.attn_rel_pos_emb", data_torch)
            return

        # Everything else goes through the standard TensorNameMap lookup.
        yield from super().modify_tensors(data_torch, name, bid)

    @staticmethod
    def _map_projector_tensor(name: str) -> str | None:
        """Map a "projector.*" HF tensor name to its GGUF name.

        Returns None for tensors that should be dropped / are unknown.
        """
        # Tensors outside the Q-Former encoder layers.
        static_map = {
            "projector.query": "a.proj_query",
            "projector.qformer.layernorm.weight": "a.proj_norm.weight",
            "projector.qformer.layernorm.bias": "a.proj_norm.bias",
            "projector.linear.weight": "a.proj_linear.weight",
            "projector.linear.bias": "a.proj_linear.bias",
        }
        if name in static_map:
            return static_map[name]
        # Per-layer Q-Former tensors: projector.qformer.encoder.layer.<lid>.<rest>
        m = re.match(r"projector\.qformer\.encoder\.layer\.(\d+)\.(.*)", name)
        if not m:
            return None
        lid = m.group(1)
        rest = m.group(2)
        # HF Q-Former layer-local suffix -> GGUF suffix.
        layer_map = {
            "attention.attention.query.weight": "self_attn_q.weight",
            "attention.attention.query.bias": "self_attn_q.bias",
            "attention.attention.key.weight": "self_attn_k.weight",
            "attention.attention.key.bias": "self_attn_k.bias",
            "attention.attention.value.weight": "self_attn_v.weight",
            "attention.attention.value.bias": "self_attn_v.bias",
            "attention.output.dense.weight": "self_attn_out.weight",
            "attention.output.dense.bias": "self_attn_out.bias",
            "attention.output.LayerNorm.weight": "self_attn_norm.weight",
            "attention.output.LayerNorm.bias": "self_attn_norm.bias",
            "crossattention.attention.query.weight": "cross_attn_q.weight",
            "crossattention.attention.query.bias": "cross_attn_q.bias",
            "crossattention.attention.key.weight": "cross_attn_k.weight",
            "crossattention.attention.key.bias": "cross_attn_k.bias",
            "crossattention.attention.value.weight": "cross_attn_v.weight",
            "crossattention.attention.value.bias": "cross_attn_v.bias",
            "crossattention.output.dense.weight": "cross_attn_out.weight",
            "crossattention.output.dense.bias": "cross_attn_out.bias",
            "crossattention.output.LayerNorm.weight": "cross_attn_norm.weight",
            "crossattention.output.LayerNorm.bias": "cross_attn_norm.bias",
            "intermediate_query.dense.weight": "ffn_up.weight",
            "intermediate_query.dense.bias": "ffn_up.bias",
            "output_query.dense.weight": "ffn_down.weight",
            "output_query.dense.bias": "ffn_down.bias",
            "output_query.LayerNorm.weight": "ffn_norm.weight",
            "output_query.LayerNorm.bias": "ffn_norm.bias",
        }
        suffix = layer_map.get(rest)
        if suffix is None:
            return None
        return f"a.proj_blk.{lid}.{suffix}"


@ModelBase.register("Lfm25AudioTokenizer")
class LFM25AudioTokenizer(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2
Expand Down Expand Up @@ -13356,6 +13516,8 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# TODO: refactor this later to avoid adding exception here
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
return arch
if model_type == ModelType.TEXT and arch == "GraniteSpeechForConditionalGeneration":
return arch

# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
Expand Down
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4138,6 +4138,7 @@ class VisionProjectorType:
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
GRANITE_SPEECH = "granite_speech" # audio


# Items here are (block size, type size)
Expand Down
17 changes: 17 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1912,20 +1912,23 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
"encoder.layers.{bid}.attn.to_q", # granite_speech
),

MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
"encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
),

MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
"encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
),

MODEL_TENSOR.A_ENC_ATTN_K_REL: (
Expand Down Expand Up @@ -1953,25 +1956,29 @@ class TensorNameMap:
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
"encoder.layers.{bid}.attn.pre_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
"conformer.layers.{bid}.attention.post", # gemma3n
"conformer.layers.{bid}.self_attn.post", # gemma4
"encoder.layers.{bid}.attn.to_out", # granite_speech
),

MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
"conformer.layers.{bid}.attention.post_norm", # gemma3n
"encoder.layers.{bid}.post_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
"encoder.layers.{bid}.ff1.pre_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
Expand All @@ -1988,6 +1995,7 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
"encoder.layers.{bid}.ff1.up_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_GATE: (),
Expand All @@ -1997,24 +2005,28 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
"encoder.layers.{bid}.ff1.down_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
"encoder.layers.{bid}.ff2.up_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
"encoder.layers.{bid}.ff2.down_proj", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
"encoder.layers.{bid}.ff2.pre_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
Expand Down Expand Up @@ -2071,26 +2083,31 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
"encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
),

MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
"encoder.layers.{bid}.conv.batch_norm", # granite_speech
),

MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
"encoder.layers.{bid}.conv.up_conv", # granite_speech
),

MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
"encoder.layers.{bid}.conv.down_conv", # granite_speech
),

MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
"encoder.layers.{bid}.conv.norm", # granite_speech
),

MODEL_TENSOR.A_PER_DIM_K_SCALE: (
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_library(mtmd
models/dotsocr.cpp
models/gemma4a.cpp
models/gemma4v.cpp
models/granite-speech.cpp
models/glm4v.cpp
models/hunyuanocr.cpp
models/internvl.cpp
Expand Down
23 changes: 23 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,27 @@
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
// granite_speech
#define TN_GS_INP_LINEAR "a.enc_inp_linear.%s"
#define TN_GS_CTC_OUT "a.enc_ctc_out.%s"
#define TN_GS_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
#define TN_GS_ATTN_REL_POS "%s.blk.%d.attn_rel_pos_emb"
#define TN_GS_PROJ_QUERY "a.proj_query"
#define TN_GS_PROJ_NORM "a.proj_norm.%s"
#define TN_GS_PROJ_LINEAR "a.proj_linear.%s"
#define TN_GS_PROJ_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
#define TN_GS_PROJ_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
#define TN_GS_PROJ_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
#define TN_GS_PROJ_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
#define TN_GS_PROJ_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
#define TN_GS_PROJ_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
#define TN_GS_PROJ_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
#define TN_GS_PROJ_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
#define TN_GS_PROJ_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
#define TN_GS_PROJ_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
#define TN_GS_PROJ_FFN_UP "a.proj_blk.%d.ffn_up.%s"
#define TN_GS_PROJ_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
#define TN_GS_PROJ_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"

// gemma4 audio conformer
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
Expand Down Expand Up @@ -293,6 +314,7 @@ enum projector_type {
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_GRANITE_SPEECH,
PROJECTOR_TYPE_UNKNOWN,
};

Expand Down Expand Up @@ -338,6 +360,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
{ PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
Expand Down
Loading
Loading