Skip to content

Commit 2e259ca

Browse files
committed
sarashina/00, fixed some bugs
1 parent 4fa02e0 commit 2e259ca

2 files changed

Lines changed: 17 additions & 39 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2947,26 +2947,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
29472947
else:
29482948
return
29492949

2950-
if self.model_type == ModelType.MMPROJ:
2951-
# Block for VLM
2952-
#if name.startswith("llm.") or name in ("norm.weight", "norm.bias"):
2953-
if name.startswith("llm."):
2954-
print(f"skip: {name}")
2955-
return # LLMのみ変換する場合はスキップ
2956-
2957-
# llm. プレフィックスを除去して標準名に変換
2958-
if name.startswith("visual."):
2959-
name = name[len("visual."):]
2960-
2961-
else:
2962-
# Block for LLM
2963-
if name.startswith("visual.") or name in ("norm.weight", "norm.bias"):
2964-
print(f"skip: {name}")
2965-
return # LLMのみ変換する場合はスキップ
2966-
2967-
# llm. プレフィックスを除去して標準名に変換
2950+
if self.origin_hf_arch.startswith('Sarashina2VisionForCausalLM'):
2951+
# Remove llm. from name
29682952
if name.startswith("llm."):
29692953
name = name[len("llm."):]
2954+
elif name.startswith("visual.") or name in ("norm.weight", "norm.bias"):
2955+
return #Skip processing "modify_tensors"
29702956

29712957
yield from super().modify_tensors(data_torch, name, bid)
29722958

@@ -3138,10 +3124,10 @@ def set_gguf_parameters(self):
31383124
# spatial_merge_size
31393125
if "spatial_merge_size" in self.global_config:
31403126
self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
3141-
elif "vision_config" in self.global_config and "spatial_merge_size" in self.global_config["vision_config"]:
3142-
self.gguf_writer.add_vision_spatial_merge_size(self.global_config["vision_config"]["spatial_merge_size"])
3143-
elif self.global_config.get("model_type") == "sarashina2_vision":
3144-
self.gguf_writer.add_vision_spatial_merge_size(2)
3127+
#elif "vision_config" in self.global_config and "spatial_merge_size" in self.global_config["vision_config"]:
3128+
# self.gguf_writer.add_vision_spatial_merge_size(self.global_config["vision_config"]["spatial_merge_size"])
3129+
#elif self.global_config.get("model_type") == "sarashina2_vision":
3130+
# self.gguf_writer.add_vision_spatial_merge_size(2)
31453131

31463132
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
31473133
n_head = (
@@ -12948,7 +12934,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
1294812934
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
1294912935
arch = vision_config["architectures"][0]
1295012936

12951-
if "Sarashina" in arch:
12937+
if "Sarashina2VisionForCausalLM" in arch:
1295212938
arch = "Qwen2VLForConditionalGeneration"
1295312939

1295412940
if arch is None:

tools/mtmd/clip.cpp

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1336,25 +1336,15 @@ struct clip_model_loader {
13361336
} break;
13371337
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
13381338
{
1339-
// 1. デフォルト値のセット
1340-
hparams.n_merge = 2;
1339+
hparams.n_merge = 2; // default value for Sarashina2_VL
13411340
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
1342-
1343-
// 2. GGUFから値を読み込む (第1引数は定数、第2引数は格納先)
1344-
// KEY_SPATIAL_MERGE_SIZE は内部で "clip.vision.spatial_merge_size" 等に紐付いています
13451341
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1346-
1347-
// Qwen 2.5 ではないのでここは false か model.proj_type == ... で判定
13481342
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, false);
1349-
1350-
// 3. 各種制限の設定
13511343
hparams.set_limit_image_tokens(8, 4096);
13521344
hparams.set_warmup_n_tokens(46*46);
1353-
1354-
// 4. 警告チェック
13551345
const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
13561346
if (hparams.image_min_pixels < warn_min_pixels) {
1357-
LOG_WRN("%s: Sarashina2VL models (Qwen2-VL based) require sufficient image tokens\n", __func__);
1347+
LOG_WRN("%s: Sarashina2VL models require sufficient image tokens\n", __func__);
13581348
}
13591349
} break;
13601350
case PROJECTOR_TYPE_YOUTUVL:
@@ -1652,6 +1642,7 @@ struct clip_model_loader {
16521642
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
16531643
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
16541644
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
1645+
|| model.proj_type == PROJECTOR_TYPE_SARASHINA2VL_MERGER
16551646
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
16561647
if (is_ffn_swapped) {
16571648
// swap up and down weights
@@ -2606,11 +2597,11 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
26062597
switch (proj) {
26072598
case PROJECTOR_TYPE_QWEN2VL:
26082599
case PROJECTOR_TYPE_QWEN25VL:
2609-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
26102600
case PROJECTOR_TYPE_QWEN3VL:
26112601
case PROJECTOR_TYPE_GLM4V:
26122602
case PROJECTOR_TYPE_PADDLEOCR:
26132603
case PROJECTOR_TYPE_YOUTUVL:
2604+
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
26142605
return (img->nx / params.patch_size) / 2;
26152606
default:
26162607
break;
@@ -2628,6 +2619,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
26282619
case PROJECTOR_TYPE_GLM4V:
26292620
case PROJECTOR_TYPE_PADDLEOCR:
26302621
case PROJECTOR_TYPE_YOUTUVL:
2622+
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
26312623
return (img->ny / params.patch_size) / 2;
26322624
default:
26332625
break;
@@ -2690,10 +2682,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
26902682
} break;
26912683
case PROJECTOR_TYPE_QWEN2VL:
26922684
case PROJECTOR_TYPE_QWEN25VL:
2693-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
26942685
case PROJECTOR_TYPE_QWEN3VL:
26952686
case PROJECTOR_TYPE_GLM4V:
26962687
case PROJECTOR_TYPE_YOUTUVL:
2688+
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
26972689
{
26982690
// dynamic size (2 conv, so double patch size)
26992691
int x_patch = img->nx / (params.patch_size * 2);
@@ -3020,8 +3012,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
30203012
set_input_i32("positions", positions);
30213013
} break;
30223014
case PROJECTOR_TYPE_QWEN25VL:
3023-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
30243015
case PROJECTOR_TYPE_YOUTUVL:
3016+
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
30253017
{
30263018
// pw * ph = number of tokens output by ViT after apply patch merger
30273019
// ipw * ipw = number of vision token been processed inside ViT
@@ -3343,9 +3335,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
33433335
return ctx->model.mm_model_mlp_3_w->ne[1];
33443336
case PROJECTOR_TYPE_QWEN2VL:
33453337
case PROJECTOR_TYPE_QWEN25VL:
3346-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
33473338
case PROJECTOR_TYPE_JANUS_PRO:
33483339
case PROJECTOR_TYPE_YOUTUVL:
3340+
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
33493341
return ctx->model.mm_1_b->ne[0];
33503342
case PROJECTOR_TYPE_QWEN3VL:
33513343
// main path + deepstack paths

0 commit comments

Comments
 (0)