@@ -1336,25 +1336,15 @@ struct clip_model_loader {
13361336 } break ;
13371337 case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
13381338 {
1339- // 1. デフォルト値のセット
1340- hparams.n_merge = 2 ;
1339+ hparams.n_merge = 2 ; // default value for Sarashina2_VL
13411340 hparams.image_resize_algo = RESIZE_ALGO_BILINEAR ;
1342-
1343- // 2. GGUFから値を読み込む (第1引数は定数、第2引数は格納先)
1344- // KEY_SPATIAL_MERGE_SIZE は内部で "clip.vision.spatial_merge_size" 等に紐付いています
13451341 get_u32 (KEY_SPATIAL_MERGE_SIZE , hparams.n_merge , false );
1346-
1347- // Qwen 2.5 ではないのでここは false か model.proj_type == ... で判定
13481342 get_u32 (KEY_WIN_ATTN_PATTERN , hparams.n_wa_pattern , false );
1349-
1350- // 3. 各種制限の設定
13511343 hparams.set_limit_image_tokens (8 , 4096 );
13521344 hparams.set_warmup_n_tokens (46 *46 );
1353-
1354- // 4. 警告チェック
13551345 const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size ;
13561346 if (hparams.image_min_pixels < warn_min_pixels) {
1357- LOG_WRN (" %s: Sarashina2VL models (Qwen2-VL based) require sufficient image tokens\n " , __func__);
1347+ LOG_WRN (" %s: Sarashina2VL models require sufficient image tokens\n " , __func__);
13581348 }
13591349 } break ;
13601350 case PROJECTOR_TYPE_YOUTUVL :
@@ -1652,6 +1642,7 @@ struct clip_model_loader {
16521642 || model.proj_type == PROJECTOR_TYPE_GEMMA3
16531643 || model.proj_type == PROJECTOR_TYPE_IDEFICS3
16541644 || model.proj_type == PROJECTOR_TYPE_MINICPMV
1645+ || model.proj_type == PROJECTOR_TYPE_SARASHINA2VL_MERGER
16551646 ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w ->ne [0 ] == hparams.n_embd ;
16561647 if (is_ffn_swapped) {
16571648 // swap up and down weights
@@ -2606,11 +2597,11 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
26062597 switch (proj) {
26072598 case PROJECTOR_TYPE_QWEN2VL :
26082599 case PROJECTOR_TYPE_QWEN25VL :
2609- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
26102600 case PROJECTOR_TYPE_QWEN3VL :
26112601 case PROJECTOR_TYPE_GLM4V :
26122602 case PROJECTOR_TYPE_PADDLEOCR :
26132603 case PROJECTOR_TYPE_YOUTUVL :
2604+ case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
26142605 return (img->nx / params.patch_size ) / 2 ;
26152606 default :
26162607 break ;
@@ -2628,6 +2619,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
26282619 case PROJECTOR_TYPE_GLM4V :
26292620 case PROJECTOR_TYPE_PADDLEOCR :
26302621 case PROJECTOR_TYPE_YOUTUVL :
2622+ case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
26312623 return (img->ny / params.patch_size ) / 2 ;
26322624 default :
26332625 break ;
@@ -2690,10 +2682,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
26902682 } break ;
26912683 case PROJECTOR_TYPE_QWEN2VL :
26922684 case PROJECTOR_TYPE_QWEN25VL :
2693- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
26942685 case PROJECTOR_TYPE_QWEN3VL :
26952686 case PROJECTOR_TYPE_GLM4V :
26962687 case PROJECTOR_TYPE_YOUTUVL :
2688+ case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
26972689 {
26982690 // dynamic size (2 conv, so double patch size)
26992691 int x_patch = img->nx / (params.patch_size * 2 );
@@ -3020,8 +3012,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
30203012 set_input_i32 (" positions" , positions);
30213013 } break ;
30223014 case PROJECTOR_TYPE_QWEN25VL :
3023- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
30243015 case PROJECTOR_TYPE_YOUTUVL :
3016+ case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
30253017 {
30263018 // pw * ph = number of tokens output by ViT after applying the patch merger
30273019 // ipw * ipw = number of vision tokens processed inside ViT
@@ -3343,9 +3335,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
33433335 return ctx->model .mm_model_mlp_3_w ->ne [1 ];
33443336 case PROJECTOR_TYPE_QWEN2VL :
33453337 case PROJECTOR_TYPE_QWEN25VL :
3346- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
33473338 case PROJECTOR_TYPE_JANUS_PRO :
33483339 case PROJECTOR_TYPE_YOUTUVL :
3340+ case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
33493341 return ctx->model .mm_1_b ->ne [0 ];
33503342 case PROJECTOR_TYPE_QWEN3VL :
33513343 // main path + deepstack paths
0 commit comments