@@ -843,6 +843,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+            res = "exaone4"

         if res is None:
             logger.warning("\n")
@@ -2861,7 +2864,8 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-        head_dim = self.hparams["head_dim"]
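+        # head_dim may be absent from the config; fall back to hidden_size // num_heads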
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = self.hparams["hidden_size"] // num_heads

         if "ernie." in name:
             name = name.replace("ernie.", "model.")
@@ -2894,6 +2898,93 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Ernie4_5_MoeForCausalLM")
+class Ernie4_5MoeModel(Ernie4_5Model):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._experts = [{} for _ in range(self.block_count)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
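+        # layers before moe_layer_start_index stay dense; after that, MoE layers repeat every moe_layer_interval layers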
+        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (shared_expert_count := self.hparams.get("moe_num_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(shared_expert_count)
+            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get("intermediate_size")) is not None and (num_key_value_heads := self.hparams.get("num_key_value_heads")) is not None:
+                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Modify correction bias name as in DeepseekV2
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
+        match = re.match(r"model.mtp_block.(\d+)", name)
+        if match:
+            return []
+
+        # skip all other MTP tensors for now
+        match = re.match(r"model.mtp_emb_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_linear_proj.(\d+)", name)
+        if match:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["moe_num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
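+            # each expert contributes three tensors (gate_proj, up_proj, down_proj), so wait until all have arrived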
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
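+                    # data_torch now has shape (n_experts, out_features, in_features)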
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
@@ -6692,6 +6783,75 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


+@ModelBase.register("Exaone4ForCausalLM")
+class Exaone4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):
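+        # EXAONE 4.0 uses a BPE tokenizer, so the standard GPT-2-style vocab path applies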
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
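+            # the pattern marks layers that use sliding-window (local) attention with True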
+            if "layer_types" in hparams:
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", "").lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
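+                # llama3-style smoothing: leave high-frequency dims unscaled, stretch low-frequency dims
+                # by `factor`, and interpolate between the two regimes in the middle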
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
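+                # at load time these per-dimension factors divide the base RoPE frequencies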
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
 @ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""