From 23c7da7d14760825ec7f4d352d88c42d007ff234 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Tue, 19 Aug 2025 10:24:58 +0800 Subject: [PATCH 1/3] support interns1-mini --- convert_hf_to_gguf.py | 84 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b45c8f1d7feb1..b803eebe0d9e0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2917,7 +2917,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "language_model." in name: name = name.replace("language_model.", "") # for InternVL if name.startswith("mlp") or name.startswith("multi_modal_projector") \ - or name.startswith("vision_model") or name.startswith("audio_tower"): + or name.startswith("vision_model") or name.startswith("audio_tower") \ + or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"): # skip vision and audio tensors return [] yield from super().modify_tensors(data_torch, name, bid) @@ -3589,6 +3590,82 @@ def prepare_tensors(self): class Qwen3Model(Qwen2Model): model_arch = gguf.MODEL_ARCH.QWEN3 + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + def set_vocab(self): + # deal with intern-s1-mini + if self.origin_hf_arch == 'InternS1ForConditionalGeneration': + self._set_vocab_interns1() + return + + super().set_vocab() + + def _set_vocab_interns1(self): + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) + vocab_size = self.hparams.get("vocab_size", len(vocab)) + assert max(vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_tokens_map_file = self.dir_model / 'special_tokens_map.json' + additional_special_tokens = [] + if special_tokens_map_file.is_file(): + with open(special_tokens_map_file, encoding = 'utf-8') as f: + additional_special_tokens = json.load(f).get('additional_special_tokens', []) + tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json' + if tokenizer_cfg_file.is_file(): + with open(tokenizer_cfg_file, encoding = 'utf-8') as f: + added_tokens_decoder = json.load(f).get('added_tokens_decoder', {}) + token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']} + for token in additional_special_tokens: + if token in token2ids_map: + special_vocab._set_special_token(token, token2ids_map[token]) + special_vocab._set_special_token('eos', 151645) + special_vocab._set_special_token("bos", 151643) + special_vocab.add_to_gguf(self.gguf_writer) + @ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): @@ -3605,10 +3682,7 @@ def set_vocab(self): self._set_vocab_interns1() return - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() + super().set_vocab() def _set_vocab_interns1(self): tokens: list[str] = [] From 37888ba4dc7bd34e56808b4598126610b16bf3bf Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 22 Aug 2025 18:32:29 +0800 Subject: [PATCH 2/3] fix comment --- convert_hf_to_gguf.py | 174 ++++++++++++------------------------------ 1 file changed, 48 insertions(+), 126 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b803eebe0d9e0..ec5fe6b6e005c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1210,6 +1210,54 @@ def _try_set_pooling_type(self) -> None: raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") self.gguf_writer.add_pooling_type(pooling_type) + def _set_vocab_interns1(self): + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) + vocab_size = self.hparams.get("vocab_size", len(vocab)) + assert max(vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + class MmprojModel(ModelBase): model_type = ModelType.MMPROJ @@ -3603,69 +3651,6 @@ def set_vocab(self): super().set_vocab() - def _set_vocab_interns1(self): - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) - vocab_size = self.hparams.get("vocab_size", len(vocab)) - assert max(vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - added_tokens_decoder = tokenizer.added_tokens_decoder - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_tokens_map_file = self.dir_model / 'special_tokens_map.json' - additional_special_tokens = [] - if special_tokens_map_file.is_file(): - with open(special_tokens_map_file, encoding = 'utf-8') as f: - additional_special_tokens = json.load(f).get('additional_special_tokens', []) - tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json' - if tokenizer_cfg_file.is_file(): - with open(tokenizer_cfg_file, encoding = 'utf-8') as f: - added_tokens_decoder = json.load(f).get('added_tokens_decoder', {}) - token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']} - for token in additional_special_tokens: - if token in token2ids_map: - special_vocab._set_special_token(token, token2ids_map[token]) - special_vocab._set_special_token('eos', 151645) - special_vocab._set_special_token("bos", 151643) - special_vocab.add_to_gguf(self.gguf_writer) - @ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): @@ -3684,69 +3669,6 @@ def set_vocab(self): super().set_vocab() - def _set_vocab_interns1(self): - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) - vocab_size = self.hparams.get("vocab_size", len(vocab)) - assert max(vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - added_tokens_decoder = tokenizer.added_tokens_decoder - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_tokens_map_file = self.dir_model / 'special_tokens_map.json' - additional_special_tokens = [] - if special_tokens_map_file.is_file(): - with open(special_tokens_map_file, encoding = 'utf-8') as f: - additional_special_tokens = json.load(f).get('additional_special_tokens', []) - tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json' - if tokenizer_cfg_file.is_file(): - with open(tokenizer_cfg_file, encoding = 'utf-8') as f: - added_tokens_decoder = json.load(f).get('added_tokens_decoder', {}) - token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']} - for token in additional_special_tokens: - if token in token2ids_map: - special_vocab._set_special_token(token, token2ids_map[token]) - special_vocab._set_special_token('eos', 151645) - special_vocab._set_special_token("bos", 151643) - special_vocab.add_to_gguf(self.gguf_writer) - @ModelBase.register("GPT2LMHeadModel") class GPT2Model(TextModel): From dbbe1f7a07128e9b4bef4a8175b89724a69b3f14 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Mon, 25 Aug 2025 10:51:44 +0800 Subject: [PATCH 3/3] update --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ec5fe6b6e005c..80b5380aba0c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1256,6 +1256,7 @@ def _set_vocab_interns1(self): self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("bos", 151643) special_vocab.add_to_gguf(self.gguf_writer)