From f57d168c58db8844e5d44e5762aa0ee2d7dae39e Mon Sep 17 00:00:00 2001
From: lucylq
Date: Fri, 17 Oct 2025 13:27:50 -0700
Subject: [PATCH] [tokenizers][PR] Parse special_tokens_map.json

Add functionality to hf_tokenizer to parse special_tokens_map.json, which is
the source of truth for which bos/eos tokens to use.

Differential Revision: [D84878533](https://our.internmc.facebook.com/intern/diff/D84878533/)

[ghstack-poisoned]
---
 .../pytorch/tokenizers/bpe_tokenizer_base.h   |   3 +-
 src/hf_tokenizer.cpp                          | 132 ++++++++++-----
 .../hf_tokenizer_dir/special_tokens_map.json  |  16 ++
 .../resources/hf_tokenizer_dir/tokenizer.json | 152 ++++++++++++++++++
 .../hf_tokenizer_dir/tokenizer_config.json    |  42 +++++
 test/test_hf_tokenizer.cpp                    |  15 ++
 test/test_hf_tokenizer.py                     |  20 +++
 7 files changed, 338 insertions(+), 42 deletions(-)
 create mode 100644 test/resources/hf_tokenizer_dir/special_tokens_map.json
 create mode 100644 test/resources/hf_tokenizer_dir/tokenizer.json
 create mode 100644 test/resources/hf_tokenizer_dir/tokenizer_config.json

diff --git a/include/pytorch/tokenizers/bpe_tokenizer_base.h b/include/pytorch/tokenizers/bpe_tokenizer_base.h
index 5e5c05d..5eb01f7 100644
--- a/include/pytorch/tokenizers/bpe_tokenizer_base.h
+++ b/include/pytorch/tokenizers/bpe_tokenizer_base.h
@@ -122,7 +122,8 @@ inline Result<std::unique_ptr<IRegex>> build_special_token_regex(
   if (special_pattern.empty()) {
     return static_cast<std::unique_ptr<IRegex>>(nullptr);
   }
-  return create_regex(special_pattern);
+  // Wrap pattern in parentheses for proper grouping
+  return create_regex("(" + special_pattern + ")");
 }
 
 class BPETokenizerBase : public Tokenizer {
diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
index d26be5f..3bd8e9a 100644
--- a/src/hf_tokenizer.cpp
+++ b/src/hf_tokenizer.cpp
@@ -25,6 +25,17 @@ using json = nlohmann::json;
 
 namespace tokenizers {
 
+namespace {
+// Helper to extract token string from either string or object format
+std::string extract_token_string(const json& token_json) {
+  if (token_json.is_string()) {
+    return token_json.get<std::string>();
+  } else if (token_json.is_object() && token_json.contains("content")) {
+    return token_json["content"].get<std::string>();
+  }
+  return "";
+};
+} // namespace
 // -------------------------private method end-------------------------------
 // -------------------------public method start------------------------------
 
@@ -32,6 +43,12 @@ Error HFTokenizer::load(const std::string& path) {
   // If this is a directory, look for tokenizer.json and tokenizer_config.json
   std::string model_json = path;
   std::string model_config_json = "";
+  std::string special_tokens_map_json;
+
+  // Check if bos/eos found.
+  bool bos_found = false;
+  bool eos_found = false;
+
   if (fs::is_directory(path)) {
     const fs::path root(path);
     model_json = (root / "tokenizer.json").string();
@@ -43,6 +60,11 @@ Error HFTokenizer::load(const std::string& path) {
     if (fs::exists(model_config_json_path)) {
       model_config_json = model_config_json_path.string();
     }
+
+    const auto special_tokens_map_json_path = root / "special_tokens_map.json";
+    if (fs::exists(special_tokens_map_json_path)) {
+      special_tokens_map_json = special_tokens_map_json_path.string();
+    }
   }
 
   // Load the tokenizer.json file
@@ -63,7 +85,6 @@ Error HFTokenizer::load(const std::string& path) {
 
   // Parse the special tokens
   try {
-    std::vector<std::pair<std::string, std::uint64_t>> special_token_pairs;
     const auto& special_tokens = parsed_json.at("added_tokens");
     auto special_token_map_result = detail::build_token_map(
         special_tokens,
@@ -213,8 +234,37 @@ Error HFTokenizer::load(const std::string& path) {
     return Error::LoadFailure;
   }
 
-  // If a tokenizer config file is found, parse it to look up the eos/bos tokens
-  if (!model_config_json.empty()) {
+  // Try special_tokens_map.json first
+  std::string bos_token;
+  std::string eos_token;
+
+  if (!special_tokens_map_json.empty()) {
+    std::ifstream special_file(special_tokens_map_json);
+    if (special_file) {
+      try {
+        json special_tokens_json = json::parse(std::string(
+            (std::istreambuf_iterator<char>(special_file)),
+            std::istreambuf_iterator<char>()));
+
+        if (special_tokens_json.contains("bos_token")) {
+          bos_token = extract_token_string(special_tokens_json["bos_token"]);
+        }
+        if (special_tokens_json.contains("eos_token")) {
+          eos_token = extract_token_string(special_tokens_json["eos_token"]);
+        }
+
+        TK_LOG(
+            Info,
+            "Loaded tokens from special_tokens_map.json: bos='%s', eos='%s'",
+            bos_token.c_str(),
+            eos_token.c_str());
+      } catch (const std::exception& e) {
+        TK_LOG(Info, "Could not parse special_tokens_map.json: %s", e.what());
+      }
+    }
+  }
+  // Try tokenizer_config.json next
+  if ((bos_token.empty() || eos_token.empty()) && !model_config_json.empty()) {
     // Load it and parse it as json
     std::ifstream config_file(model_config_json);
     if (!config_file) {
@@ -224,40 +274,41 @@ Error HFTokenizer::load(const std::string& path) {
     std::string config_contents(
         (std::istreambuf_iterator<char>(config_file)),
         std::istreambuf_iterator<char>());
-    json parsed_config_json;
     try {
-      parsed_config_json = json::parse(config_contents);
+      json parsed_config_json = json::parse(config_contents);
+      if (bos_token.empty() && parsed_config_json.contains("bos_token")) {
+        bos_token = extract_token_string(parsed_config_json["bos_token"]);
+      }
+      if (eos_token.empty() && parsed_config_json.contains("eos_token")) {
+        eos_token = extract_token_string(parsed_config_json["eos_token"]);
+      }
+      TK_LOG(
+          Info,
+          "Loaded tokens from tokenizer_config.json: bos='%s', eos='%s'",
+          bos_token.c_str(),
+          eos_token.c_str());
     } catch (const std::exception& e) {
       TK_LOG(Error, "Error parsing model config json json file: %s", e.what());
      return Error::LoadFailure;
    }
+  }
 
-    // Pull out the token strings
-    try {
-      const std::string bos_token = parsed_config_json.contains("bos_token") &&
-              !parsed_config_json["bos_token"].is_null()
-          ? parsed_config_json["bos_token"].get<std::string>()
-          : "";
-
-      const std::string eos_token = parsed_config_json.contains("eos_token") &&
-              !parsed_config_json["eos_token"].is_null()
-          ? parsed_config_json["eos_token"].get<std::string>()
-          : "";
-      const auto bos_res = special_token_map_->tryGetInteger(bos_token);
-      const auto eos_res = special_token_map_->tryGetInteger(eos_token);
-      if (!bos_res) {
-        TK_LOG(Error, "BOS token %s not in special tokens", bos_token.c_str());
-        return Error::LoadFailure;
-      }
-      if (!eos_res) {
-        TK_LOG(Error, "EOS token %s not in special tokens", eos_token.c_str());
-        return Error::LoadFailure;
-      }
-      bos_tok_ = *bos_res;
-      eos_tok_ = *eos_res;
-    } catch (const std::exception& e) {
-      TK_LOG(Error, "Could not eos/bos from tokenizer config: %s", e.what());
-      return Error::LoadFailure;
+  // Try to extract the bos/eos tokens.
+  if (!bos_token.empty() && !eos_token.empty()) {
+    auto bos_candidate = special_token_map_->tryGetInteger(bos_token);
+    if (!bos_candidate) {
+      TK_LOG(Info, "BOS token %s not in special tokens", bos_token.c_str());
+    } else {
+      bos_tok_ = *bos_candidate;
+      bos_found = true;
+    }
+
+    auto eos_candidate = special_token_map_->tryGetInteger(eos_token);
+    if (!eos_candidate) {
+      TK_LOG(Info, "EOS token %s not in special tokens", eos_token.c_str());
+    } else {
+      eos_tok_ = *eos_candidate;
+      eos_found = true;
     }
   }
 
@@ -265,18 +316,20 @@ Error HFTokenizer::load(const std::string& path) {
   // 1. Look for special tokens with "bos"/"begin" or "eos"/"end" in them
   // 2. Sub-qualify with the word "text" if needed
   // 3. If EOS found, but BOS is not (or vice versa), assume they are the same
-  else {
+  if (!eos_found || !bos_found) {
     std::vector<std::string> bos_candidates;
    std::vector<std::string> eos_candidates;
    for (std::size_t token_idx = 0; token_idx < special_token_map_->size();
         ++token_idx) {
      const auto [token, _] = special_token_map_->getElement(token_idx);
-      if (token.find("bos") != std::string::npos ||
-          token.find("begin") != std::string::npos) {
+      if (!bos_found &&
+          (token.find("bos") != std::string::npos ||
+           token.find("begin") != std::string::npos)) {
        bos_candidates.push_back(token);
      }
-      if (token.find("eos") != std::string::npos ||
-          token.find("end") != std::string::npos) {
+      if (!eos_found &&
+          (token.find("eos") != std::string::npos ||
+           token.find("end") != std::string::npos)) {
        eos_candidates.push_back(token);
      }
    }
@@ -300,14 +353,11 @@ Error HFTokenizer::load(const std::string& path) {
       }
     }
 
-    // Use if a single candidate
-    bool bos_found = false;
-    bool eos_found = false;
-    if (bos_candidates.size() == 1) {
+    if (!bos_found && bos_candidates.size() == 1) {
       bos_found = true;
       bos_tok_ = *(special_token_map_->tryGetInteger(bos_candidates[0]));
     }
-    if (eos_candidates.size() == 1) {
+    if (!eos_found && eos_candidates.size() == 1) {
       eos_found = true;
       eos_tok_ = *(special_token_map_->tryGetInteger(eos_candidates[0]));
     }
diff --git a/test/resources/hf_tokenizer_dir/special_tokens_map.json b/test/resources/hf_tokenizer_dir/special_tokens_map.json
new file mode 100644
index 0000000..02ee80b
--- /dev/null
+++ b/test/resources/hf_tokenizer_dir/special_tokens_map.json
@@ -0,0 +1,16 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/test/resources/hf_tokenizer_dir/tokenizer.json b/test/resources/hf_tokenizer_dir/tokenizer.json
new file mode 100644
index 0000000..0215484
--- /dev/null
+++ b/test/resources/hf_tokenizer_dir/tokenizer.json
@@ -0,0 +1,152 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
"id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128000, + "content": "<|begin_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128001, + "content": "<|end_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128009, + "content": "<|eot_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Replace", + "pattern": { + "String": " " + }, + "content": "▁" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "MergedWithPrevious", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "▁": 3, + "H": 4, + "e": 5, + "l": 6, + "o": 7, + "▁Hello": 8, + "▁world!": 9, + "w": 10, + "r": 11, + "d": 12, + "!": 13 + }, + "merges": [ + "H e", + "e l", + "l l", + "l o", + "▁ H", + "▁H e", + "▁He l", + "▁Hel l", + "▁Hell o", + "w o", + "o r", + "r l", + "l d", + "d !", + "▁ w", + "▁w o", + "▁wo r", + "▁wor l", + "▁worl d", + "▁world !" 
+    ]
+  }
+}
diff --git a/test/resources/hf_tokenizer_dir/tokenizer_config.json b/test/resources/hf_tokenizer_dir/tokenizer_config.json
new file mode 100644
index 0000000..2a04eaa
--- /dev/null
+++ b/test/resources/hf_tokenizer_dir/tokenizer_config.json
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": true,
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|finetune_right_pad_id|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/test/test_hf_tokenizer.cpp b/test/test_hf_tokenizer.cpp
index 3bae037..f64bd8a 100644
--- a/test/test_hf_tokenizer.cpp
+++ b/test/test_hf_tokenizer.cpp
@@ -44,6 +44,21 @@ TEST(HFTokenizerTest, TestLoadInvalidPath) {
   EXPECT_EQ(error, Error::LoadFailure);
 }
 
+TEST(HFTokenizerTest, TestSpecialTokensMap) {
+  HFTokenizer tokenizer;
+  auto path = _get_resource_path("hf_tokenizer_dir/");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+
+  // Verify bos_token is loaded from special_tokens_map.json
+  auto bos_token_id = tokenizer.bos_tok();
+  EXPECT_EQ(bos_token_id, 128000); // <|begin_of_text|>
+
+  // Verify eos_token is loaded from special_tokens_map.json
+  auto eos_token_id = tokenizer.eos_tok();
+  EXPECT_EQ(eos_token_id, 128009); // <|eot_id|>
+}
+
 TEST(HFTokenizerTest, TestEncode) {
   HFTokenizer tokenizer;
   auto path = _get_resource_path("test_hf_tokenizer.json");
diff --git a/test/test_hf_tokenizer.py b/test/test_hf_tokenizer.py
index dbed244..cd60883 100644
--- a/test/test_hf_tokenizer.py
+++ b/test/test_hf_tokenizer.py
@@ -49,6 +49,26 @@ def test_llama3_2_1b(self) -> None:
         cpp_tokens = cpp_tokenizer.encode(PROMPT, bos=1)
         self.assertEqual(tokens, cpp_tokens)
 
+    def test_llama3_2_1b_special_toks(self) -> None:
+        tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
+        tokenizer.save_pretrained(self.temp_dir.name)
+
+        cpp_tokenizer = CppHFTokenizer()
+        cpp_tokenizer.load(self.temp_dir.name)
+
+        tokens = tokenizer.encode(PROMPT)
+        cpp_tokens = cpp_tokenizer.encode(PROMPT, bos=1)
+        self.assertEqual(tokens, cpp_tokens)
+
+        bos_id = tokenizer.convert_tokens_to_ids(
+            tokenizer.special_tokens_map["bos_token"]
+        )
+        eos_id = tokenizer.convert_tokens_to_ids(
+            tokenizer.special_tokens_map["eos_token"]
+        )
+        self.assertEqual(cpp_tokenizer.bos_tok(), bos_id)
+        self.assertEqual(cpp_tokenizer.eos_tok(), eos_id)
+
     def test_phi_4_mini(self) -> None:
         tokenizer = AutoTokenizer.from_pretrained(
             "software-mansion/react-native-executorch-phi-4-mini"