From f57d168c58db8844e5d44e5762aa0ee2d7dae39e Mon Sep 17 00:00:00 2001
From: lucylq
Date: Fri, 17 Oct 2025 13:27:50 -0700
Subject: [PATCH] [tokenizers][PR] Parse special_tokens_map.json

Add functionality to hf_tokenizer to parse special_tokens_map.json, which is
the source of truth for which bos/eos tokens to use.

Differential Revision: [D84878533](https://our.internmc.facebook.com/intern/diff/D84878533/)

[ghstack-poisoned]
---
 .../pytorch/tokenizers/bpe_tokenizer_base.h   |   3 +-
 src/hf_tokenizer.cpp                          | 132 ++++++++++-----
 .../hf_tokenizer_dir/special_tokens_map.json  |  16 ++
 .../resources/hf_tokenizer_dir/tokenizer.json | 152 ++++++++++++++++++
 .../hf_tokenizer_dir/tokenizer_config.json    |  42 +++++
 test/test_hf_tokenizer.cpp                    |  15 ++
 test/test_hf_tokenizer.py                     |  20 +++
 7 files changed, 338 insertions(+), 42 deletions(-)
 create mode 100644 test/resources/hf_tokenizer_dir/special_tokens_map.json
 create mode 100644 test/resources/hf_tokenizer_dir/tokenizer.json
 create mode 100644 test/resources/hf_tokenizer_dir/tokenizer_config.json

diff --git a/include/pytorch/tokenizers/bpe_tokenizer_base.h b/include/pytorch/tokenizers/bpe_tokenizer_base.h
index 5e5c05d..5eb01f7 100644
--- a/include/pytorch/tokenizers/bpe_tokenizer_base.h
+++ b/include/pytorch/tokenizers/bpe_tokenizer_base.h
@@ -122,7 +122,8 @@ inline Result<std::unique_ptr<IRegex>> build_special_token_regex(
   if (special_pattern.empty()) {
     return static_cast<std::unique_ptr<IRegex>>(nullptr);
   }
-  return create_regex(special_pattern);
+  // Wrap pattern in parentheses for proper grouping
+  return create_regex("(" + special_pattern + ")");
 }
 
 class BPETokenizerBase : public Tokenizer {
diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
index d26be5f..3bd8e9a 100644
--- a/src/hf_tokenizer.cpp
+++ b/src/hf_tokenizer.cpp
@@ -25,6 +25,17 @@ using json = nlohmann::json;
 
 namespace tokenizers {
 
+namespace {
+// Helper to extract token string from either string or object format
+std::string extract_token_string(const json& token_json) {
+  if (token_json.is_string()) {
+    return token_json.get<std::string>();
+  } else if (token_json.is_object() && token_json.contains("content")) {
+    return token_json["content"].get<std::string>();
+  }
+  return "";
+};
+} // namespace
 // -------------------------private method end-------------------------------
 // -------------------------public method start------------------------------
 
@@ -32,6 +43,12 @@ Error HFTokenizer::load(const std::string& path) {
   // If this is a directory, look for tokenizer.json and tokenizer_config.json
   std::string model_json = path;
   std::string model_config_json = "";
+  std::string special_tokens_map_json;
+
+  // Check if bos/eos found.
+  bool bos_found = false;
+  bool eos_found = false;
+
   if (fs::is_directory(path)) {
     const fs::path root(path);
     model_json = (root / "tokenizer.json").string();
@@ -43,6 +60,11 @@ Error HFTokenizer::load(const std::string& path) {
     if (fs::exists(model_config_json_path)) {
       model_config_json = model_config_json_path.string();
     }
+
+    const auto special_tokens_map_json_path = root / "special_tokens_map.json";
+    if (fs::exists(special_tokens_map_json_path)) {
+      special_tokens_map_json = special_tokens_map_json_path.string();
+    }
   }
 
   // Load the tokenizer.json file
@@ -63,7 +85,6 @@ Error HFTokenizer::load(const std::string& path) {
 
   // Parse the special tokens
   try {
-    std::vector<std::pair<std::string, std::uint64_t>> special_token_pairs;
     const auto& special_tokens = parsed_json.at("added_tokens");
     auto special_token_map_result = detail::build_token_map(
         special_tokens,
@@ -213,8 +234,37 @@ Error HFTokenizer::load(const std::string& path) {
     return Error::LoadFailure;
   }
 
-  // If a tokenizer config file is found, parse it to look up the eos/bos tokens
-  if (!model_config_json.empty()) {
+  // Try special_tokens_map.json first
+  std::string bos_token;
+  std::string eos_token;
+
+  if (!special_tokens_map_json.empty()) {
+    std::ifstream special_file(special_tokens_map_json);
+    if (special_file) {
+      try {
+        json special_tokens_json = json::parse(std::string(
+            (std::istreambuf_iterator<char>(special_file)),
+            std::istreambuf_iterator<char>()));
+
+        if (special_tokens_json.contains("bos_token")) {
+          bos_token = extract_token_string(special_tokens_json["bos_token"]);
+        }
+        if (special_tokens_json.contains("eos_token")) {
+          eos_token = extract_token_string(special_tokens_json["eos_token"]);
+        }
+
+        TK_LOG(
+            Info,
+            "Loaded tokens from special_tokens_map.json: bos='%s', eos='%s'",
+            bos_token.c_str(),
+            eos_token.c_str());
+      } catch (const std::exception& e) {
+        TK_LOG(Info, "Could not parse special_tokens_map.json: %s", e.what());
+      }
+    }
+  }
+  // Try tokenizer_config.json next
+  if ((bos_token.empty() || eos_token.empty()) && !model_config_json.empty()) {
     // Load it and parse it as json
     std::ifstream config_file(model_config_json);
     if (!config_file) {
@@ -224,40 +274,41 @@ Error HFTokenizer::load(const std::string& path) {
     std::string config_contents(
         (std::istreambuf_iterator<char>(config_file)),
         std::istreambuf_iterator<char>());
-    json parsed_config_json;
     try {
-      parsed_config_json = json::parse(config_contents);
+      json parsed_config_json = json::parse(config_contents);
+      if (bos_token.empty() && parsed_config_json.contains("bos_token")) {
+        bos_token = extract_token_string(parsed_config_json["bos_token"]);
+      }
+      if (eos_token.empty() && parsed_config_json.contains("eos_token")) {
+        eos_token = extract_token_string(parsed_config_json["eos_token"]);
+      }
+      TK_LOG(
+          Info,
+          "Loaded tokens from tokenizer_config.json: bos='%s', eos='%s'",
+          bos_token.c_str(),
+          eos_token.c_str());
     } catch (const std::exception& e) {
       TK_LOG(Error, "Error parsing model config json json file: %s", e.what());
      return Error::LoadFailure;
    }
+  }
 
-    // Pull out the token strings
-    try {
-      const std::string bos_token = parsed_config_json.contains("bos_token") &&
-              !parsed_config_json["bos_token"].is_null()
-          ? parsed_config_json["bos_token"].get<std::string>()
-          : "";
-
-      const std::string eos_token = parsed_config_json.contains("eos_token") &&
-              !parsed_config_json["eos_token"].is_null()
-          ? parsed_config_json["eos_token"].get<std::string>()
-          : "";
-      const auto bos_res = special_token_map_->tryGetInteger(bos_token);
-      const auto eos_res = special_token_map_->tryGetInteger(eos_token);
-      if (!bos_res) {
-        TK_LOG(Error, "BOS token %s not in special tokens", bos_token.c_str());
-        return Error::LoadFailure;
-      }
-      if (!eos_res) {
-        TK_LOG(Error, "EOS token %s not in special tokens", eos_token.c_str());
-        return Error::LoadFailure;
-      }
-      bos_tok_ = *bos_res;
-      eos_tok_ = *eos_res;
-    } catch (const std::exception& e) {
-      TK_LOG(Error, "Could not eos/bos from tokenizer config: %s", e.what());
-      return Error::LoadFailure;
+  // Try to extract the bos/eos tokens.
+  if (!bos_token.empty() && !eos_token.empty()) {
+    auto bos_candidate = special_token_map_->tryGetInteger(bos_token);
+    if (!bos_candidate) {
+      TK_LOG(Info, "BOS token %s not in special tokens", bos_token.c_str());
+    } else {
+      bos_tok_ = *bos_candidate;
+      bos_found = true;
+    }
+
+    auto eos_candidate = special_token_map_->tryGetInteger(eos_token);
+    if (!eos_candidate) {
+      TK_LOG(Info, "EOS token %s not in special tokens", eos_token.c_str());
+    } else {
+      eos_tok_ = *eos_candidate;
+      eos_found = true;
     }
   }
 
@@ -265,18 +316,20 @@ Error HFTokenizer::load(const std::string& path) {
   // 1. Look for special tokens with "bos"/"begin" or "eos"/"end" in them
   // 2. Sub-qualify with the word "text" if needed
   // 3. If EOS found, but BOS is not (or vice versa), assume they are the same
-  else {
+  if (!eos_found || !bos_found) {
     std::vector<std::string> bos_candidates;
    std::vector<std::string> eos_candidates;
    for (std::size_t token_idx = 0; token_idx < special_token_map_->size();
         ++token_idx) {
      const auto [token, _] = special_token_map_->getElement(token_idx);
-      if (token.find("bos") != std::string::npos ||
-          token.find("begin") != std::string::npos) {
+      if (!bos_found &&
+          (token.find("bos") != std::string::npos ||
+           token.find("begin") != std::string::npos)) {
        bos_candidates.push_back(token);
      }
-      if (token.find("eos") != std::string::npos ||
-          token.find("end") != std::string::npos) {
+      if (!eos_found &&
+          (token.find("eos") != std::string::npos ||
+           token.find("end") != std::string::npos)) {
        eos_candidates.push_back(token);
      }
    }
@@ -300,14 +353,11 @@ Error HFTokenizer::load(const std::string& path) {
       }
     }
 
-    // Use if a single candidate
-    bool bos_found = false;
-    bool eos_found = false;
-    if (bos_candidates.size() == 1) {
+    if (!bos_found && bos_candidates.size() == 1) {
       bos_found = true;
       bos_tok_ = *(special_token_map_->tryGetInteger(bos_candidates[0]));
     }
-    if (eos_candidates.size() == 1) {
+    if (!eos_found && eos_candidates.size() == 1) {
       eos_found = true;
       eos_tok_ = *(special_token_map_->tryGetInteger(eos_candidates[0]));
     }
diff --git a/test/resources/hf_tokenizer_dir/special_tokens_map.json b/test/resources/hf_tokenizer_dir/special_tokens_map.json
new file mode 100644
index 0000000..02ee80b
--- /dev/null
+++ b/test/resources/hf_tokenizer_dir/special_tokens_map.json
@@ -0,0 +1,16 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/test/resources/hf_tokenizer_dir/tokenizer.json b/test/resources/hf_tokenizer_dir/tokenizer.json
new file mode 100644
index 0000000..0215484
--- /dev/null
+++ b/test/resources/hf_tokenizer_dir/tokenizer.json
@@ -0,0 +1,152 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
"id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128000, + "content": "<|begin_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128001, + "content": "<|end_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128009, + "content": "<|eot_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Replace", + "pattern": { + "String": " " + }, + "content": "▁" + } + ] + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "MergedWithPrevious", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "▁": 3, + "H": 4, + "e": 5, + "l": 6, + "o": 7, + "▁Hello": 8, + "▁world!": 9, + "w": 10, + "r": 11, + "d": 12, + "!": 13 + }, + "merges": [ + "H e", + "e l", + "l l", + "l o", + "▁ H", + "▁H e", + "▁He l", + "▁Hel l", + "▁Hell o", + "w o", + "o r", + "r l", + "l d", + "d !", + "▁ w", + "▁w o", + "▁wo r", + "▁wor l", + "▁worl d", + "▁world !" 
+    ]
+  }
+}
diff --git a/test/resources/hf_tokenizer_dir/tokenizer_config.json b/test/resources/hf_tokenizer_dir/tokenizer_config.json
new file mode 100644
index 0000000..2a04eaa
--- /dev/null
+++ b/test/resources/hf_tokenizer_dir/tokenizer_config.json
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": true,
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|finetune_right_pad_id|>",
+  "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": null
+}
diff --git a/test/test_hf_tokenizer.cpp b/test/test_hf_tokenizer.cpp
index 3bae037..f64bd8a 100644
--- a/test/test_hf_tokenizer.cpp
+++ b/test/test_hf_tokenizer.cpp
@@ -44,6 +44,21 @@ TEST(HFTokenizerTest, TestLoadInvalidPath) {
   EXPECT_EQ(error, Error::LoadFailure);
 }
 
+TEST(HFTokenizerTest, TestSpecialTokensMap) {
+  HFTokenizer tokenizer;
+  auto path = _get_resource_path("hf_tokenizer_dir/");
+  auto error = tokenizer.load(path);
+  EXPECT_EQ(error, Error::Ok);
+
+  // Verify bos_token is loaded from special_tokens_map.json
+  auto bos_token_id = tokenizer.bos_tok();
+  EXPECT_EQ(bos_token_id, 128000); // <|begin_of_text|>
+
+  // Verify eos_token is loaded from special_tokens_map.json
+  auto eos_token_id = tokenizer.eos_tok();
+  EXPECT_EQ(eos_token_id, 128009); // <|eot_id|>
+}
+
 TEST(HFTokenizerTest, TestEncode) {
   HFTokenizer tokenizer;
   auto path = _get_resource_path("test_hf_tokenizer.json");
diff --git a/test/test_hf_tokenizer.py b/test/test_hf_tokenizer.py
index dbed244..cd60883 100644
--- a/test/test_hf_tokenizer.py
+++ b/test/test_hf_tokenizer.py
@@ -49,6 +49,26 @@ def test_llama3_2_1b(self) -> None:
         cpp_tokens = cpp_tokenizer.encode(PROMPT, bos=1)
         self.assertEqual(tokens, cpp_tokens)
 
+    def test_llama3_2_1b_special_toks(self) -> None:
+        tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
+        tokenizer.save_pretrained(self.temp_dir.name)
+
+        cpp_tokenizer = CppHFTokenizer()
+        cpp_tokenizer.load(self.temp_dir.name)
+
+        tokens = tokenizer.encode(PROMPT)
+        cpp_tokens = cpp_tokenizer.encode(PROMPT, bos=1)
+        self.assertEqual(tokens, cpp_tokens)
+
+        bos_id = tokenizer.convert_tokens_to_ids(
+            tokenizer.special_tokens_map["bos_token"]
+        )
+        eos_id = tokenizer.convert_tokens_to_ids(
+            tokenizer.special_tokens_map["eos_token"]
+        )
+        self.assertEqual(cpp_tokenizer.bos_tok(), bos_id)
+        self.assertEqual(cpp_tokenizer.eos_tok(), eos_id)
+
     def test_phi_4_mini(self) -> None:
         tokenizer = AutoTokenizer.from_pretrained(
             "software-mansion/react-native-executorch-phi-4-mini"