From 3d16f647d10c2a11080d4a7f16ab615727e11004 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 22:57:59 +0200
Subject: [PATCH 01/29] Update bruteforce test:

- Compare tokenizer vocab tokens.
- Bruteforce byte token generator.
- Find minimal mismatched substring.
---
 tests/test-tokenizer-random.py | 162 +++++++++++++++++++++++++++------
 1 file changed, 136 insertions(+), 26 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 9ebe6c89185a3..c17a1cfbd85a7 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -116,9 +116,25 @@ def detokenize(self, ids: list[int], remove_special: bool = False, unparse_speci
             num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
 
+    def get_vocab(self, detokenize: bool = False) -> list[str]:
+        """Return one text per token id: detokenized if `detokenize`, else the stored token text."""
+        vocab: list[str] = []
+        for token_id in range(self.lib.llama_n_vocab(self.model)):  # 'token_id': do not shadow builtin id()
+            if detokenize:
+                text = self.detokenize([token_id], remove_special=False, unparse_special=True)
+            else:
+                raw = self.lib.llama_token_get_text(self.model, token_id)
+                raw = self.ffi.string(raw)
+                text = str(raw, encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
+            vocab.append(text)
+        return vocab
+
 
 class Tokenizer:
 
+    def get_vocab(self, detokenize: bool = False) -> list[str]:
+        raise NotImplementedError  # interface stub: concrete tokenizer subclasses override this
+
     def encode(self, text: str) -> list[int]:
         raise NotImplementedError
 
@@ -129,7 +145,7 @@ def decode(self, ids: list[int]) -> str:
 class TokenizerGroundtruth (Tokenizer):
 
     def __init__(self, dir_tokenizer: str):
-        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer, trust_remote_code=False)
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3
@@ -138,15 +154,24 @@ def __init__(self, dir_tokenizer: str):
         self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
         self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
         # build vocab
-        tokens = list(self.model.get_vocab().values())
-        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
-        self.vocab = list(sorted(self.vocab))
+        self.vocab = self.get_vocab(detokenize=True)
         # tokens and lists
-        self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens   = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
+        self.special_tokens = [self.vocab[i] for i in sorted(self.model.all_special_ids)]
+        self.added_tokens   = [self.vocab[i] for i in sorted(self.model.added_tokens_encoder.values())]
         self.bos_token = self.model.bos_token
         self.eos_token = self.model.eos_token
 
+    def get_vocab(self, detokenize: bool = False) -> list[str]:
+        """Return token texts indexed by id; ids without a token stay None when not detokenizing."""
+        token_to_id = self.model.get_vocab()  # fetched once and reused for both the size and the fill
+        size = max(token_to_id.values()) + 1
+        if detokenize:
+            return self.model.batch_decode(list(range(size)), skip_special_tokens=False)
+        vocab = [None] * size
+        for text, token_id in token_to_id.items():
+            vocab[token_id] = text
+        return vocab
+
     def encode(self, text: str) -> list[int]:
         return self.model.encode(text, add_special_tokens=True)
 
@@ -163,6 +188,9 @@ def __init__(self, vocab_file: str):
             self.libllama = LibLlama()
         self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
 
+    def get_vocab(self, detokenize: bool = False) -> list[str]:
+        return self.model.get_vocab(detokenize)  # delegate to the wrapped LibLlamaModel
+
     def encode(self, text: str) -> list[int]:
         return self.model.tokenize(text, add_special=True, parse_special=True)
 
@@ -253,6 +281,23 @@ def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
     yield from tokenizer.vocab
 
 
+def generator_byte_tokens() -> Iterator[str]:
+    """Yield each byte value 0..255 in common textual spellings, once per wrapper pair."""
+    # numeric spellings in emission order: decimal, bare hex, then 'x'- and '0x'-prefixed hex
+    number_formats = [
+        "{}", "{:x}", "{:X}",
+        "x{:x}", "x{:X}", "x{:02x}", "x{:02X}",
+        "0x{:x}", "0x{:X}", "0x{:02x}", "0x{:02X}",
+    ]
+    for opening, closing in ["<>", "[]", "()", ("\\", "")]:
+        for number_format in number_formats:
+            for value in range(256):
+                yield opening + number_format.format(value) + closing
+        # last spelling of each pair: the character with that code itself
+        for value in range(256):
+            yield opening + chr(value) + closing
+
+
 def generator_ascii_lr_strip() -> Iterator[str]:
     WHITESPACES = ["", " ", "  "]
     CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
@@ -275,10 +320,11 @@ def generator_apostrophe() -> Iterator[str]:
                     yield char1 + lstrip + "'" + rstrip + char2
                     yield char1 + char2 + lstrip + "'" + rstrip + "z"
                     yield "a" + lstrip + "'" + rstrip + char1 + char2
+                    yield "a" + lstrip + "'" + char1 + char2 + rstrip + "z"
 
 
 def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
-    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
+    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t", "        "]
     all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
     for token in all_tokens:
         for lstrip in WHITESPACES:
@@ -436,6 +482,7 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
     t_start = time.perf_counter()
     encode_errors = 0
     decode_errors = 0
+    total_tests = 0
     MAX_ERRORS = 10
 
     logger.info("%s: %s" % (generator.__qualname__, "ini"))
@@ -455,21 +502,44 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
         t_encode2 += t2 - t1
         t_decode1 += t3 - t2
         t_decode2 += t4 - t3
-        if encode_errors < MAX_ERRORS and ids1 != ids2:
-            i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
+        # compare
+        encode_ok = ids1 == ids2
+        decode_ok = check_detokenizer(text, text1, text2)
+        encode_errors += not encode_ok
+        decode_errors += not decode_ok
+        total_tests += 1
+        if (encode_errors < MAX_ERRORS and not encode_ok) or (decode_errors < MAX_ERRORS and not decode_ok):
+            def _compare(text: str):
+                ids1  = tokenizer1.encode(text)
+                ids2  = tokenizer2.encode(text)
+                text1 = tokenizer1.decode(ids1)
+                text2 = tokenizer2.decode(ids1)
+                encode_ok = ids1 == ids2
+                decode_ok = check_detokenizer(text, text1, text2)
+                ok = encode_ok and decode_ok
+                return ok, ids1, ids2, text1, text2
+            a, b = 0, len(text)
+            for step in [64, 32, 16, 8, 4, 2, 1]:
+                while a < b:
+                    t = max(a, b - step)
+                    if _compare(text[a : t])[0]:
+                        break
+                    b = t
+            for step in [64, 32, 16, 8, 4, 2, 1]:
+                while a < b:
+                    t = min(a + step, b)
+                    if _compare(text[t : b])[0]:
+                        break
+                    a = t
+            ok, ids1, ids2, text1, text2 = _compare(text[a : b])
+            assert a <= b and not ok
+            logger.error(" Text:" + repr(text[a : b]))
+            logger.error("  " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text[a : b]))
             logger.error(" Expected: " + str(ids1))
             logger.error("   Result: " + str(ids2))
-            encode_errors += 1
+            logger.error(" Expected: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text1))
+            logger.error("   Result: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text2))
             logger.error(f" {encode_errors=}")
-        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
-            i = find_first_mismatch(text1, text2)
-            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
-            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
-            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
-            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
-            decode_errors += 1
             logger.error(f" {decode_errors=}")
         if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
             logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
@@ -480,6 +550,43 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
     logger.info(f"{generator.__qualname__}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
 
 
+def compare_vocabs(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp):
+    """Log each token id whose text differs between both vocabs (raw pass first, then detokenized pass)."""
+    MAX_PRINT_ERRORS = 10
+
+    logger.info("compare_vocabs: ini")
+
+    t_start = time.perf_counter()
+
+    for detokenize in (False, True):
+        vocab1 = tokenizer1.get_vocab(detokenize)
+        vocab2 = tokenizer2.get_vocab(detokenize)
+        if vocab1 != vocab2:
+            num_errors = 0
+            for i in range(max(len(vocab1), len(vocab2))):
+                text1 = vocab1[i] if i < len(vocab1) else None  # None when the id is past this vocab's end
+                text2 = vocab2[i] if i < len(vocab2) else None
+                if text1 != text2:
+                    # is "[UNUSED_TOKEN_" and "[PAD" valid for all models ?  #TODO: use toktypes
+                    if text1 is not None:
+                        text1 = text1.replace("[UNUSED_TOKEN_", "[PAD")
+                    if text2 is not None:
+                        text2 = text2.replace("[UNUSED_TOKEN_", "[PAD")
+                    if text1 is None and (text2 or "").startswith('[PAD'):
+                        text2 = None
+                    if text2 is None and (text1 or "").startswith('[PAD'):
+                        text1 = None
+                if text1 != text2:
+                    num_errors += 1
+                    if num_errors <= MAX_PRINT_ERRORS:  # counter is already incremented: '<' logged only 9 of the 10
+                        logger.error(f" {detokenize=} id={i} expected={repr(text1)} result={repr(text2)}")
+            if num_errors:
+                logger.error(f" {num_errors=}")
+
+    t_total = time.perf_counter() - t_start
+    logger.info(f"compare_vocabs: end,  {t_total=:.3f}")
+
+
 def main(argv: list[str] | None = None):
     parser = argparse.ArgumentParser()
     parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
@@ -493,18 +600,21 @@ def main(argv: list[str] | None = None):
     tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
     tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
 
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_vocabs(tokenizer1, tokenizer2)
+
+    compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_byte_tokens())
     compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
     compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
     compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
     compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
     compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
 
     tokenizer2.model.free()
 

From 5ceab90b4d69dcbea2e006e77af5c8bd8eb15645 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 23:04:23 +0200
Subject: [PATCH 02/29] Store all unicode codepoint categories

---
 scripts/gen-unicode-data.py | 102 +++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 54 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index 2d9bde01c3ca7..55ac0af12c29f 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -49,52 +49,41 @@ def unicode_data_iter():
         yield (cpt, cpt_lower, cpt_upper, categ, bidir)
 
 
-# see definition in unicode.h
-CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
-CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
-CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
-CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
-CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
-CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
-CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
-CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
-
-UNICODE_CATEGORY_TO_FLAG = {
-    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
-    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
-    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
-    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
-    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrrogate
-    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
-    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
-    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
-    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
-    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
-    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
-    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
-    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
-    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
-    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
-    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
-    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
-    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
-    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
-    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
-    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
-    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
-    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
-    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
-    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
-    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
-    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
-    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
-    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
-    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
-    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
+UNICODE_CATEGORY_TO_INDEX = {
+    "Cn":  0,  # \p{Cn} Undefined
+    "Cc":  1,  # \p{Cc} Control
+    "Cf":  2,  # \p{Cf} Format
+    "Co":  3,  # \p{Co} Private Use
+    "Cs":  4,  # \p{Cs} Surrogate
+    "Ll":  5,  # \p{Ll} Lowercase Letter
+    "Lm":  6,  # \p{Lm} Modifier Letter
+    "Lo":  7,  # \p{Lo} Other Letter
+    "Lt":  8,  # \p{Lt} Titlecase Letter
+    "Lu":  9,  # \p{Lu} Uppercase Letter
+    "Mc": 10,  # \p{Mc} Spacing Mark
+    "Me": 11,  # \p{Me} Enclosing Mark
+    "Mn": 12,  # \p{Mn} Nonspacing Mark
+    "Nd": 13,  # \p{Nd} Decimal Number
+    "Nl": 14,  # \p{Nl} Letter Number
+    "No": 15,  # \p{No} Other Number
+    "Pc": 16,  # \p{Pc} Connector Punctuation
+    "Pd": 17,  # \p{Pd} Dash Punctuation
+    "Pe": 18,  # \p{Pe} Close Punctuation
+    "Pf": 19,  # \p{Pf} Final Punctuation
+    "Pi": 20,  # \p{Pi} Initial Punctuation
+    "Po": 21,  # \p{Po} Other Punctuation
+    "Ps": 22,  # \p{Ps} Open Punctuation
+    "Sc": 23,  # \p{Sc} Currency Symbol
+    "Sk": 24,  # \p{Sk} Modifier Symbol
+    "Sm": 25,  # \p{Sm} Math Symbol
+    "So": 26,  # \p{So} Other Symbol
+    "Zl": 27,  # \p{Zl} Line Separator
+    "Zp": 28,  # \p{Zp} Paragraph Separator
+    "Zs": 29,  # \p{Zs} Space Separator
 }
 
 
-codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
+codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS  # Undefined
 table_whitespace = []
 table_lowercase = []
 table_uppercase = []
@@ -105,7 +94,7 @@ def unicode_data_iter():
     char = chr(cpt)
 
     # codepoint category flags
-    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
+    codepoint_categs[cpt] = UNICODE_CATEGORY_TO_INDEX[categ]
 
     # lowercase conversion
     if cpt_lower:
@@ -134,12 +123,17 @@ def unicode_data_iter():
 table_nfd.sort()
 
 
-# group ranges with same flags
-ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])]  # start, flags
-for codepoint, flags in enumerate(codepoint_flags):
-    if flags != ranges_flags[-1][1]:
-        ranges_flags.append((codepoint, flags))
-ranges_flags.append((MAX_CODEPOINTS, 0x0000))
+# run length encoding: one 16-bit value per run = category index (low 5 bits) | (run length - 1) << 5
+assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)  # every category index must fit in 5 bits
+codepoint_categs_runs = [codepoint_categs[0]]  # 5 bits categ + 11 bits length
+for cpt, categ in enumerate(codepoint_categs[1:], 1):
+    prev = codepoint_categs_runs[-1]
+    if prev <= (0xFFFF - 32) and (prev & 31) == categ:  # same category and the length field can still grow
+        codepoint_categs_runs[-1] += 32  # increment run length
+    else:
+        codepoint_categs_runs.append(categ)  # new run value
+    assert (codepoint_categs_runs[-1] < 0xFFFF)
+assert (MAX_CODEPOINTS == sum((rle >> 5) + 1 for rle in codepoint_categs_runs))  # decoded run lengths must add up to MAX_CODEPOINTS
 
 
 # group ranges with same nfd
@@ -153,7 +147,7 @@ def unicode_data_iter():
 
 
 # Generate 'unicode-data.cpp':
-#   python ./scripts//gen-unicode-data.py > unicode-data.cpp
+#   python ./scripts//gen-unicode-data.py > ./src/unicode-data.cpp
 
 def out(line=""):
     print(line, end='\n')  # noqa
@@ -170,9 +164,9 @@ def out(line=""):
 #include <unordered_set>
 """)
 
-out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
-for codepoint, flags in ranges_flags:
-    out("{0x%06X, 0x%04X}," % (codepoint, flags))
+out("const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length encoding, 5 bits categ + 11 bits length")
+for rle in codepoint_categs_runs:
+    out("0x%04X," % rle)
 out("};\n")
 
 out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")

From ba4bbbd1ad7692d60f38fb9a65cde7ec6f86158f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 23:09:33 +0200
Subject: [PATCH 03/29] Reimplement 'codepoint_flags' as 'codepoint_categ'

---
 src/unicode.h | 144 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 104 insertions(+), 40 deletions(-)

diff --git a/src/unicode.h b/src/unicode.h
index 30b07ba7fa493..f9f4fcc8cc7a0 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -4,46 +4,110 @@
 #include <string>
 #include <vector>
 
-struct codepoint_flags {
-    enum {
-        UNDEFINED       = 0x0001,
-        NUMBER          = 0x0002,  // regex: \p{N}
-        LETTER          = 0x0004,  // regex: \p{L}
-        SEPARATOR       = 0x0008,  // regex: \p{Z}
-        ACCENT_MARK     = 0x0010,  // regex: \p{M}
-        PUNCTUATION     = 0x0020,  // regex: \p{P}
-        SYMBOL          = 0x0040,  // regex: \p{S}
-        CONTROL         = 0x0080,  // regex: \p{C}
-        MASK_CATEGORIES = 0x00FF,
+struct codepoint_categ {
+    enum _category : uint16_t {
+        UNDEF = 0,   // \p{Cn} Undefined
+        C = 1 << 0,  // \p{C}  Control
+        L = 1 << 1,  // \p{L}  Letter
+        M = 1 << 2,  // \p{M}  Mark
+        N = 1 << 3,  // \p{N}  Number
+        P = 1 << 4,  // \p{P}  Punctuation
+        S = 1 << 5,  // \p{S}  Symbol
+        Z = 1 << 6,  // \p{Z}  Separator
+        MASK = (1 << 7) - 1  // 7 bits
     };
 
-    // codepoint type
-    uint16_t is_undefined   : 1;
-    uint16_t is_number      : 1;  // regex: \p{N}
-    uint16_t is_letter      : 1;  // regex: \p{L}
-    uint16_t is_separator   : 1;  // regex: \p{Z}
-    uint16_t is_accent_mark : 1;  // regex: \p{M}
-    uint16_t is_punctuation : 1;  // regex: \p{P}
-    uint16_t is_symbol      : 1;  // regex: \p{S}
-    uint16_t is_control     : 1;  // regex: \p{C}
-    // helper flags
-    uint16_t is_whitespace  : 1;  // regex: \s
-    uint16_t is_lowercase   : 1;
-    uint16_t is_uppercase   : 1;
-    uint16_t is_nfd         : 1;
-
-    // decode from uint16
-    inline codepoint_flags(const uint16_t flags=0) {
-        *reinterpret_cast<uint16_t*>(this) = flags;
-    }
-
-    inline uint16_t as_uint() const {
-        return *reinterpret_cast<const uint16_t*>(this);
-    }
-
-    inline uint16_t category_flag() const {
-        return this->as_uint() & MASK_CATEGORIES;
-    }
+    enum _subcategory : uint16_t {
+        Cc = C | (1 << 7),  // \p{Cc} Control
+        Cf = C | (2 << 7),  // \p{Cf} Format
+        Co = C | (3 << 7),  // \p{Co} Private Use
+        Cs = C | (4 << 7),  // \p{Cs} Surrogate
+        Ll = L | (1 << 7),  // \p{Ll} Lowercase Letter
+        Lm = L | (2 << 7),  // \p{Lm} Modifier Letter
+        Lo = L | (3 << 7),  // \p{Lo} Other Letter
+        Lt = L | (4 << 7),  // \p{Lt} Titlecase Letter
+        Lu = L | (5 << 7),  // \p{Lu} Uppercase Letter
+        Mc = M | (1 << 7),  // \p{Mc} Spacing Mark
+        Me = M | (2 << 7),  // \p{Me} Enclosing Mark
+        Mn = M | (3 << 7),  // \p{Mn} Nonspacing Mark
+        Nd = N | (1 << 7),  // \p{Nd} Decimal Number
+        Nl = N | (2 << 7),  // \p{Nl} Letter Number
+        No = N | (3 << 7),  // \p{No} Other Number
+        Pc = P | (1 << 7),  // \p{Pc} Connector Punctuation
+        Pd = P | (2 << 7),  // \p{Pd} Dash Punctuation
+        Pe = P | (3 << 7),  // \p{Pe} Close Punctuation
+        Pf = P | (4 << 7),  // \p{Pf} Final Punctuation
+        Pi = P | (5 << 7),  // \p{Pi} Initial Punctuation
+        Po = P | (6 << 7),  // \p{Po} Other Punctuation
+        Ps = P | (7 << 7),  // \p{Ps} Open Punctuation
+        Sc = S | (1 << 7),  // \p{Sc} Currency Symbol
+        Sk = S | (2 << 7),  // \p{Sk} Modifier Symbol
+        Sm = S | (3 << 7),  // \p{Sm} Math Symbol
+        So = S | (4 << 7),  // \p{So} Other Symbol
+        Zl = Z | (1 << 7),  // \p{Zl} Line Separator
+        Zp = Z | (2 << 7),  // \p{Zp} Paragraph Separator
+        Zs = Z | (3 << 7),  // \p{Zs} Space Separator
+        SUBMASK = (1 << 10) - 1  // 7+3 bits
+    };
+
+    enum _flags : uint16_t {
+        WHITESPACE = (1 << 10),  // regex: \s
+        LOWERCASE  = (1 << 11),
+        UPPERCASE  = (1 << 12),
+        //Norm NFD/NFC  = ...,
+    };
+
+    inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
+
+    inline uint8_t get_category() const { return encoded & MASK; }
+    inline uint16_t get_subcategory() const { return encoded & SUBMASK; }  // SUBMASK is 10 bits: a uint8_t return dropped bits 8-9 (e.g. Cf -> C)
+
+    inline bool is_undefined() const { return !encoded; }
+    inline bool is_defined() const { return encoded; }
+
+    inline auto is_whitespace() const { return encoded & WHITESPACE; }
+    inline auto is_lowercase()  const { return encoded & LOWERCASE; }
+    inline auto is_uppercase()  const { return encoded & UPPERCASE; }
+
+    inline auto is_C() const { return encoded & C; }
+    inline auto is_L() const { return encoded & L; }
+    inline auto is_M() const { return encoded & M; }
+    inline auto is_N() const { return encoded & N; }
+    inline auto is_P() const { return encoded & P; }
+    inline auto is_S() const { return encoded & S; }
+    inline auto is_Z() const { return encoded & Z; }
+
+    inline auto is_Cc() const { return (encoded & SUBMASK) == Cc; }
+    inline auto is_Cf() const { return (encoded & SUBMASK) == Cf; }
+    inline auto is_Co() const { return (encoded & SUBMASK) == Co; }
+    inline auto is_Cs() const { return (encoded & SUBMASK) == Cs; }
+    inline auto is_Ll() const { return (encoded & SUBMASK) == Ll; }
+    inline auto is_Lm() const { return (encoded & SUBMASK) == Lm; }
+    inline auto is_Lo() const { return (encoded & SUBMASK) == Lo; }
+    inline auto is_Lt() const { return (encoded & SUBMASK) == Lt; }
+    inline auto is_Lu() const { return (encoded & SUBMASK) == Lu; }
+    inline auto is_Mc() const { return (encoded & SUBMASK) == Mc; }
+    inline auto is_Me() const { return (encoded & SUBMASK) == Me; }
+    inline auto is_Mn() const { return (encoded & SUBMASK) == Mn; }
+    inline auto is_Nd() const { return (encoded & SUBMASK) == Nd; }
+    inline auto is_Nl() const { return (encoded & SUBMASK) == Nl; }
+    inline auto is_No() const { return (encoded & SUBMASK) == No; }
+    inline auto is_Pc() const { return (encoded & SUBMASK) == Pc; }
+    inline auto is_Pd() const { return (encoded & SUBMASK) == Pd; }
+    inline auto is_Pe() const { return (encoded & SUBMASK) == Pe; }
+    inline auto is_Pf() const { return (encoded & SUBMASK) == Pf; }
+    inline auto is_Pi() const { return (encoded & SUBMASK) == Pi; }
+    inline auto is_Po() const { return (encoded & SUBMASK) == Po; }
+    inline auto is_Ps() const { return (encoded & SUBMASK) == Ps; }
+    inline auto is_Sc() const { return (encoded & SUBMASK) == Sc; }
+    inline auto is_Sk() const { return (encoded & SUBMASK) == Sk; }
+    inline auto is_Sm() const { return (encoded & SUBMASK) == Sm; }
+    inline auto is_So() const { return (encoded & SUBMASK) == So; }
+    inline auto is_Zl() const { return (encoded & SUBMASK) == Zl; }
+    inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; }
+    inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; }
+
+    uint16_t encoded;
 };
 
 
@@ -53,8 +117,8 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp);
-codepoint_flags unicode_cpt_flags(const std::string & utf8);
+codepoint_categ unicode_cpt_category(const uint32_t cp);
+codepoint_categ unicode_cpt_category(const std::string & utf8);
 
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);

From 8f9f05bf6de7177d90107fde3ea80cbabdecc6ed Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 23:12:08 +0200
Subject: [PATCH 04/29] Update unicode data

---
 src/unicode-data.cpp | 6793 ++++++++++++++++++++++++++++--------------
 src/unicode-data.h   |    2 +-
 2 files changed, 4520 insertions(+), 2275 deletions(-)

diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 02bdf782380fe..4a0c0547c7d03 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -7,2280 +7,4524 @@
 #include <unordered_map>
 #include <unordered_set>
 
-const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
-{0x000000, 0x0080},
-{0x000020, 0x0008},
-{0x000021, 0x0020},
-{0x000024, 0x0040},
-{0x000025, 0x0020},
-{0x00002B, 0x0040},
-{0x00002C, 0x0020},
-{0x000030, 0x0002},
-{0x00003A, 0x0020},
-{0x00003C, 0x0040},
-{0x00003F, 0x0020},
-{0x000041, 0x0004},
-{0x00005B, 0x0020},
-{0x00005E, 0x0040},
-{0x00005F, 0x0020},
-{0x000060, 0x0040},
-{0x000061, 0x0004},
-{0x00007B, 0x0020},
-{0x00007C, 0x0040},
-{0x00007D, 0x0020},
-{0x00007E, 0x0040},
-{0x00007F, 0x0080},
-{0x0000A0, 0x0008},
-{0x0000A1, 0x0020},
-{0x0000A2, 0x0040},
-{0x0000A7, 0x0020},
-{0x0000A8, 0x0040},
-{0x0000AA, 0x0004},
-{0x0000AB, 0x0020},
-{0x0000AC, 0x0040},
-{0x0000AD, 0x0080},
-{0x0000AE, 0x0040},
-{0x0000B2, 0x0002},
-{0x0000B4, 0x0040},
-{0x0000B5, 0x0004},
-{0x0000B6, 0x0020},
-{0x0000B8, 0x0040},
-{0x0000B9, 0x0002},
-{0x0000BA, 0x0004},
-{0x0000BB, 0x0020},
-{0x0000BC, 0x0002},
-{0x0000BF, 0x0020},
-{0x0000C0, 0x0004},
-{0x0000D7, 0x0040},
-{0x0000D8, 0x0004},
-{0x0000F7, 0x0040},
-{0x0000F8, 0x0004},
-{0x0002C2, 0x0040},
-{0x0002C6, 0x0004},
-{0x0002D2, 0x0040},
-{0x0002E0, 0x0004},
-{0x0002E5, 0x0040},
-{0x0002EC, 0x0004},
-{0x0002ED, 0x0040},
-{0x0002EE, 0x0004},
-{0x0002EF, 0x0040},
-{0x000300, 0x0010},
-{0x000370, 0x0004},
-{0x000375, 0x0040},
-{0x000376, 0x0004},
-{0x000378, 0x0001},
-{0x00037A, 0x0004},
-{0x00037E, 0x0020},
-{0x00037F, 0x0004},
-{0x000380, 0x0001},
-{0x000384, 0x0040},
-{0x000386, 0x0004},
-{0x000387, 0x0020},
-{0x000388, 0x0004},
-{0x00038B, 0x0001},
-{0x00038C, 0x0004},
-{0x00038D, 0x0001},
-{0x00038E, 0x0004},
-{0x0003A2, 0x0001},
-{0x0003A3, 0x0004},
-{0x0003F6, 0x0040},
-{0x0003F7, 0x0004},
-{0x000482, 0x0040},
-{0x000483, 0x0010},
-{0x00048A, 0x0004},
-{0x000530, 0x0001},
-{0x000531, 0x0004},
-{0x000557, 0x0001},
-{0x000559, 0x0004},
-{0x00055A, 0x0020},
-{0x000560, 0x0004},
-{0x000589, 0x0020},
-{0x00058B, 0x0001},
-{0x00058D, 0x0040},
-{0x000590, 0x0001},
-{0x000591, 0x0010},
-{0x0005BE, 0x0020},
-{0x0005BF, 0x0010},
-{0x0005C0, 0x0020},
-{0x0005C1, 0x0010},
-{0x0005C3, 0x0020},
-{0x0005C4, 0x0010},
-{0x0005C6, 0x0020},
-{0x0005C7, 0x0010},
-{0x0005C8, 0x0001},
-{0x0005D0, 0x0004},
-{0x0005EB, 0x0001},
-{0x0005EF, 0x0004},
-{0x0005F3, 0x0020},
-{0x0005F5, 0x0001},
-{0x000600, 0x0080},
-{0x000606, 0x0040},
-{0x000609, 0x0020},
-{0x00060B, 0x0040},
-{0x00060C, 0x0020},
-{0x00060E, 0x0040},
-{0x000610, 0x0010},
-{0x00061B, 0x0020},
-{0x00061C, 0x0080},
-{0x00061D, 0x0020},
-{0x000620, 0x0004},
-{0x00064B, 0x0010},
-{0x000660, 0x0002},
-{0x00066A, 0x0020},
-{0x00066E, 0x0004},
-{0x000670, 0x0010},
-{0x000671, 0x0004},
-{0x0006D4, 0x0020},
-{0x0006D5, 0x0004},
-{0x0006D6, 0x0010},
-{0x0006DD, 0x0080},
-{0x0006DE, 0x0040},
-{0x0006DF, 0x0010},
-{0x0006E5, 0x0004},
-{0x0006E7, 0x0010},
-{0x0006E9, 0x0040},
-{0x0006EA, 0x0010},
-{0x0006EE, 0x0004},
-{0x0006F0, 0x0002},
-{0x0006FA, 0x0004},
-{0x0006FD, 0x0040},
-{0x0006FF, 0x0004},
-{0x000700, 0x0020},
-{0x00070E, 0x0001},
-{0x00070F, 0x0080},
-{0x000710, 0x0004},
-{0x000711, 0x0010},
-{0x000712, 0x0004},
-{0x000730, 0x0010},
-{0x00074B, 0x0001},
-{0x00074D, 0x0004},
-{0x0007A6, 0x0010},
-{0x0007B1, 0x0004},
-{0x0007B2, 0x0001},
-{0x0007C0, 0x0002},
-{0x0007CA, 0x0004},
-{0x0007EB, 0x0010},
-{0x0007F4, 0x0004},
-{0x0007F6, 0x0040},
-{0x0007F7, 0x0020},
-{0x0007FA, 0x0004},
-{0x0007FB, 0x0001},
-{0x0007FD, 0x0010},
-{0x0007FE, 0x0040},
-{0x000800, 0x0004},
-{0x000816, 0x0010},
-{0x00081A, 0x0004},
-{0x00081B, 0x0010},
-{0x000824, 0x0004},
-{0x000825, 0x0010},
-{0x000828, 0x0004},
-{0x000829, 0x0010},
-{0x00082E, 0x0001},
-{0x000830, 0x0020},
-{0x00083F, 0x0001},
-{0x000840, 0x0004},
-{0x000859, 0x0010},
-{0x00085C, 0x0001},
-{0x00085E, 0x0020},
-{0x00085F, 0x0001},
-{0x000860, 0x0004},
-{0x00086B, 0x0001},
-{0x000870, 0x0004},
-{0x000888, 0x0040},
-{0x000889, 0x0004},
-{0x00088F, 0x0001},
-{0x000890, 0x0080},
-{0x000892, 0x0001},
-{0x000898, 0x0010},
-{0x0008A0, 0x0004},
-{0x0008CA, 0x0010},
-{0x0008E2, 0x0080},
-{0x0008E3, 0x0010},
-{0x000904, 0x0004},
-{0x00093A, 0x0010},
-{0x00093D, 0x0004},
-{0x00093E, 0x0010},
-{0x000950, 0x0004},
-{0x000951, 0x0010},
-{0x000958, 0x0004},
-{0x000962, 0x0010},
-{0x000964, 0x0020},
-{0x000966, 0x0002},
-{0x000970, 0x0020},
-{0x000971, 0x0004},
-{0x000981, 0x0010},
-{0x000984, 0x0001},
-{0x000985, 0x0004},
-{0x00098D, 0x0001},
-{0x00098F, 0x0004},
-{0x000991, 0x0001},
-{0x000993, 0x0004},
-{0x0009A9, 0x0001},
-{0x0009AA, 0x0004},
-{0x0009B1, 0x0001},
-{0x0009B2, 0x0004},
-{0x0009B3, 0x0001},
-{0x0009B6, 0x0004},
-{0x0009BA, 0x0001},
-{0x0009BC, 0x0010},
-{0x0009BD, 0x0004},
-{0x0009BE, 0x0010},
-{0x0009C5, 0x0001},
-{0x0009C7, 0x0010},
-{0x0009C9, 0x0001},
-{0x0009CB, 0x0010},
-{0x0009CE, 0x0004},
-{0x0009CF, 0x0001},
-{0x0009D7, 0x0010},
-{0x0009D8, 0x0001},
-{0x0009DC, 0x0004},
-{0x0009DE, 0x0001},
-{0x0009DF, 0x0004},
-{0x0009E2, 0x0010},
-{0x0009E4, 0x0001},
-{0x0009E6, 0x0002},
-{0x0009F0, 0x0004},
-{0x0009F2, 0x0040},
-{0x0009F4, 0x0002},
-{0x0009FA, 0x0040},
-{0x0009FC, 0x0004},
-{0x0009FD, 0x0020},
-{0x0009FE, 0x0010},
-{0x0009FF, 0x0001},
-{0x000A01, 0x0010},
-{0x000A04, 0x0001},
-{0x000A05, 0x0004},
-{0x000A0B, 0x0001},
-{0x000A0F, 0x0004},
-{0x000A11, 0x0001},
-{0x000A13, 0x0004},
-{0x000A29, 0x0001},
-{0x000A2A, 0x0004},
-{0x000A31, 0x0001},
-{0x000A32, 0x0004},
-{0x000A34, 0x0001},
-{0x000A35, 0x0004},
-{0x000A37, 0x0001},
-{0x000A38, 0x0004},
-{0x000A3A, 0x0001},
-{0x000A3C, 0x0010},
-{0x000A3D, 0x0001},
-{0x000A3E, 0x0010},
-{0x000A43, 0x0001},
-{0x000A47, 0x0010},
-{0x000A49, 0x0001},
-{0x000A4B, 0x0010},
-{0x000A4E, 0x0001},
-{0x000A51, 0x0010},
-{0x000A52, 0x0001},
-{0x000A59, 0x0004},
-{0x000A5D, 0x0001},
-{0x000A5E, 0x0004},
-{0x000A5F, 0x0001},
-{0x000A66, 0x0002},
-{0x000A70, 0x0010},
-{0x000A72, 0x0004},
-{0x000A75, 0x0010},
-{0x000A76, 0x0020},
-{0x000A77, 0x0001},
-{0x000A81, 0x0010},
-{0x000A84, 0x0001},
-{0x000A85, 0x0004},
-{0x000A8E, 0x0001},
-{0x000A8F, 0x0004},
-{0x000A92, 0x0001},
-{0x000A93, 0x0004},
-{0x000AA9, 0x0001},
-{0x000AAA, 0x0004},
-{0x000AB1, 0x0001},
-{0x000AB2, 0x0004},
-{0x000AB4, 0x0001},
-{0x000AB5, 0x0004},
-{0x000ABA, 0x0001},
-{0x000ABC, 0x0010},
-{0x000ABD, 0x0004},
-{0x000ABE, 0x0010},
-{0x000AC6, 0x0001},
-{0x000AC7, 0x0010},
-{0x000ACA, 0x0001},
-{0x000ACB, 0x0010},
-{0x000ACE, 0x0001},
-{0x000AD0, 0x0004},
-{0x000AD1, 0x0001},
-{0x000AE0, 0x0004},
-{0x000AE2, 0x0010},
-{0x000AE4, 0x0001},
-{0x000AE6, 0x0002},
-{0x000AF0, 0x0020},
-{0x000AF1, 0x0040},
-{0x000AF2, 0x0001},
-{0x000AF9, 0x0004},
-{0x000AFA, 0x0010},
-{0x000B00, 0x0001},
-{0x000B01, 0x0010},
-{0x000B04, 0x0001},
-{0x000B05, 0x0004},
-{0x000B0D, 0x0001},
-{0x000B0F, 0x0004},
-{0x000B11, 0x0001},
-{0x000B13, 0x0004},
-{0x000B29, 0x0001},
-{0x000B2A, 0x0004},
-{0x000B31, 0x0001},
-{0x000B32, 0x0004},
-{0x000B34, 0x0001},
-{0x000B35, 0x0004},
-{0x000B3A, 0x0001},
-{0x000B3C, 0x0010},
-{0x000B3D, 0x0004},
-{0x000B3E, 0x0010},
-{0x000B45, 0x0001},
-{0x000B47, 0x0010},
-{0x000B49, 0x0001},
-{0x000B4B, 0x0010},
-{0x000B4E, 0x0001},
-{0x000B55, 0x0010},
-{0x000B58, 0x0001},
-{0x000B5C, 0x0004},
-{0x000B5E, 0x0001},
-{0x000B5F, 0x0004},
-{0x000B62, 0x0010},
-{0x000B64, 0x0001},
-{0x000B66, 0x0002},
-{0x000B70, 0x0040},
-{0x000B71, 0x0004},
-{0x000B72, 0x0002},
-{0x000B78, 0x0001},
-{0x000B82, 0x0010},
-{0x000B83, 0x0004},
-{0x000B84, 0x0001},
-{0x000B85, 0x0004},
-{0x000B8B, 0x0001},
-{0x000B8E, 0x0004},
-{0x000B91, 0x0001},
-{0x000B92, 0x0004},
-{0x000B96, 0x0001},
-{0x000B99, 0x0004},
-{0x000B9B, 0x0001},
-{0x000B9C, 0x0004},
-{0x000B9D, 0x0001},
-{0x000B9E, 0x0004},
-{0x000BA0, 0x0001},
-{0x000BA3, 0x0004},
-{0x000BA5, 0x0001},
-{0x000BA8, 0x0004},
-{0x000BAB, 0x0001},
-{0x000BAE, 0x0004},
-{0x000BBA, 0x0001},
-{0x000BBE, 0x0010},
-{0x000BC3, 0x0001},
-{0x000BC6, 0x0010},
-{0x000BC9, 0x0001},
-{0x000BCA, 0x0010},
-{0x000BCE, 0x0001},
-{0x000BD0, 0x0004},
-{0x000BD1, 0x0001},
-{0x000BD7, 0x0010},
-{0x000BD8, 0x0001},
-{0x000BE6, 0x0002},
-{0x000BF3, 0x0040},
-{0x000BFB, 0x0001},
-{0x000C00, 0x0010},
-{0x000C05, 0x0004},
-{0x000C0D, 0x0001},
-{0x000C0E, 0x0004},
-{0x000C11, 0x0001},
-{0x000C12, 0x0004},
-{0x000C29, 0x0001},
-{0x000C2A, 0x0004},
-{0x000C3A, 0x0001},
-{0x000C3C, 0x0010},
-{0x000C3D, 0x0004},
-{0x000C3E, 0x0010},
-{0x000C45, 0x0001},
-{0x000C46, 0x0010},
-{0x000C49, 0x0001},
-{0x000C4A, 0x0010},
-{0x000C4E, 0x0001},
-{0x000C55, 0x0010},
-{0x000C57, 0x0001},
-{0x000C58, 0x0004},
-{0x000C5B, 0x0001},
-{0x000C5D, 0x0004},
-{0x000C5E, 0x0001},
-{0x000C60, 0x0004},
-{0x000C62, 0x0010},
-{0x000C64, 0x0001},
-{0x000C66, 0x0002},
-{0x000C70, 0x0001},
-{0x000C77, 0x0020},
-{0x000C78, 0x0002},
-{0x000C7F, 0x0040},
-{0x000C80, 0x0004},
-{0x000C81, 0x0010},
-{0x000C84, 0x0020},
-{0x000C85, 0x0004},
-{0x000C8D, 0x0001},
-{0x000C8E, 0x0004},
-{0x000C91, 0x0001},
-{0x000C92, 0x0004},
-{0x000CA9, 0x0001},
-{0x000CAA, 0x0004},
-{0x000CB4, 0x0001},
-{0x000CB5, 0x0004},
-{0x000CBA, 0x0001},
-{0x000CBC, 0x0010},
-{0x000CBD, 0x0004},
-{0x000CBE, 0x0010},
-{0x000CC5, 0x0001},
-{0x000CC6, 0x0010},
-{0x000CC9, 0x0001},
-{0x000CCA, 0x0010},
-{0x000CCE, 0x0001},
-{0x000CD5, 0x0010},
-{0x000CD7, 0x0001},
-{0x000CDD, 0x0004},
-{0x000CDF, 0x0001},
-{0x000CE0, 0x0004},
-{0x000CE2, 0x0010},
-{0x000CE4, 0x0001},
-{0x000CE6, 0x0002},
-{0x000CF0, 0x0001},
-{0x000CF1, 0x0004},
-{0x000CF3, 0x0010},
-{0x000CF4, 0x0001},
-{0x000D00, 0x0010},
-{0x000D04, 0x0004},
-{0x000D0D, 0x0001},
-{0x000D0E, 0x0004},
-{0x000D11, 0x0001},
-{0x000D12, 0x0004},
-{0x000D3B, 0x0010},
-{0x000D3D, 0x0004},
-{0x000D3E, 0x0010},
-{0x000D45, 0x0001},
-{0x000D46, 0x0010},
-{0x000D49, 0x0001},
-{0x000D4A, 0x0010},
-{0x000D4E, 0x0004},
-{0x000D4F, 0x0040},
-{0x000D50, 0x0001},
-{0x000D54, 0x0004},
-{0x000D57, 0x0010},
-{0x000D58, 0x0002},
-{0x000D5F, 0x0004},
-{0x000D62, 0x0010},
-{0x000D64, 0x0001},
-{0x000D66, 0x0002},
-{0x000D79, 0x0040},
-{0x000D7A, 0x0004},
-{0x000D80, 0x0001},
-{0x000D81, 0x0010},
-{0x000D84, 0x0001},
-{0x000D85, 0x0004},
-{0x000D97, 0x0001},
-{0x000D9A, 0x0004},
-{0x000DB2, 0x0001},
-{0x000DB3, 0x0004},
-{0x000DBC, 0x0001},
-{0x000DBD, 0x0004},
-{0x000DBE, 0x0001},
-{0x000DC0, 0x0004},
-{0x000DC7, 0x0001},
-{0x000DCA, 0x0010},
-{0x000DCB, 0x0001},
-{0x000DCF, 0x0010},
-{0x000DD5, 0x0001},
-{0x000DD6, 0x0010},
-{0x000DD7, 0x0001},
-{0x000DD8, 0x0010},
-{0x000DE0, 0x0001},
-{0x000DE6, 0x0002},
-{0x000DF0, 0x0001},
-{0x000DF2, 0x0010},
-{0x000DF4, 0x0020},
-{0x000DF5, 0x0001},
-{0x000E01, 0x0004},
-{0x000E31, 0x0010},
-{0x000E32, 0x0004},
-{0x000E34, 0x0010},
-{0x000E3B, 0x0001},
-{0x000E3F, 0x0040},
-{0x000E40, 0x0004},
-{0x000E47, 0x0010},
-{0x000E4F, 0x0020},
-{0x000E50, 0x0002},
-{0x000E5A, 0x0020},
-{0x000E5C, 0x0001},
-{0x000E81, 0x0004},
-{0x000E83, 0x0001},
-{0x000E84, 0x0004},
-{0x000E85, 0x0001},
-{0x000E86, 0x0004},
-{0x000E8B, 0x0001},
-{0x000E8C, 0x0004},
-{0x000EA4, 0x0001},
-{0x000EA5, 0x0004},
-{0x000EA6, 0x0001},
-{0x000EA7, 0x0004},
-{0x000EB1, 0x0010},
-{0x000EB2, 0x0004},
-{0x000EB4, 0x0010},
-{0x000EBD, 0x0004},
-{0x000EBE, 0x0001},
-{0x000EC0, 0x0004},
-{0x000EC5, 0x0001},
-{0x000EC6, 0x0004},
-{0x000EC7, 0x0001},
-{0x000EC8, 0x0010},
-{0x000ECF, 0x0001},
-{0x000ED0, 0x0002},
-{0x000EDA, 0x0001},
-{0x000EDC, 0x0004},
-{0x000EE0, 0x0001},
-{0x000F00, 0x0004},
-{0x000F01, 0x0040},
-{0x000F04, 0x0020},
-{0x000F13, 0x0040},
-{0x000F14, 0x0020},
-{0x000F15, 0x0040},
-{0x000F18, 0x0010},
-{0x000F1A, 0x0040},
-{0x000F20, 0x0002},
-{0x000F34, 0x0040},
-{0x000F35, 0x0010},
-{0x000F36, 0x0040},
-{0x000F37, 0x0010},
-{0x000F38, 0x0040},
-{0x000F39, 0x0010},
-{0x000F3A, 0x0020},
-{0x000F3E, 0x0010},
-{0x000F40, 0x0004},
-{0x000F48, 0x0001},
-{0x000F49, 0x0004},
-{0x000F6D, 0x0001},
-{0x000F71, 0x0010},
-{0x000F85, 0x0020},
-{0x000F86, 0x0010},
-{0x000F88, 0x0004},
-{0x000F8D, 0x0010},
-{0x000F98, 0x0001},
-{0x000F99, 0x0010},
-{0x000FBD, 0x0001},
-{0x000FBE, 0x0040},
-{0x000FC6, 0x0010},
-{0x000FC7, 0x0040},
-{0x000FCD, 0x0001},
-{0x000FCE, 0x0040},
-{0x000FD0, 0x0020},
-{0x000FD5, 0x0040},
-{0x000FD9, 0x0020},
-{0x000FDB, 0x0001},
-{0x001000, 0x0004},
-{0x00102B, 0x0010},
-{0x00103F, 0x0004},
-{0x001040, 0x0002},
-{0x00104A, 0x0020},
-{0x001050, 0x0004},
-{0x001056, 0x0010},
-{0x00105A, 0x0004},
-{0x00105E, 0x0010},
-{0x001061, 0x0004},
-{0x001062, 0x0010},
-{0x001065, 0x0004},
-{0x001067, 0x0010},
-{0x00106E, 0x0004},
-{0x001071, 0x0010},
-{0x001075, 0x0004},
-{0x001082, 0x0010},
-{0x00108E, 0x0004},
-{0x00108F, 0x0010},
-{0x001090, 0x0002},
-{0x00109A, 0x0010},
-{0x00109E, 0x0040},
-{0x0010A0, 0x0004},
-{0x0010C6, 0x0001},
-{0x0010C7, 0x0004},
-{0x0010C8, 0x0001},
-{0x0010CD, 0x0004},
-{0x0010CE, 0x0001},
-{0x0010D0, 0x0004},
-{0x0010FB, 0x0020},
-{0x0010FC, 0x0004},
-{0x001249, 0x0001},
-{0x00124A, 0x0004},
-{0x00124E, 0x0001},
-{0x001250, 0x0004},
-{0x001257, 0x0001},
-{0x001258, 0x0004},
-{0x001259, 0x0001},
-{0x00125A, 0x0004},
-{0x00125E, 0x0001},
-{0x001260, 0x0004},
-{0x001289, 0x0001},
-{0x00128A, 0x0004},
-{0x00128E, 0x0001},
-{0x001290, 0x0004},
-{0x0012B1, 0x0001},
-{0x0012B2, 0x0004},
-{0x0012B6, 0x0001},
-{0x0012B8, 0x0004},
-{0x0012BF, 0x0001},
-{0x0012C0, 0x0004},
-{0x0012C1, 0x0001},
-{0x0012C2, 0x0004},
-{0x0012C6, 0x0001},
-{0x0012C8, 0x0004},
-{0x0012D7, 0x0001},
-{0x0012D8, 0x0004},
-{0x001311, 0x0001},
-{0x001312, 0x0004},
-{0x001316, 0x0001},
-{0x001318, 0x0004},
-{0x00135B, 0x0001},
-{0x00135D, 0x0010},
-{0x001360, 0x0020},
-{0x001369, 0x0002},
-{0x00137D, 0x0001},
-{0x001380, 0x0004},
-{0x001390, 0x0040},
-{0x00139A, 0x0001},
-{0x0013A0, 0x0004},
-{0x0013F6, 0x0001},
-{0x0013F8, 0x0004},
-{0x0013FE, 0x0001},
-{0x001400, 0x0020},
-{0x001401, 0x0004},
-{0x00166D, 0x0040},
-{0x00166E, 0x0020},
-{0x00166F, 0x0004},
-{0x001680, 0x0008},
-{0x001681, 0x0004},
-{0x00169B, 0x0020},
-{0x00169D, 0x0001},
-{0x0016A0, 0x0004},
-{0x0016EB, 0x0020},
-{0x0016EE, 0x0002},
-{0x0016F1, 0x0004},
-{0x0016F9, 0x0001},
-{0x001700, 0x0004},
-{0x001712, 0x0010},
-{0x001716, 0x0001},
-{0x00171F, 0x0004},
-{0x001732, 0x0010},
-{0x001735, 0x0020},
-{0x001737, 0x0001},
-{0x001740, 0x0004},
-{0x001752, 0x0010},
-{0x001754, 0x0001},
-{0x001760, 0x0004},
-{0x00176D, 0x0001},
-{0x00176E, 0x0004},
-{0x001771, 0x0001},
-{0x001772, 0x0010},
-{0x001774, 0x0001},
-{0x001780, 0x0004},
-{0x0017B4, 0x0010},
-{0x0017D4, 0x0020},
-{0x0017D7, 0x0004},
-{0x0017D8, 0x0020},
-{0x0017DB, 0x0040},
-{0x0017DC, 0x0004},
-{0x0017DD, 0x0010},
-{0x0017DE, 0x0001},
-{0x0017E0, 0x0002},
-{0x0017EA, 0x0001},
-{0x0017F0, 0x0002},
-{0x0017FA, 0x0001},
-{0x001800, 0x0020},
-{0x00180B, 0x0010},
-{0x00180E, 0x0080},
-{0x00180F, 0x0010},
-{0x001810, 0x0002},
-{0x00181A, 0x0001},
-{0x001820, 0x0004},
-{0x001879, 0x0001},
-{0x001880, 0x0004},
-{0x001885, 0x0010},
-{0x001887, 0x0004},
-{0x0018A9, 0x0010},
-{0x0018AA, 0x0004},
-{0x0018AB, 0x0001},
-{0x0018B0, 0x0004},
-{0x0018F6, 0x0001},
-{0x001900, 0x0004},
-{0x00191F, 0x0001},
-{0x001920, 0x0010},
-{0x00192C, 0x0001},
-{0x001930, 0x0010},
-{0x00193C, 0x0001},
-{0x001940, 0x0040},
-{0x001941, 0x0001},
-{0x001944, 0x0020},
-{0x001946, 0x0002},
-{0x001950, 0x0004},
-{0x00196E, 0x0001},
-{0x001970, 0x0004},
-{0x001975, 0x0001},
-{0x001980, 0x0004},
-{0x0019AC, 0x0001},
-{0x0019B0, 0x0004},
-{0x0019CA, 0x0001},
-{0x0019D0, 0x0002},
-{0x0019DB, 0x0001},
-{0x0019DE, 0x0040},
-{0x001A00, 0x0004},
-{0x001A17, 0x0010},
-{0x001A1C, 0x0001},
-{0x001A1E, 0x0020},
-{0x001A20, 0x0004},
-{0x001A55, 0x0010},
-{0x001A5F, 0x0001},
-{0x001A60, 0x0010},
-{0x001A7D, 0x0001},
-{0x001A7F, 0x0010},
-{0x001A80, 0x0002},
-{0x001A8A, 0x0001},
-{0x001A90, 0x0002},
-{0x001A9A, 0x0001},
-{0x001AA0, 0x0020},
-{0x001AA7, 0x0004},
-{0x001AA8, 0x0020},
-{0x001AAE, 0x0001},
-{0x001AB0, 0x0010},
-{0x001ACF, 0x0001},
-{0x001B00, 0x0010},
-{0x001B05, 0x0004},
-{0x001B34, 0x0010},
-{0x001B45, 0x0004},
-{0x001B4D, 0x0001},
-{0x001B50, 0x0002},
-{0x001B5A, 0x0020},
-{0x001B61, 0x0040},
-{0x001B6B, 0x0010},
-{0x001B74, 0x0040},
-{0x001B7D, 0x0020},
-{0x001B7F, 0x0001},
-{0x001B80, 0x0010},
-{0x001B83, 0x0004},
-{0x001BA1, 0x0010},
-{0x001BAE, 0x0004},
-{0x001BB0, 0x0002},
-{0x001BBA, 0x0004},
-{0x001BE6, 0x0010},
-{0x001BF4, 0x0001},
-{0x001BFC, 0x0020},
-{0x001C00, 0x0004},
-{0x001C24, 0x0010},
-{0x001C38, 0x0001},
-{0x001C3B, 0x0020},
-{0x001C40, 0x0002},
-{0x001C4A, 0x0001},
-{0x001C4D, 0x0004},
-{0x001C50, 0x0002},
-{0x001C5A, 0x0004},
-{0x001C7E, 0x0020},
-{0x001C80, 0x0004},
-{0x001C89, 0x0001},
-{0x001C90, 0x0004},
-{0x001CBB, 0x0001},
-{0x001CBD, 0x0004},
-{0x001CC0, 0x0020},
-{0x001CC8, 0x0001},
-{0x001CD0, 0x0010},
-{0x001CD3, 0x0020},
-{0x001CD4, 0x0010},
-{0x001CE9, 0x0004},
-{0x001CED, 0x0010},
-{0x001CEE, 0x0004},
-{0x001CF4, 0x0010},
-{0x001CF5, 0x0004},
-{0x001CF7, 0x0010},
-{0x001CFA, 0x0004},
-{0x001CFB, 0x0001},
-{0x001D00, 0x0004},
-{0x001DC0, 0x0010},
-{0x001E00, 0x0004},
-{0x001F16, 0x0001},
-{0x001F18, 0x0004},
-{0x001F1E, 0x0001},
-{0x001F20, 0x0004},
-{0x001F46, 0x0001},
-{0x001F48, 0x0004},
-{0x001F4E, 0x0001},
-{0x001F50, 0x0004},
-{0x001F58, 0x0001},
-{0x001F59, 0x0004},
-{0x001F5A, 0x0001},
-{0x001F5B, 0x0004},
-{0x001F5C, 0x0001},
-{0x001F5D, 0x0004},
-{0x001F5E, 0x0001},
-{0x001F5F, 0x0004},
-{0x001F7E, 0x0001},
-{0x001F80, 0x0004},
-{0x001FB5, 0x0001},
-{0x001FB6, 0x0004},
-{0x001FBD, 0x0040},
-{0x001FBE, 0x0004},
-{0x001FBF, 0x0040},
-{0x001FC2, 0x0004},
-{0x001FC5, 0x0001},
-{0x001FC6, 0x0004},
-{0x001FCD, 0x0040},
-{0x001FD0, 0x0004},
-{0x001FD4, 0x0001},
-{0x001FD6, 0x0004},
-{0x001FDC, 0x0001},
-{0x001FDD, 0x0040},
-{0x001FE0, 0x0004},
-{0x001FED, 0x0040},
-{0x001FF0, 0x0001},
-{0x001FF2, 0x0004},
-{0x001FF5, 0x0001},
-{0x001FF6, 0x0004},
-{0x001FFD, 0x0040},
-{0x001FFF, 0x0001},
-{0x002000, 0x0008},
-{0x00200B, 0x0080},
-{0x002010, 0x0020},
-{0x002028, 0x0008},
-{0x00202A, 0x0080},
-{0x00202F, 0x0008},
-{0x002030, 0x0020},
-{0x002044, 0x0040},
-{0x002045, 0x0020},
-{0x002052, 0x0040},
-{0x002053, 0x0020},
-{0x00205F, 0x0008},
-{0x002060, 0x0080},
-{0x002065, 0x0001},
-{0x002066, 0x0080},
-{0x002070, 0x0002},
-{0x002071, 0x0004},
-{0x002072, 0x0001},
-{0x002074, 0x0002},
-{0x00207A, 0x0040},
-{0x00207D, 0x0020},
-{0x00207F, 0x0004},
-{0x002080, 0x0002},
-{0x00208A, 0x0040},
-{0x00208D, 0x0020},
-{0x00208F, 0x0001},
-{0x002090, 0x0004},
-{0x00209D, 0x0001},
-{0x0020A0, 0x0040},
-{0x0020C1, 0x0001},
-{0x0020D0, 0x0010},
-{0x0020F1, 0x0001},
-{0x002100, 0x0040},
-{0x002102, 0x0004},
-{0x002103, 0x0040},
-{0x002107, 0x0004},
-{0x002108, 0x0040},
-{0x00210A, 0x0004},
-{0x002114, 0x0040},
-{0x002115, 0x0004},
-{0x002116, 0x0040},
-{0x002119, 0x0004},
-{0x00211E, 0x0040},
-{0x002124, 0x0004},
-{0x002125, 0x0040},
-{0x002126, 0x0004},
-{0x002127, 0x0040},
-{0x002128, 0x0004},
-{0x002129, 0x0040},
-{0x00212A, 0x0004},
-{0x00212E, 0x0040},
-{0x00212F, 0x0004},
-{0x00213A, 0x0040},
-{0x00213C, 0x0004},
-{0x002140, 0x0040},
-{0x002145, 0x0004},
-{0x00214A, 0x0040},
-{0x00214E, 0x0004},
-{0x00214F, 0x0040},
-{0x002150, 0x0002},
-{0x002183, 0x0004},
-{0x002185, 0x0002},
-{0x00218A, 0x0040},
-{0x00218C, 0x0001},
-{0x002190, 0x0040},
-{0x002308, 0x0020},
-{0x00230C, 0x0040},
-{0x002329, 0x0020},
-{0x00232B, 0x0040},
-{0x002427, 0x0001},
-{0x002440, 0x0040},
-{0x00244B, 0x0001},
-{0x002460, 0x0002},
-{0x00249C, 0x0040},
-{0x0024EA, 0x0002},
-{0x002500, 0x0040},
-{0x002768, 0x0020},
-{0x002776, 0x0002},
-{0x002794, 0x0040},
-{0x0027C5, 0x0020},
-{0x0027C7, 0x0040},
-{0x0027E6, 0x0020},
-{0x0027F0, 0x0040},
-{0x002983, 0x0020},
-{0x002999, 0x0040},
-{0x0029D8, 0x0020},
-{0x0029DC, 0x0040},
-{0x0029FC, 0x0020},
-{0x0029FE, 0x0040},
-{0x002B74, 0x0001},
-{0x002B76, 0x0040},
-{0x002B96, 0x0001},
-{0x002B97, 0x0040},
-{0x002C00, 0x0004},
-{0x002CE5, 0x0040},
-{0x002CEB, 0x0004},
-{0x002CEF, 0x0010},
-{0x002CF2, 0x0004},
-{0x002CF4, 0x0001},
-{0x002CF9, 0x0020},
-{0x002CFD, 0x0002},
-{0x002CFE, 0x0020},
-{0x002D00, 0x0004},
-{0x002D26, 0x0001},
-{0x002D27, 0x0004},
-{0x002D28, 0x0001},
-{0x002D2D, 0x0004},
-{0x002D2E, 0x0001},
-{0x002D30, 0x0004},
-{0x002D68, 0x0001},
-{0x002D6F, 0x0004},
-{0x002D70, 0x0020},
-{0x002D71, 0x0001},
-{0x002D7F, 0x0010},
-{0x002D80, 0x0004},
-{0x002D97, 0x0001},
-{0x002DA0, 0x0004},
-{0x002DA7, 0x0001},
-{0x002DA8, 0x0004},
-{0x002DAF, 0x0001},
-{0x002DB0, 0x0004},
-{0x002DB7, 0x0001},
-{0x002DB8, 0x0004},
-{0x002DBF, 0x0001},
-{0x002DC0, 0x0004},
-{0x002DC7, 0x0001},
-{0x002DC8, 0x0004},
-{0x002DCF, 0x0001},
-{0x002DD0, 0x0004},
-{0x002DD7, 0x0001},
-{0x002DD8, 0x0004},
-{0x002DDF, 0x0001},
-{0x002DE0, 0x0010},
-{0x002E00, 0x0020},
-{0x002E2F, 0x0004},
-{0x002E30, 0x0020},
-{0x002E50, 0x0040},
-{0x002E52, 0x0020},
-{0x002E5E, 0x0001},
-{0x002E80, 0x0040},
-{0x002E9A, 0x0001},
-{0x002E9B, 0x0040},
-{0x002EF4, 0x0001},
-{0x002F00, 0x0040},
-{0x002FD6, 0x0001},
-{0x002FF0, 0x0040},
-{0x003000, 0x0008},
-{0x003001, 0x0020},
-{0x003004, 0x0040},
-{0x003005, 0x0004},
-{0x003007, 0x0002},
-{0x003008, 0x0020},
-{0x003012, 0x0040},
-{0x003014, 0x0020},
-{0x003020, 0x0040},
-{0x003021, 0x0002},
-{0x00302A, 0x0010},
-{0x003030, 0x0020},
-{0x003031, 0x0004},
-{0x003036, 0x0040},
-{0x003038, 0x0002},
-{0x00303B, 0x0004},
-{0x00303D, 0x0020},
-{0x00303E, 0x0040},
-{0x003040, 0x0001},
-{0x003041, 0x0004},
-{0x003097, 0x0001},
-{0x003099, 0x0010},
-{0x00309B, 0x0040},
-{0x00309D, 0x0004},
-{0x0030A0, 0x0020},
-{0x0030A1, 0x0004},
-{0x0030FB, 0x0020},
-{0x0030FC, 0x0004},
-{0x003100, 0x0001},
-{0x003105, 0x0004},
-{0x003130, 0x0001},
-{0x003131, 0x0004},
-{0x00318F, 0x0001},
-{0x003190, 0x0040},
-{0x003192, 0x0002},
-{0x003196, 0x0040},
-{0x0031A0, 0x0004},
-{0x0031C0, 0x0040},
-{0x0031E4, 0x0001},
-{0x0031EF, 0x0040},
-{0x0031F0, 0x0004},
-{0x003200, 0x0040},
-{0x00321F, 0x0001},
-{0x003220, 0x0002},
-{0x00322A, 0x0040},
-{0x003248, 0x0002},
-{0x003250, 0x0040},
-{0x003251, 0x0002},
-{0x003260, 0x0040},
-{0x003280, 0x0002},
-{0x00328A, 0x0040},
-{0x0032B1, 0x0002},
-{0x0032C0, 0x0040},
-{0x003400, 0x0004},
-{0x004DC0, 0x0040},
-{0x004E00, 0x0004},
-{0x00A48D, 0x0001},
-{0x00A490, 0x0040},
-{0x00A4C7, 0x0001},
-{0x00A4D0, 0x0004},
-{0x00A4FE, 0x0020},
-{0x00A500, 0x0004},
-{0x00A60D, 0x0020},
-{0x00A610, 0x0004},
-{0x00A620, 0x0002},
-{0x00A62A, 0x0004},
-{0x00A62C, 0x0001},
-{0x00A640, 0x0004},
-{0x00A66F, 0x0010},
-{0x00A673, 0x0020},
-{0x00A674, 0x0010},
-{0x00A67E, 0x0020},
-{0x00A67F, 0x0004},
-{0x00A69E, 0x0010},
-{0x00A6A0, 0x0004},
-{0x00A6E6, 0x0002},
-{0x00A6F0, 0x0010},
-{0x00A6F2, 0x0020},
-{0x00A6F8, 0x0001},
-{0x00A700, 0x0040},
-{0x00A717, 0x0004},
-{0x00A720, 0x0040},
-{0x00A722, 0x0004},
-{0x00A789, 0x0040},
-{0x00A78B, 0x0004},
-{0x00A7CB, 0x0001},
-{0x00A7D0, 0x0004},
-{0x00A7D2, 0x0001},
-{0x00A7D3, 0x0004},
-{0x00A7D4, 0x0001},
-{0x00A7D5, 0x0004},
-{0x00A7DA, 0x0001},
-{0x00A7F2, 0x0004},
-{0x00A802, 0x0010},
-{0x00A803, 0x0004},
-{0x00A806, 0x0010},
-{0x00A807, 0x0004},
-{0x00A80B, 0x0010},
-{0x00A80C, 0x0004},
-{0x00A823, 0x0010},
-{0x00A828, 0x0040},
-{0x00A82C, 0x0010},
-{0x00A82D, 0x0001},
-{0x00A830, 0x0002},
-{0x00A836, 0x0040},
-{0x00A83A, 0x0001},
-{0x00A840, 0x0004},
-{0x00A874, 0x0020},
-{0x00A878, 0x0001},
-{0x00A880, 0x0010},
-{0x00A882, 0x0004},
-{0x00A8B4, 0x0010},
-{0x00A8C6, 0x0001},
-{0x00A8CE, 0x0020},
-{0x00A8D0, 0x0002},
-{0x00A8DA, 0x0001},
-{0x00A8E0, 0x0010},
-{0x00A8F2, 0x0004},
-{0x00A8F8, 0x0020},
-{0x00A8FB, 0x0004},
-{0x00A8FC, 0x0020},
-{0x00A8FD, 0x0004},
-{0x00A8FF, 0x0010},
-{0x00A900, 0x0002},
-{0x00A90A, 0x0004},
-{0x00A926, 0x0010},
-{0x00A92E, 0x0020},
-{0x00A930, 0x0004},
-{0x00A947, 0x0010},
-{0x00A954, 0x0001},
-{0x00A95F, 0x0020},
-{0x00A960, 0x0004},
-{0x00A97D, 0x0001},
-{0x00A980, 0x0010},
-{0x00A984, 0x0004},
-{0x00A9B3, 0x0010},
-{0x00A9C1, 0x0020},
-{0x00A9CE, 0x0001},
-{0x00A9CF, 0x0004},
-{0x00A9D0, 0x0002},
-{0x00A9DA, 0x0001},
-{0x00A9DE, 0x0020},
-{0x00A9E0, 0x0004},
-{0x00A9E5, 0x0010},
-{0x00A9E6, 0x0004},
-{0x00A9F0, 0x0002},
-{0x00A9FA, 0x0004},
-{0x00A9FF, 0x0001},
-{0x00AA00, 0x0004},
-{0x00AA29, 0x0010},
-{0x00AA37, 0x0001},
-{0x00AA40, 0x0004},
-{0x00AA43, 0x0010},
-{0x00AA44, 0x0004},
-{0x00AA4C, 0x0010},
-{0x00AA4E, 0x0001},
-{0x00AA50, 0x0002},
-{0x00AA5A, 0x0001},
-{0x00AA5C, 0x0020},
-{0x00AA60, 0x0004},
-{0x00AA77, 0x0040},
-{0x00AA7A, 0x0004},
-{0x00AA7B, 0x0010},
-{0x00AA7E, 0x0004},
-{0x00AAB0, 0x0010},
-{0x00AAB1, 0x0004},
-{0x00AAB2, 0x0010},
-{0x00AAB5, 0x0004},
-{0x00AAB7, 0x0010},
-{0x00AAB9, 0x0004},
-{0x00AABE, 0x0010},
-{0x00AAC0, 0x0004},
-{0x00AAC1, 0x0010},
-{0x00AAC2, 0x0004},
-{0x00AAC3, 0x0001},
-{0x00AADB, 0x0004},
-{0x00AADE, 0x0020},
-{0x00AAE0, 0x0004},
-{0x00AAEB, 0x0010},
-{0x00AAF0, 0x0020},
-{0x00AAF2, 0x0004},
-{0x00AAF5, 0x0010},
-{0x00AAF7, 0x0001},
-{0x00AB01, 0x0004},
-{0x00AB07, 0x0001},
-{0x00AB09, 0x0004},
-{0x00AB0F, 0x0001},
-{0x00AB11, 0x0004},
-{0x00AB17, 0x0001},
-{0x00AB20, 0x0004},
-{0x00AB27, 0x0001},
-{0x00AB28, 0x0004},
-{0x00AB2F, 0x0001},
-{0x00AB30, 0x0004},
-{0x00AB5B, 0x0040},
-{0x00AB5C, 0x0004},
-{0x00AB6A, 0x0040},
-{0x00AB6C, 0x0001},
-{0x00AB70, 0x0004},
-{0x00ABE3, 0x0010},
-{0x00ABEB, 0x0020},
-{0x00ABEC, 0x0010},
-{0x00ABEE, 0x0001},
-{0x00ABF0, 0x0002},
-{0x00ABFA, 0x0001},
-{0x00AC00, 0x0004},
-{0x00D7A4, 0x0001},
-{0x00D7B0, 0x0004},
-{0x00D7C7, 0x0001},
-{0x00D7CB, 0x0004},
-{0x00D7FC, 0x0001},
-{0x00D800, 0x0080},
-{0x00F900, 0x0004},
-{0x00FA6E, 0x0001},
-{0x00FA70, 0x0004},
-{0x00FADA, 0x0001},
-{0x00FB00, 0x0004},
-{0x00FB07, 0x0001},
-{0x00FB13, 0x0004},
-{0x00FB18, 0x0001},
-{0x00FB1D, 0x0004},
-{0x00FB1E, 0x0010},
-{0x00FB1F, 0x0004},
-{0x00FB29, 0x0040},
-{0x00FB2A, 0x0004},
-{0x00FB37, 0x0001},
-{0x00FB38, 0x0004},
-{0x00FB3D, 0x0001},
-{0x00FB3E, 0x0004},
-{0x00FB3F, 0x0001},
-{0x00FB40, 0x0004},
-{0x00FB42, 0x0001},
-{0x00FB43, 0x0004},
-{0x00FB45, 0x0001},
-{0x00FB46, 0x0004},
-{0x00FBB2, 0x0040},
-{0x00FBC3, 0x0001},
-{0x00FBD3, 0x0004},
-{0x00FD3E, 0x0020},
-{0x00FD40, 0x0040},
-{0x00FD50, 0x0004},
-{0x00FD90, 0x0001},
-{0x00FD92, 0x0004},
-{0x00FDC8, 0x0001},
-{0x00FDCF, 0x0040},
-{0x00FDD0, 0x0001},
-{0x00FDF0, 0x0004},
-{0x00FDFC, 0x0040},
-{0x00FE00, 0x0010},
-{0x00FE10, 0x0020},
-{0x00FE1A, 0x0001},
-{0x00FE20, 0x0010},
-{0x00FE30, 0x0020},
-{0x00FE53, 0x0001},
-{0x00FE54, 0x0020},
-{0x00FE62, 0x0040},
-{0x00FE63, 0x0020},
-{0x00FE64, 0x0040},
-{0x00FE67, 0x0001},
-{0x00FE68, 0x0020},
-{0x00FE69, 0x0040},
-{0x00FE6A, 0x0020},
-{0x00FE6C, 0x0001},
-{0x00FE70, 0x0004},
-{0x00FE75, 0x0001},
-{0x00FE76, 0x0004},
-{0x00FEFD, 0x0001},
-{0x00FEFF, 0x0080},
-{0x00FF00, 0x0001},
-{0x00FF01, 0x0020},
-{0x00FF04, 0x0040},
-{0x00FF05, 0x0020},
-{0x00FF0B, 0x0040},
-{0x00FF0C, 0x0020},
-{0x00FF10, 0x0002},
-{0x00FF1A, 0x0020},
-{0x00FF1C, 0x0040},
-{0x00FF1F, 0x0020},
-{0x00FF21, 0x0004},
-{0x00FF3B, 0x0020},
-{0x00FF3E, 0x0040},
-{0x00FF3F, 0x0020},
-{0x00FF40, 0x0040},
-{0x00FF41, 0x0004},
-{0x00FF5B, 0x0020},
-{0x00FF5C, 0x0040},
-{0x00FF5D, 0x0020},
-{0x00FF5E, 0x0040},
-{0x00FF5F, 0x0020},
-{0x00FF66, 0x0004},
-{0x00FFBF, 0x0001},
-{0x00FFC2, 0x0004},
-{0x00FFC8, 0x0001},
-{0x00FFCA, 0x0004},
-{0x00FFD0, 0x0001},
-{0x00FFD2, 0x0004},
-{0x00FFD8, 0x0001},
-{0x00FFDA, 0x0004},
-{0x00FFDD, 0x0001},
-{0x00FFE0, 0x0040},
-{0x00FFE7, 0x0001},
-{0x00FFE8, 0x0040},
-{0x00FFEF, 0x0001},
-{0x00FFF9, 0x0080},
-{0x00FFFC, 0x0040},
-{0x00FFFE, 0x0001},
-{0x010000, 0x0004},
-{0x01000C, 0x0001},
-{0x01000D, 0x0004},
-{0x010027, 0x0001},
-{0x010028, 0x0004},
-{0x01003B, 0x0001},
-{0x01003C, 0x0004},
-{0x01003E, 0x0001},
-{0x01003F, 0x0004},
-{0x01004E, 0x0001},
-{0x010050, 0x0004},
-{0x01005E, 0x0001},
-{0x010080, 0x0004},
-{0x0100FB, 0x0001},
-{0x010100, 0x0020},
-{0x010103, 0x0001},
-{0x010107, 0x0002},
-{0x010134, 0x0001},
-{0x010137, 0x0040},
-{0x010140, 0x0002},
-{0x010179, 0x0040},
-{0x01018A, 0x0002},
-{0x01018C, 0x0040},
-{0x01018F, 0x0001},
-{0x010190, 0x0040},
-{0x01019D, 0x0001},
-{0x0101A0, 0x0040},
-{0x0101A1, 0x0001},
-{0x0101D0, 0x0040},
-{0x0101FD, 0x0010},
-{0x0101FE, 0x0001},
-{0x010280, 0x0004},
-{0x01029D, 0x0001},
-{0x0102A0, 0x0004},
-{0x0102D1, 0x0001},
-{0x0102E0, 0x0010},
-{0x0102E1, 0x0002},
-{0x0102FC, 0x0001},
-{0x010300, 0x0004},
-{0x010320, 0x0002},
-{0x010324, 0x0001},
-{0x01032D, 0x0004},
-{0x010341, 0x0002},
-{0x010342, 0x0004},
-{0x01034A, 0x0002},
-{0x01034B, 0x0001},
-{0x010350, 0x0004},
-{0x010376, 0x0010},
-{0x01037B, 0x0001},
-{0x010380, 0x0004},
-{0x01039E, 0x0001},
-{0x01039F, 0x0020},
-{0x0103A0, 0x0004},
-{0x0103C4, 0x0001},
-{0x0103C8, 0x0004},
-{0x0103D0, 0x0020},
-{0x0103D1, 0x0002},
-{0x0103D6, 0x0001},
-{0x010400, 0x0004},
-{0x01049E, 0x0001},
-{0x0104A0, 0x0002},
-{0x0104AA, 0x0001},
-{0x0104B0, 0x0004},
-{0x0104D4, 0x0001},
-{0x0104D8, 0x0004},
-{0x0104FC, 0x0001},
-{0x010500, 0x0004},
-{0x010528, 0x0001},
-{0x010530, 0x0004},
-{0x010564, 0x0001},
-{0x01056F, 0x0020},
-{0x010570, 0x0004},
-{0x01057B, 0x0001},
-{0x01057C, 0x0004},
-{0x01058B, 0x0001},
-{0x01058C, 0x0004},
-{0x010593, 0x0001},
-{0x010594, 0x0004},
-{0x010596, 0x0001},
-{0x010597, 0x0004},
-{0x0105A2, 0x0001},
-{0x0105A3, 0x0004},
-{0x0105B2, 0x0001},
-{0x0105B3, 0x0004},
-{0x0105BA, 0x0001},
-{0x0105BB, 0x0004},
-{0x0105BD, 0x0001},
-{0x010600, 0x0004},
-{0x010737, 0x0001},
-{0x010740, 0x0004},
-{0x010756, 0x0001},
-{0x010760, 0x0004},
-{0x010768, 0x0001},
-{0x010780, 0x0004},
-{0x010786, 0x0001},
-{0x010787, 0x0004},
-{0x0107B1, 0x0001},
-{0x0107B2, 0x0004},
-{0x0107BB, 0x0001},
-{0x010800, 0x0004},
-{0x010806, 0x0001},
-{0x010808, 0x0004},
-{0x010809, 0x0001},
-{0x01080A, 0x0004},
-{0x010836, 0x0001},
-{0x010837, 0x0004},
-{0x010839, 0x0001},
-{0x01083C, 0x0004},
-{0x01083D, 0x0001},
-{0x01083F, 0x0004},
-{0x010856, 0x0001},
-{0x010857, 0x0020},
-{0x010858, 0x0002},
-{0x010860, 0x0004},
-{0x010877, 0x0040},
-{0x010879, 0x0002},
-{0x010880, 0x0004},
-{0x01089F, 0x0001},
-{0x0108A7, 0x0002},
-{0x0108B0, 0x0001},
-{0x0108E0, 0x0004},
-{0x0108F3, 0x0001},
-{0x0108F4, 0x0004},
-{0x0108F6, 0x0001},
-{0x0108FB, 0x0002},
-{0x010900, 0x0004},
-{0x010916, 0x0002},
-{0x01091C, 0x0001},
-{0x01091F, 0x0020},
-{0x010920, 0x0004},
-{0x01093A, 0x0001},
-{0x01093F, 0x0020},
-{0x010940, 0x0001},
-{0x010980, 0x0004},
-{0x0109B8, 0x0001},
-{0x0109BC, 0x0002},
-{0x0109BE, 0x0004},
-{0x0109C0, 0x0002},
-{0x0109D0, 0x0001},
-{0x0109D2, 0x0002},
-{0x010A00, 0x0004},
-{0x010A01, 0x0010},
-{0x010A04, 0x0001},
-{0x010A05, 0x0010},
-{0x010A07, 0x0001},
-{0x010A0C, 0x0010},
-{0x010A10, 0x0004},
-{0x010A14, 0x0001},
-{0x010A15, 0x0004},
-{0x010A18, 0x0001},
-{0x010A19, 0x0004},
-{0x010A36, 0x0001},
-{0x010A38, 0x0010},
-{0x010A3B, 0x0001},
-{0x010A3F, 0x0010},
-{0x010A40, 0x0002},
-{0x010A49, 0x0001},
-{0x010A50, 0x0020},
-{0x010A59, 0x0001},
-{0x010A60, 0x0004},
-{0x010A7D, 0x0002},
-{0x010A7F, 0x0020},
-{0x010A80, 0x0004},
-{0x010A9D, 0x0002},
-{0x010AA0, 0x0001},
-{0x010AC0, 0x0004},
-{0x010AC8, 0x0040},
-{0x010AC9, 0x0004},
-{0x010AE5, 0x0010},
-{0x010AE7, 0x0001},
-{0x010AEB, 0x0002},
-{0x010AF0, 0x0020},
-{0x010AF7, 0x0001},
-{0x010B00, 0x0004},
-{0x010B36, 0x0001},
-{0x010B39, 0x0020},
-{0x010B40, 0x0004},
-{0x010B56, 0x0001},
-{0x010B58, 0x0002},
-{0x010B60, 0x0004},
-{0x010B73, 0x0001},
-{0x010B78, 0x0002},
-{0x010B80, 0x0004},
-{0x010B92, 0x0001},
-{0x010B99, 0x0020},
-{0x010B9D, 0x0001},
-{0x010BA9, 0x0002},
-{0x010BB0, 0x0001},
-{0x010C00, 0x0004},
-{0x010C49, 0x0001},
-{0x010C80, 0x0004},
-{0x010CB3, 0x0001},
-{0x010CC0, 0x0004},
-{0x010CF3, 0x0001},
-{0x010CFA, 0x0002},
-{0x010D00, 0x0004},
-{0x010D24, 0x0010},
-{0x010D28, 0x0001},
-{0x010D30, 0x0002},
-{0x010D3A, 0x0001},
-{0x010E60, 0x0002},
-{0x010E7F, 0x0001},
-{0x010E80, 0x0004},
-{0x010EAA, 0x0001},
-{0x010EAB, 0x0010},
-{0x010EAD, 0x0020},
-{0x010EAE, 0x0001},
-{0x010EB0, 0x0004},
-{0x010EB2, 0x0001},
-{0x010EFD, 0x0010},
-{0x010F00, 0x0004},
-{0x010F1D, 0x0002},
-{0x010F27, 0x0004},
-{0x010F28, 0x0001},
-{0x010F30, 0x0004},
-{0x010F46, 0x0010},
-{0x010F51, 0x0002},
-{0x010F55, 0x0020},
-{0x010F5A, 0x0001},
-{0x010F70, 0x0004},
-{0x010F82, 0x0010},
-{0x010F86, 0x0020},
-{0x010F8A, 0x0001},
-{0x010FB0, 0x0004},
-{0x010FC5, 0x0002},
-{0x010FCC, 0x0001},
-{0x010FE0, 0x0004},
-{0x010FF7, 0x0001},
-{0x011000, 0x0010},
-{0x011003, 0x0004},
-{0x011038, 0x0010},
-{0x011047, 0x0020},
-{0x01104E, 0x0001},
-{0x011052, 0x0002},
-{0x011070, 0x0010},
-{0x011071, 0x0004},
-{0x011073, 0x0010},
-{0x011075, 0x0004},
-{0x011076, 0x0001},
-{0x01107F, 0x0010},
-{0x011083, 0x0004},
-{0x0110B0, 0x0010},
-{0x0110BB, 0x0020},
-{0x0110BD, 0x0080},
-{0x0110BE, 0x0020},
-{0x0110C2, 0x0010},
-{0x0110C3, 0x0001},
-{0x0110CD, 0x0080},
-{0x0110CE, 0x0001},
-{0x0110D0, 0x0004},
-{0x0110E9, 0x0001},
-{0x0110F0, 0x0002},
-{0x0110FA, 0x0001},
-{0x011100, 0x0010},
-{0x011103, 0x0004},
-{0x011127, 0x0010},
-{0x011135, 0x0001},
-{0x011136, 0x0002},
-{0x011140, 0x0020},
-{0x011144, 0x0004},
-{0x011145, 0x0010},
-{0x011147, 0x0004},
-{0x011148, 0x0001},
-{0x011150, 0x0004},
-{0x011173, 0x0010},
-{0x011174, 0x0020},
-{0x011176, 0x0004},
-{0x011177, 0x0001},
-{0x011180, 0x0010},
-{0x011183, 0x0004},
-{0x0111B3, 0x0010},
-{0x0111C1, 0x0004},
-{0x0111C5, 0x0020},
-{0x0111C9, 0x0010},
-{0x0111CD, 0x0020},
-{0x0111CE, 0x0010},
-{0x0111D0, 0x0002},
-{0x0111DA, 0x0004},
-{0x0111DB, 0x0020},
-{0x0111DC, 0x0004},
-{0x0111DD, 0x0020},
-{0x0111E0, 0x0001},
-{0x0111E1, 0x0002},
-{0x0111F5, 0x0001},
-{0x011200, 0x0004},
-{0x011212, 0x0001},
-{0x011213, 0x0004},
-{0x01122C, 0x0010},
-{0x011238, 0x0020},
-{0x01123E, 0x0010},
-{0x01123F, 0x0004},
-{0x011241, 0x0010},
-{0x011242, 0x0001},
-{0x011280, 0x0004},
-{0x011287, 0x0001},
-{0x011288, 0x0004},
-{0x011289, 0x0001},
-{0x01128A, 0x0004},
-{0x01128E, 0x0001},
-{0x01128F, 0x0004},
-{0x01129E, 0x0001},
-{0x01129F, 0x0004},
-{0x0112A9, 0x0020},
-{0x0112AA, 0x0001},
-{0x0112B0, 0x0004},
-{0x0112DF, 0x0010},
-{0x0112EB, 0x0001},
-{0x0112F0, 0x0002},
-{0x0112FA, 0x0001},
-{0x011300, 0x0010},
-{0x011304, 0x0001},
-{0x011305, 0x0004},
-{0x01130D, 0x0001},
-{0x01130F, 0x0004},
-{0x011311, 0x0001},
-{0x011313, 0x0004},
-{0x011329, 0x0001},
-{0x01132A, 0x0004},
-{0x011331, 0x0001},
-{0x011332, 0x0004},
-{0x011334, 0x0001},
-{0x011335, 0x0004},
-{0x01133A, 0x0001},
-{0x01133B, 0x0010},
-{0x01133D, 0x0004},
-{0x01133E, 0x0010},
-{0x011345, 0x0001},
-{0x011347, 0x0010},
-{0x011349, 0x0001},
-{0x01134B, 0x0010},
-{0x01134E, 0x0001},
-{0x011350, 0x0004},
-{0x011351, 0x0001},
-{0x011357, 0x0010},
-{0x011358, 0x0001},
-{0x01135D, 0x0004},
-{0x011362, 0x0010},
-{0x011364, 0x0001},
-{0x011366, 0x0010},
-{0x01136D, 0x0001},
-{0x011370, 0x0010},
-{0x011375, 0x0001},
-{0x011400, 0x0004},
-{0x011435, 0x0010},
-{0x011447, 0x0004},
-{0x01144B, 0x0020},
-{0x011450, 0x0002},
-{0x01145A, 0x0020},
-{0x01145C, 0x0001},
-{0x01145D, 0x0020},
-{0x01145E, 0x0010},
-{0x01145F, 0x0004},
-{0x011462, 0x0001},
-{0x011480, 0x0004},
-{0x0114B0, 0x0010},
-{0x0114C4, 0x0004},
-{0x0114C6, 0x0020},
-{0x0114C7, 0x0004},
-{0x0114C8, 0x0001},
-{0x0114D0, 0x0002},
-{0x0114DA, 0x0001},
-{0x011580, 0x0004},
-{0x0115AF, 0x0010},
-{0x0115B6, 0x0001},
-{0x0115B8, 0x0010},
-{0x0115C1, 0x0020},
-{0x0115D8, 0x0004},
-{0x0115DC, 0x0010},
-{0x0115DE, 0x0001},
-{0x011600, 0x0004},
-{0x011630, 0x0010},
-{0x011641, 0x0020},
-{0x011644, 0x0004},
-{0x011645, 0x0001},
-{0x011650, 0x0002},
-{0x01165A, 0x0001},
-{0x011660, 0x0020},
-{0x01166D, 0x0001},
-{0x011680, 0x0004},
-{0x0116AB, 0x0010},
-{0x0116B8, 0x0004},
-{0x0116B9, 0x0020},
-{0x0116BA, 0x0001},
-{0x0116C0, 0x0002},
-{0x0116CA, 0x0001},
-{0x011700, 0x0004},
-{0x01171B, 0x0001},
-{0x01171D, 0x0010},
-{0x01172C, 0x0001},
-{0x011730, 0x0002},
-{0x01173C, 0x0020},
-{0x01173F, 0x0040},
-{0x011740, 0x0004},
-{0x011747, 0x0001},
-{0x011800, 0x0004},
-{0x01182C, 0x0010},
-{0x01183B, 0x0020},
-{0x01183C, 0x0001},
-{0x0118A0, 0x0004},
-{0x0118E0, 0x0002},
-{0x0118F3, 0x0001},
-{0x0118FF, 0x0004},
-{0x011907, 0x0001},
-{0x011909, 0x0004},
-{0x01190A, 0x0001},
-{0x01190C, 0x0004},
-{0x011914, 0x0001},
-{0x011915, 0x0004},
-{0x011917, 0x0001},
-{0x011918, 0x0004},
-{0x011930, 0x0010},
-{0x011936, 0x0001},
-{0x011937, 0x0010},
-{0x011939, 0x0001},
-{0x01193B, 0x0010},
-{0x01193F, 0x0004},
-{0x011940, 0x0010},
-{0x011941, 0x0004},
-{0x011942, 0x0010},
-{0x011944, 0x0020},
-{0x011947, 0x0001},
-{0x011950, 0x0002},
-{0x01195A, 0x0001},
-{0x0119A0, 0x0004},
-{0x0119A8, 0x0001},
-{0x0119AA, 0x0004},
-{0x0119D1, 0x0010},
-{0x0119D8, 0x0001},
-{0x0119DA, 0x0010},
-{0x0119E1, 0x0004},
-{0x0119E2, 0x0020},
-{0x0119E3, 0x0004},
-{0x0119E4, 0x0010},
-{0x0119E5, 0x0001},
-{0x011A00, 0x0004},
-{0x011A01, 0x0010},
-{0x011A0B, 0x0004},
-{0x011A33, 0x0010},
-{0x011A3A, 0x0004},
-{0x011A3B, 0x0010},
-{0x011A3F, 0x0020},
-{0x011A47, 0x0010},
-{0x011A48, 0x0001},
-{0x011A50, 0x0004},
-{0x011A51, 0x0010},
-{0x011A5C, 0x0004},
-{0x011A8A, 0x0010},
-{0x011A9A, 0x0020},
-{0x011A9D, 0x0004},
-{0x011A9E, 0x0020},
-{0x011AA3, 0x0001},
-{0x011AB0, 0x0004},
-{0x011AF9, 0x0001},
-{0x011B00, 0x0020},
-{0x011B0A, 0x0001},
-{0x011C00, 0x0004},
-{0x011C09, 0x0001},
-{0x011C0A, 0x0004},
-{0x011C2F, 0x0010},
-{0x011C37, 0x0001},
-{0x011C38, 0x0010},
-{0x011C40, 0x0004},
-{0x011C41, 0x0020},
-{0x011C46, 0x0001},
-{0x011C50, 0x0002},
-{0x011C6D, 0x0001},
-{0x011C70, 0x0020},
-{0x011C72, 0x0004},
-{0x011C90, 0x0001},
-{0x011C92, 0x0010},
-{0x011CA8, 0x0001},
-{0x011CA9, 0x0010},
-{0x011CB7, 0x0001},
-{0x011D00, 0x0004},
-{0x011D07, 0x0001},
-{0x011D08, 0x0004},
-{0x011D0A, 0x0001},
-{0x011D0B, 0x0004},
-{0x011D31, 0x0010},
-{0x011D37, 0x0001},
-{0x011D3A, 0x0010},
-{0x011D3B, 0x0001},
-{0x011D3C, 0x0010},
-{0x011D3E, 0x0001},
-{0x011D3F, 0x0010},
-{0x011D46, 0x0004},
-{0x011D47, 0x0010},
-{0x011D48, 0x0001},
-{0x011D50, 0x0002},
-{0x011D5A, 0x0001},
-{0x011D60, 0x0004},
-{0x011D66, 0x0001},
-{0x011D67, 0x0004},
-{0x011D69, 0x0001},
-{0x011D6A, 0x0004},
-{0x011D8A, 0x0010},
-{0x011D8F, 0x0001},
-{0x011D90, 0x0010},
-{0x011D92, 0x0001},
-{0x011D93, 0x0010},
-{0x011D98, 0x0004},
-{0x011D99, 0x0001},
-{0x011DA0, 0x0002},
-{0x011DAA, 0x0001},
-{0x011EE0, 0x0004},
-{0x011EF3, 0x0010},
-{0x011EF7, 0x0020},
-{0x011EF9, 0x0001},
-{0x011F00, 0x0010},
-{0x011F02, 0x0004},
-{0x011F03, 0x0010},
-{0x011F04, 0x0004},
-{0x011F11, 0x0001},
-{0x011F12, 0x0004},
-{0x011F34, 0x0010},
-{0x011F3B, 0x0001},
-{0x011F3E, 0x0010},
-{0x011F43, 0x0020},
-{0x011F50, 0x0002},
-{0x011F5A, 0x0001},
-{0x011FB0, 0x0004},
-{0x011FB1, 0x0001},
-{0x011FC0, 0x0002},
-{0x011FD5, 0x0040},
-{0x011FF2, 0x0001},
-{0x011FFF, 0x0020},
-{0x012000, 0x0004},
-{0x01239A, 0x0001},
-{0x012400, 0x0002},
-{0x01246F, 0x0001},
-{0x012470, 0x0020},
-{0x012475, 0x0001},
-{0x012480, 0x0004},
-{0x012544, 0x0001},
-{0x012F90, 0x0004},
-{0x012FF1, 0x0020},
-{0x012FF3, 0x0001},
-{0x013000, 0x0004},
-{0x013430, 0x0080},
-{0x013440, 0x0010},
-{0x013441, 0x0004},
-{0x013447, 0x0010},
-{0x013456, 0x0001},
-{0x014400, 0x0004},
-{0x014647, 0x0001},
-{0x016800, 0x0004},
-{0x016A39, 0x0001},
-{0x016A40, 0x0004},
-{0x016A5F, 0x0001},
-{0x016A60, 0x0002},
-{0x016A6A, 0x0001},
-{0x016A6E, 0x0020},
-{0x016A70, 0x0004},
-{0x016ABF, 0x0001},
-{0x016AC0, 0x0002},
-{0x016ACA, 0x0001},
-{0x016AD0, 0x0004},
-{0x016AEE, 0x0001},
-{0x016AF0, 0x0010},
-{0x016AF5, 0x0020},
-{0x016AF6, 0x0001},
-{0x016B00, 0x0004},
-{0x016B30, 0x0010},
-{0x016B37, 0x0020},
-{0x016B3C, 0x0040},
-{0x016B40, 0x0004},
-{0x016B44, 0x0020},
-{0x016B45, 0x0040},
-{0x016B46, 0x0001},
-{0x016B50, 0x0002},
-{0x016B5A, 0x0001},
-{0x016B5B, 0x0002},
-{0x016B62, 0x0001},
-{0x016B63, 0x0004},
-{0x016B78, 0x0001},
-{0x016B7D, 0x0004},
-{0x016B90, 0x0001},
-{0x016E40, 0x0004},
-{0x016E80, 0x0002},
-{0x016E97, 0x0020},
-{0x016E9B, 0x0001},
-{0x016F00, 0x0004},
-{0x016F4B, 0x0001},
-{0x016F4F, 0x0010},
-{0x016F50, 0x0004},
-{0x016F51, 0x0010},
-{0x016F88, 0x0001},
-{0x016F8F, 0x0010},
-{0x016F93, 0x0004},
-{0x016FA0, 0x0001},
-{0x016FE0, 0x0004},
-{0x016FE2, 0x0020},
-{0x016FE3, 0x0004},
-{0x016FE4, 0x0010},
-{0x016FE5, 0x0001},
-{0x016FF0, 0x0010},
-{0x016FF2, 0x0001},
-{0x017000, 0x0004},
-{0x0187F8, 0x0001},
-{0x018800, 0x0004},
-{0x018CD6, 0x0001},
-{0x018D00, 0x0004},
-{0x018D09, 0x0001},
-{0x01AFF0, 0x0004},
-{0x01AFF4, 0x0001},
-{0x01AFF5, 0x0004},
-{0x01AFFC, 0x0001},
-{0x01AFFD, 0x0004},
-{0x01AFFF, 0x0001},
-{0x01B000, 0x0004},
-{0x01B123, 0x0001},
-{0x01B132, 0x0004},
-{0x01B133, 0x0001},
-{0x01B150, 0x0004},
-{0x01B153, 0x0001},
-{0x01B155, 0x0004},
-{0x01B156, 0x0001},
-{0x01B164, 0x0004},
-{0x01B168, 0x0001},
-{0x01B170, 0x0004},
-{0x01B2FC, 0x0001},
-{0x01BC00, 0x0004},
-{0x01BC6B, 0x0001},
-{0x01BC70, 0x0004},
-{0x01BC7D, 0x0001},
-{0x01BC80, 0x0004},
-{0x01BC89, 0x0001},
-{0x01BC90, 0x0004},
-{0x01BC9A, 0x0001},
-{0x01BC9C, 0x0040},
-{0x01BC9D, 0x0010},
-{0x01BC9F, 0x0020},
-{0x01BCA0, 0x0080},
-{0x01BCA4, 0x0001},
-{0x01CF00, 0x0010},
-{0x01CF2E, 0x0001},
-{0x01CF30, 0x0010},
-{0x01CF47, 0x0001},
-{0x01CF50, 0x0040},
-{0x01CFC4, 0x0001},
-{0x01D000, 0x0040},
-{0x01D0F6, 0x0001},
-{0x01D100, 0x0040},
-{0x01D127, 0x0001},
-{0x01D129, 0x0040},
-{0x01D165, 0x0010},
-{0x01D16A, 0x0040},
-{0x01D16D, 0x0010},
-{0x01D173, 0x0080},
-{0x01D17B, 0x0010},
-{0x01D183, 0x0040},
-{0x01D185, 0x0010},
-{0x01D18C, 0x0040},
-{0x01D1AA, 0x0010},
-{0x01D1AE, 0x0040},
-{0x01D1EB, 0x0001},
-{0x01D200, 0x0040},
-{0x01D242, 0x0010},
-{0x01D245, 0x0040},
-{0x01D246, 0x0001},
-{0x01D2C0, 0x0002},
-{0x01D2D4, 0x0001},
-{0x01D2E0, 0x0002},
-{0x01D2F4, 0x0001},
-{0x01D300, 0x0040},
-{0x01D357, 0x0001},
-{0x01D360, 0x0002},
-{0x01D379, 0x0001},
-{0x01D400, 0x0004},
-{0x01D455, 0x0001},
-{0x01D456, 0x0004},
-{0x01D49D, 0x0001},
-{0x01D49E, 0x0004},
-{0x01D4A0, 0x0001},
-{0x01D4A2, 0x0004},
-{0x01D4A3, 0x0001},
-{0x01D4A5, 0x0004},
-{0x01D4A7, 0x0001},
-{0x01D4A9, 0x0004},
-{0x01D4AD, 0x0001},
-{0x01D4AE, 0x0004},
-{0x01D4BA, 0x0001},
-{0x01D4BB, 0x0004},
-{0x01D4BC, 0x0001},
-{0x01D4BD, 0x0004},
-{0x01D4C4, 0x0001},
-{0x01D4C5, 0x0004},
-{0x01D506, 0x0001},
-{0x01D507, 0x0004},
-{0x01D50B, 0x0001},
-{0x01D50D, 0x0004},
-{0x01D515, 0x0001},
-{0x01D516, 0x0004},
-{0x01D51D, 0x0001},
-{0x01D51E, 0x0004},
-{0x01D53A, 0x0001},
-{0x01D53B, 0x0004},
-{0x01D53F, 0x0001},
-{0x01D540, 0x0004},
-{0x01D545, 0x0001},
-{0x01D546, 0x0004},
-{0x01D547, 0x0001},
-{0x01D54A, 0x0004},
-{0x01D551, 0x0001},
-{0x01D552, 0x0004},
-{0x01D6A6, 0x0001},
-{0x01D6A8, 0x0004},
-{0x01D6C1, 0x0040},
-{0x01D6C2, 0x0004},
-{0x01D6DB, 0x0040},
-{0x01D6DC, 0x0004},
-{0x01D6FB, 0x0040},
-{0x01D6FC, 0x0004},
-{0x01D715, 0x0040},
-{0x01D716, 0x0004},
-{0x01D735, 0x0040},
-{0x01D736, 0x0004},
-{0x01D74F, 0x0040},
-{0x01D750, 0x0004},
-{0x01D76F, 0x0040},
-{0x01D770, 0x0004},
-{0x01D789, 0x0040},
-{0x01D78A, 0x0004},
-{0x01D7A9, 0x0040},
-{0x01D7AA, 0x0004},
-{0x01D7C3, 0x0040},
-{0x01D7C4, 0x0004},
-{0x01D7CC, 0x0001},
-{0x01D7CE, 0x0002},
-{0x01D800, 0x0040},
-{0x01DA00, 0x0010},
-{0x01DA37, 0x0040},
-{0x01DA3B, 0x0010},
-{0x01DA6D, 0x0040},
-{0x01DA75, 0x0010},
-{0x01DA76, 0x0040},
-{0x01DA84, 0x0010},
-{0x01DA85, 0x0040},
-{0x01DA87, 0x0020},
-{0x01DA8C, 0x0001},
-{0x01DA9B, 0x0010},
-{0x01DAA0, 0x0001},
-{0x01DAA1, 0x0010},
-{0x01DAB0, 0x0001},
-{0x01DF00, 0x0004},
-{0x01DF1F, 0x0001},
-{0x01DF25, 0x0004},
-{0x01DF2B, 0x0001},
-{0x01E000, 0x0010},
-{0x01E007, 0x0001},
-{0x01E008, 0x0010},
-{0x01E019, 0x0001},
-{0x01E01B, 0x0010},
-{0x01E022, 0x0001},
-{0x01E023, 0x0010},
-{0x01E025, 0x0001},
-{0x01E026, 0x0010},
-{0x01E02B, 0x0001},
-{0x01E030, 0x0004},
-{0x01E06E, 0x0001},
-{0x01E08F, 0x0010},
-{0x01E090, 0x0001},
-{0x01E100, 0x0004},
-{0x01E12D, 0x0001},
-{0x01E130, 0x0010},
-{0x01E137, 0x0004},
-{0x01E13E, 0x0001},
-{0x01E140, 0x0002},
-{0x01E14A, 0x0001},
-{0x01E14E, 0x0004},
-{0x01E14F, 0x0040},
-{0x01E150, 0x0001},
-{0x01E290, 0x0004},
-{0x01E2AE, 0x0010},
-{0x01E2AF, 0x0001},
-{0x01E2C0, 0x0004},
-{0x01E2EC, 0x0010},
-{0x01E2F0, 0x0002},
-{0x01E2FA, 0x0001},
-{0x01E2FF, 0x0040},
-{0x01E300, 0x0001},
-{0x01E4D0, 0x0004},
-{0x01E4EC, 0x0010},
-{0x01E4F0, 0x0002},
-{0x01E4FA, 0x0001},
-{0x01E7E0, 0x0004},
-{0x01E7E7, 0x0001},
-{0x01E7E8, 0x0004},
-{0x01E7EC, 0x0001},
-{0x01E7ED, 0x0004},
-{0x01E7EF, 0x0001},
-{0x01E7F0, 0x0004},
-{0x01E7FF, 0x0001},
-{0x01E800, 0x0004},
-{0x01E8C5, 0x0001},
-{0x01E8C7, 0x0002},
-{0x01E8D0, 0x0010},
-{0x01E8D7, 0x0001},
-{0x01E900, 0x0004},
-{0x01E944, 0x0010},
-{0x01E94B, 0x0004},
-{0x01E94C, 0x0001},
-{0x01E950, 0x0002},
-{0x01E95A, 0x0001},
-{0x01E95E, 0x0020},
-{0x01E960, 0x0001},
-{0x01EC71, 0x0002},
-{0x01ECAC, 0x0040},
-{0x01ECAD, 0x0002},
-{0x01ECB0, 0x0040},
-{0x01ECB1, 0x0002},
-{0x01ECB5, 0x0001},
-{0x01ED01, 0x0002},
-{0x01ED2E, 0x0040},
-{0x01ED2F, 0x0002},
-{0x01ED3E, 0x0001},
-{0x01EE00, 0x0004},
-{0x01EE04, 0x0001},
-{0x01EE05, 0x0004},
-{0x01EE20, 0x0001},
-{0x01EE21, 0x0004},
-{0x01EE23, 0x0001},
-{0x01EE24, 0x0004},
-{0x01EE25, 0x0001},
-{0x01EE27, 0x0004},
-{0x01EE28, 0x0001},
-{0x01EE29, 0x0004},
-{0x01EE33, 0x0001},
-{0x01EE34, 0x0004},
-{0x01EE38, 0x0001},
-{0x01EE39, 0x0004},
-{0x01EE3A, 0x0001},
-{0x01EE3B, 0x0004},
-{0x01EE3C, 0x0001},
-{0x01EE42, 0x0004},
-{0x01EE43, 0x0001},
-{0x01EE47, 0x0004},
-{0x01EE48, 0x0001},
-{0x01EE49, 0x0004},
-{0x01EE4A, 0x0001},
-{0x01EE4B, 0x0004},
-{0x01EE4C, 0x0001},
-{0x01EE4D, 0x0004},
-{0x01EE50, 0x0001},
-{0x01EE51, 0x0004},
-{0x01EE53, 0x0001},
-{0x01EE54, 0x0004},
-{0x01EE55, 0x0001},
-{0x01EE57, 0x0004},
-{0x01EE58, 0x0001},
-{0x01EE59, 0x0004},
-{0x01EE5A, 0x0001},
-{0x01EE5B, 0x0004},
-{0x01EE5C, 0x0001},
-{0x01EE5D, 0x0004},
-{0x01EE5E, 0x0001},
-{0x01EE5F, 0x0004},
-{0x01EE60, 0x0001},
-{0x01EE61, 0x0004},
-{0x01EE63, 0x0001},
-{0x01EE64, 0x0004},
-{0x01EE65, 0x0001},
-{0x01EE67, 0x0004},
-{0x01EE6B, 0x0001},
-{0x01EE6C, 0x0004},
-{0x01EE73, 0x0001},
-{0x01EE74, 0x0004},
-{0x01EE78, 0x0001},
-{0x01EE79, 0x0004},
-{0x01EE7D, 0x0001},
-{0x01EE7E, 0x0004},
-{0x01EE7F, 0x0001},
-{0x01EE80, 0x0004},
-{0x01EE8A, 0x0001},
-{0x01EE8B, 0x0004},
-{0x01EE9C, 0x0001},
-{0x01EEA1, 0x0004},
-{0x01EEA4, 0x0001},
-{0x01EEA5, 0x0004},
-{0x01EEAA, 0x0001},
-{0x01EEAB, 0x0004},
-{0x01EEBC, 0x0001},
-{0x01EEF0, 0x0040},
-{0x01EEF2, 0x0001},
-{0x01F000, 0x0040},
-{0x01F02C, 0x0001},
-{0x01F030, 0x0040},
-{0x01F094, 0x0001},
-{0x01F0A0, 0x0040},
-{0x01F0AF, 0x0001},
-{0x01F0B1, 0x0040},
-{0x01F0C0, 0x0001},
-{0x01F0C1, 0x0040},
-{0x01F0D0, 0x0001},
-{0x01F0D1, 0x0040},
-{0x01F0F6, 0x0001},
-{0x01F100, 0x0002},
-{0x01F10D, 0x0040},
-{0x01F1AE, 0x0001},
-{0x01F1E6, 0x0040},
-{0x01F203, 0x0001},
-{0x01F210, 0x0040},
-{0x01F23C, 0x0001},
-{0x01F240, 0x0040},
-{0x01F249, 0x0001},
-{0x01F250, 0x0040},
-{0x01F252, 0x0001},
-{0x01F260, 0x0040},
-{0x01F266, 0x0001},
-{0x01F300, 0x0040},
-{0x01F6D8, 0x0001},
-{0x01F6DC, 0x0040},
-{0x01F6ED, 0x0001},
-{0x01F6F0, 0x0040},
-{0x01F6FD, 0x0001},
-{0x01F700, 0x0040},
-{0x01F777, 0x0001},
-{0x01F77B, 0x0040},
-{0x01F7DA, 0x0001},
-{0x01F7E0, 0x0040},
-{0x01F7EC, 0x0001},
-{0x01F7F0, 0x0040},
-{0x01F7F1, 0x0001},
-{0x01F800, 0x0040},
-{0x01F80C, 0x0001},
-{0x01F810, 0x0040},
-{0x01F848, 0x0001},
-{0x01F850, 0x0040},
-{0x01F85A, 0x0001},
-{0x01F860, 0x0040},
-{0x01F888, 0x0001},
-{0x01F890, 0x0040},
-{0x01F8AE, 0x0001},
-{0x01F8B0, 0x0040},
-{0x01F8B2, 0x0001},
-{0x01F900, 0x0040},
-{0x01FA54, 0x0001},
-{0x01FA60, 0x0040},
-{0x01FA6E, 0x0001},
-{0x01FA70, 0x0040},
-{0x01FA7D, 0x0001},
-{0x01FA80, 0x0040},
-{0x01FA89, 0x0001},
-{0x01FA90, 0x0040},
-{0x01FABE, 0x0001},
-{0x01FABF, 0x0040},
-{0x01FAC6, 0x0001},
-{0x01FACE, 0x0040},
-{0x01FADC, 0x0001},
-{0x01FAE0, 0x0040},
-{0x01FAE9, 0x0001},
-{0x01FAF0, 0x0040},
-{0x01FAF9, 0x0001},
-{0x01FB00, 0x0040},
-{0x01FB93, 0x0001},
-{0x01FB94, 0x0040},
-{0x01FBCB, 0x0001},
-{0x01FBF0, 0x0002},
-{0x01FBFA, 0x0001},
-{0x020000, 0x0004},
-{0x02A6E0, 0x0001},
-{0x02A700, 0x0004},
-{0x02B73A, 0x0001},
-{0x02B740, 0x0004},
-{0x02B81E, 0x0001},
-{0x02B820, 0x0004},
-{0x02CEA2, 0x0001},
-{0x02CEB0, 0x0004},
-{0x02EBE1, 0x0001},
-{0x02EBF0, 0x0004},
-{0x02EE5E, 0x0001},
-{0x02F800, 0x0004},
-{0x02FA1E, 0x0001},
-{0x030000, 0x0004},
-{0x03134B, 0x0001},
-{0x031350, 0x0004},
-{0x0323B0, 0x0001},
-{0x0E0001, 0x0080},
-{0x0E0002, 0x0001},
-{0x0E0020, 0x0080},
-{0x0E0080, 0x0001},
-{0x0E0100, 0x0010},
-{0x0E01F0, 0x0001},
-{0x0F0000, 0x0080},
-{0x0FFFFE, 0x0001},
-{0x100000, 0x0080},
-{0x10FFFE, 0x0001},
-{0x110000, 0x0000},
+const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length encoding, 5 bits categ + 11 bits length
+0x03E1,
+0x001D,
+0x0055,
+0x0017,
+0x0055,
+0x0016,
+0x0012,
+0x0015,
+0x0019,
+0x0015,
+0x0011,
+0x0035,
+0x012D,
+0x0035,
+0x0059,
+0x0035,
+0x0329,
+0x0016,
+0x0015,
+0x0012,
+0x0018,
+0x0010,
+0x0018,
+0x0325,
+0x0016,
+0x0019,
+0x0012,
+0x0019,
+0x0401,
+0x001D,
+0x0015,
+0x0077,
+0x001A,
+0x0015,
+0x0018,
+0x001A,
+0x0007,
+0x0014,
+0x0019,
+0x0002,
+0x001A,
+0x0018,
+0x001A,
+0x0019,
+0x002F,
+0x0018,
+0x0005,
+0x0035,
+0x0018,
+0x000F,
+0x0007,
+0x0013,
+0x004F,
+0x0015,
+0x02C9,
+0x0019,
+0x00C9,
+0x02E5,
+0x0019,
+0x00E5,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0029,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0045,
+0x0029,
+0x0005,
+0x0009,
+0x0005,
+0x0029,
+0x0005,
+0x0049,
+0x0025,
+0x0069,
+0x0005,
+0x0029,
+0x0005,
+0x0049,
+0x0045,
+0x0029,
+0x0005,
+0x0029,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0029,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x0005,
+0x0029,
+0x0005,
+0x0049,
+0x0005,
+0x0009,
+0x0005,
+0x0029,
+0x0025,
+0x0007,
+0x0009,
+0x0045,
+0x0067,
+0x0009,
+0x0008,
+0x0005,
+0x0009,
+0x0008,
+0x0005,
+0x0009,
+0x0008,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x0008,
+0x0005,
+0x0009,
+0x0005,
+0x0049,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x00C5,
+0x0029,
+0x0005,
+0x0029,
+0x0025,
+0x0009,
+0x0005,
+0x0069,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0885,
+0x0007,
+0x0345,
+0x0226,
+0x0078,
+0x0166,
+0x01B8,
+0x0086,
+0x00D8,
+0x0006,
+0x0018,
+0x0006,
+0x0218,
+0x0DEC,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0006,
+0x0018,
+0x0009,
+0x0005,
+0x0020,
+0x0006,
+0x0045,
+0x0015,
+0x0009,
+0x0060,
+0x0038,
+0x0009,
+0x0015,
+0x0049,
+0x0000,
+0x0009,
+0x0000,
+0x0029,
+0x0005,
+0x0209,
+0x0000,
+0x0109,
+0x0445,
+0x0009,
+0x0025,
+0x0049,
+0x0045,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0085,
+0x0009,
+0x0005,
+0x0019,
+0x0009,
+0x0005,
+0x0029,
+0x0025,
+0x0649,
+0x05E5,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x001A,
+0x008C,
+0x002B,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0029,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0000,
+0x04A9,
+0x0020,
+0x0006,
+0x00B5,
+0x0505,
+0x0015,
+0x0011,
+0x0020,
+0x003A,
+0x0017,
+0x0000,
+0x058C,
+0x0011,
+0x000C,
+0x0015,
+0x002C,
+0x0015,
+0x002C,
+0x0015,
+0x000C,
+0x00E0,
+0x0347,
+0x0060,
+0x0067,
+0x0035,
+0x0140,
+0x00A2,
+0x0059,
+0x0035,
+0x0017,
+0x0035,
+0x003A,
+0x014C,
+0x0015,
+0x0002,
+0x0055,
+0x03E7,
+0x0006,
+0x0127,
+0x028C,
+0x012D,
+0x0075,
+0x0027,
+0x000C,
+0x0C47,
+0x0015,
+0x0007,
+0x00CC,
+0x0002,
+0x001A,
+0x00AC,
+0x0026,
+0x002C,
+0x001A,
+0x006C,
+0x0027,
+0x012D,
+0x0047,
+0x003A,
+0x0007,
+0x01B5,
+0x0000,
+0x0002,
+0x0007,
+0x000C,
+0x03A7,
+0x034C,
+0x0020,
+0x0B07,
+0x014C,
+0x0007,
+0x01A0,
+0x012D,
+0x0407,
+0x010C,
+0x0026,
+0x001A,
+0x0055,
+0x0006,
+0x0020,
+0x000C,
+0x0037,
+0x02A7,
+0x006C,
+0x0006,
+0x010C,
+0x0006,
+0x004C,
+0x0006,
+0x008C,
+0x0020,
+0x01D5,
+0x0000,
+0x0307,
+0x004C,
+0x0020,
+0x0015,
+0x0000,
+0x0147,
+0x0080,
+0x02E7,
+0x0018,
+0x00A7,
+0x0000,
+0x0022,
+0x00A0,
+0x00EC,
+0x0507,
+0x0006,
+0x02EC,
+0x0002,
+0x03EC,
+0x000A,
+0x06A7,
+0x000C,
+0x000A,
+0x000C,
+0x0007,
+0x004A,
+0x00EC,
+0x006A,
+0x000C,
+0x002A,
+0x0007,
+0x00CC,
+0x0127,
+0x002C,
+0x0035,
+0x012D,
+0x0015,
+0x0006,
+0x01C7,
+0x000C,
+0x002A,
+0x0000,
+0x00E7,
+0x0020,
+0x0027,
+0x0020,
+0x02A7,
+0x0000,
+0x00C7,
+0x0000,
+0x0007,
+0x0040,
+0x0067,
+0x0020,
+0x000C,
+0x0007,
+0x004A,
+0x006C,
+0x0020,
+0x002A,
+0x0020,
+0x002A,
+0x000C,
+0x0007,
+0x00E0,
+0x000A,
+0x0060,
+0x0027,
+0x0000,
+0x0047,
+0x002C,
+0x0020,
+0x012D,
+0x0027,
+0x0037,
+0x00AF,
+0x001A,
+0x0017,
+0x0007,
+0x0015,
+0x000C,
+0x0020,
+0x002C,
+0x000A,
+0x0000,
+0x00A7,
+0x0060,
+0x0027,
+0x0020,
+0x02A7,
+0x0000,
+0x00C7,
+0x0000,
+0x0027,
+0x0000,
+0x0027,
+0x0000,
+0x0027,
+0x0020,
+0x000C,
+0x0000,
+0x004A,
+0x002C,
+0x0060,
+0x002C,
+0x0020,
+0x004C,
+0x0040,
+0x000C,
+0x00C0,
+0x0067,
+0x0000,
+0x0007,
+0x00C0,
+0x012D,
+0x002C,
+0x0047,
+0x000C,
+0x0015,
+0x0120,
+0x002C,
+0x000A,
+0x0000,
+0x0107,
+0x0000,
+0x0047,
+0x0000,
+0x02A7,
+0x0000,
+0x00C7,
+0x0000,
+0x0027,
+0x0000,
+0x0087,
+0x0020,
+0x000C,
+0x0007,
+0x004A,
+0x008C,
+0x0000,
+0x002C,
+0x000A,
+0x0000,
+0x002A,
+0x000C,
+0x0020,
+0x0007,
+0x01C0,
+0x0027,
+0x002C,
+0x0020,
+0x012D,
+0x0015,
+0x0017,
+0x00C0,
+0x0007,
+0x00AC,
+0x0000,
+0x000C,
+0x002A,
+0x0000,
+0x00E7,
+0x0020,
+0x0027,
+0x0020,
+0x02A7,
+0x0000,
+0x00C7,
+0x0000,
+0x0027,
+0x0000,
+0x0087,
+0x0020,
+0x000C,
+0x0007,
+0x000A,
+0x000C,
+0x000A,
+0x006C,
+0x0020,
+0x002A,
+0x0020,
+0x002A,
+0x000C,
+0x00C0,
+0x002C,
+0x000A,
+0x0060,
+0x0027,
+0x0000,
+0x0047,
+0x002C,
+0x0020,
+0x012D,
+0x001A,
+0x0007,
+0x00AF,
+0x0120,
+0x000C,
+0x0007,
+0x0000,
+0x00A7,
+0x0040,
+0x0047,
+0x0000,
+0x0067,
+0x0040,
+0x0027,
+0x0000,
+0x0007,
+0x0000,
+0x0027,
+0x0040,
+0x0027,
+0x0040,
+0x0047,
+0x0040,
+0x0167,
+0x0060,
+0x002A,
+0x000C,
+0x002A,
+0x0040,
+0x004A,
+0x0000,
+0x004A,
+0x000C,
+0x0020,
+0x0007,
+0x00A0,
+0x000A,
+0x01A0,
+0x012D,
+0x004F,
+0x00BA,
+0x0017,
+0x001A,
+0x0080,
+0x000C,
+0x004A,
+0x000C,
+0x00E7,
+0x0000,
+0x0047,
+0x0000,
+0x02C7,
+0x0000,
+0x01E7,
+0x0020,
+0x000C,
+0x0007,
+0x004C,
+0x006A,
+0x0000,
+0x004C,
+0x0000,
+0x006C,
+0x00C0,
+0x002C,
+0x0000,
+0x0047,
+0x0020,
+0x0007,
+0x0020,
+0x0027,
+0x002C,
+0x0020,
+0x012D,
+0x00C0,
+0x0015,
+0x00CF,
+0x001A,
+0x0007,
+0x000C,
+0x002A,
+0x0015,
+0x00E7,
+0x0000,
+0x0047,
+0x0000,
+0x02C7,
+0x0000,
+0x0127,
+0x0000,
+0x0087,
+0x0020,
+0x000C,
+0x0007,
+0x000A,
+0x000C,
+0x008A,
+0x0000,
+0x000C,
+0x002A,
+0x0000,
+0x002A,
+0x002C,
+0x00C0,
+0x002A,
+0x00A0,
+0x0027,
+0x0000,
+0x0027,
+0x002C,
+0x0020,
+0x012D,
+0x0000,
+0x0027,
+0x000A,
+0x0160,
+0x002C,
+0x002A,
+0x0107,
+0x0000,
+0x0047,
+0x0000,
+0x0507,
+0x002C,
+0x0007,
+0x004A,
+0x006C,
+0x0000,
+0x004A,
+0x0000,
+0x004A,
+0x000C,
+0x0007,
+0x001A,
+0x0060,
+0x0047,
+0x000A,
+0x00CF,
+0x0047,
+0x002C,
+0x0020,
+0x012D,
+0x010F,
+0x001A,
+0x00A7,
+0x0000,
+0x000C,
+0x002A,
+0x0000,
+0x0227,
+0x0040,
+0x02E7,
+0x0000,
+0x0107,
+0x0000,
+0x0007,
+0x0020,
+0x00C7,
+0x0040,
+0x000C,
+0x0060,
+0x004A,
+0x004C,
+0x0000,
+0x000C,
+0x0000,
+0x00EA,
+0x00A0,
+0x012D,
+0x0020,
+0x002A,
+0x0015,
+0x0160,
+0x05E7,
+0x000C,
+0x0027,
+0x00CC,
+0x0060,
+0x0017,
+0x00A7,
+0x0006,
+0x00EC,
+0x0015,
+0x012D,
+0x0035,
+0x0480,
+0x0027,
+0x0000,
+0x0007,
+0x0000,
+0x0087,
+0x0000,
+0x02E7,
+0x0000,
+0x0007,
+0x0000,
+0x0127,
+0x000C,
+0x0027,
+0x010C,
+0x0007,
+0x0020,
+0x0087,
+0x0000,
+0x0006,
+0x0000,
+0x00CC,
+0x0000,
+0x012D,
+0x0020,
+0x0067,
+0x03E0,
+0x0007,
+0x005A,
+0x01D5,
+0x001A,
+0x0015,
+0x005A,
+0x002C,
+0x00BA,
+0x012D,
+0x012F,
+0x001A,
+0x000C,
+0x001A,
+0x000C,
+0x001A,
+0x000C,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x002A,
+0x00E7,
+0x0000,
+0x0467,
+0x0060,
+0x01AC,
+0x000A,
+0x008C,
+0x0015,
+0x002C,
+0x0087,
+0x014C,
+0x0000,
+0x046C,
+0x0000,
+0x00FA,
+0x000C,
+0x00BA,
+0x0000,
+0x003A,
+0x0095,
+0x007A,
+0x0035,
+0x0480,
+0x0547,
+0x002A,
+0x006C,
+0x000A,
+0x00AC,
+0x000A,
+0x002C,
+0x002A,
+0x002C,
+0x0007,
+0x012D,
+0x00B5,
+0x00A7,
+0x002A,
+0x002C,
+0x0067,
+0x004C,
+0x0007,
+0x004A,
+0x0027,
+0x00CA,
+0x0047,
+0x006C,
+0x0187,
+0x000C,
+0x002A,
+0x002C,
+0x00AA,
+0x000C,
+0x0007,
+0x000A,
+0x012D,
+0x004A,
+0x000C,
+0x003A,
+0x04A9,
+0x0000,
+0x0009,
+0x0080,
+0x0009,
+0x0020,
+0x0545,
+0x0015,
+0x0006,
+0x0045,
+0x2907,
+0x0000,
+0x0067,
+0x0020,
+0x00C7,
+0x0000,
+0x0007,
+0x0000,
+0x0067,
+0x0020,
+0x0507,
+0x0000,
+0x0067,
+0x0020,
+0x0407,
+0x0000,
+0x0067,
+0x0020,
+0x00C7,
+0x0000,
+0x0007,
+0x0000,
+0x0067,
+0x0020,
+0x01C7,
+0x0000,
+0x0707,
+0x0000,
+0x0067,
+0x0020,
+0x0847,
+0x0020,
+0x004C,
+0x0115,
+0x026F,
+0x0040,
+0x01E7,
+0x013A,
+0x00A0,
+0x0AA9,
+0x0020,
+0x00A5,
+0x0020,
+0x0011,
+0x4D67,
+0x001A,
+0x0015,
+0x0207,
+0x001D,
+0x0327,
+0x0016,
+0x0012,
+0x0040,
+0x0947,
+0x0055,
+0x004E,
+0x00E7,
+0x00C0,
+0x0227,
+0x004C,
+0x000A,
+0x0100,
+0x0247,
+0x002C,
+0x000A,
+0x0035,
+0x0100,
+0x0227,
+0x002C,
+0x0160,
+0x0187,
+0x0000,
+0x0047,
+0x0000,
+0x002C,
+0x0160,
+0x0667,
+0x002C,
+0x000A,
+0x00CC,
+0x00EA,
+0x000C,
+0x002A,
+0x014C,
+0x0055,
+0x0006,
+0x0055,
+0x0017,
+0x0007,
+0x000C,
+0x0020,
+0x012D,
+0x00A0,
+0x012F,
+0x00A0,
+0x00B5,
+0x0011,
+0x0075,
+0x004C,
+0x0002,
+0x000C,
+0x012D,
+0x00A0,
+0x0447,
+0x0006,
+0x0687,
+0x00C0,
+0x0087,
+0x002C,
+0x0427,
+0x000C,
+0x0007,
+0x0080,
+0x08A7,
+0x0120,
+0x03C7,
+0x0000,
+0x004C,
+0x006A,
+0x002C,
+0x004A,
+0x0060,
+0x002A,
+0x000C,
+0x00AA,
+0x004C,
+0x0060,
+0x001A,
+0x0040,
+0x0035,
+0x012D,
+0x03A7,
+0x0020,
+0x0087,
+0x0140,
+0x0567,
+0x0060,
+0x0327,
+0x00A0,
+0x012D,
+0x000F,
+0x0040,
+0x043A,
+0x02C7,
+0x002C,
+0x002A,
+0x000C,
+0x0020,
+0x0035,
+0x0687,
+0x000A,
+0x000C,
+0x000A,
+0x00CC,
+0x0000,
+0x000C,
+0x000A,
+0x000C,
+0x002A,
+0x00EC,
+0x00AA,
+0x012C,
+0x0020,
+0x000C,
+0x012D,
+0x00A0,
+0x012D,
+0x00A0,
+0x00D5,
+0x0006,
+0x00B5,
+0x0020,
+0x01AC,
+0x000B,
+0x01EC,
+0x0600,
+0x006C,
+0x000A,
+0x05C7,
+0x000C,
+0x000A,
+0x008C,
+0x000A,
+0x000C,
+0x008A,
+0x000C,
+0x002A,
+0x00E7,
+0x0040,
+0x012D,
+0x00D5,
+0x013A,
+0x010C,
+0x011A,
+0x0035,
+0x0000,
+0x002C,
+0x000A,
+0x03A7,
+0x000A,
+0x006C,
+0x002A,
+0x002C,
+0x000A,
+0x004C,
+0x0027,
+0x012D,
+0x0567,
+0x000C,
+0x000A,
+0x002C,
+0x004A,
+0x000C,
+0x000A,
+0x004C,
+0x002A,
+0x00E0,
+0x0075,
+0x0467,
+0x00EA,
+0x00EC,
+0x002A,
+0x002C,
+0x0040,
+0x0095,
+0x012D,
+0x0040,
+0x0047,
+0x012D,
+0x03A7,
+0x00A6,
+0x0035,
+0x0105,
+0x00C0,
+0x0549,
+0x0020,
+0x0049,
+0x00F5,
+0x00E0,
+0x004C,
+0x0015,
+0x018C,
+0x000A,
+0x00CC,
+0x0067,
+0x000C,
+0x00A7,
+0x000C,
+0x0027,
+0x000A,
+0x002C,
+0x0007,
+0x0080,
+0x0565,
+0x07C6,
+0x0185,
+0x0006,
+0x0425,
+0x0486,
+0x07EC,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0105,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0105,
+0x00E9,
+0x00A5,
+0x0020,
+0x00A9,
+0x0020,
+0x00E5,
+0x00E9,
+0x00E5,
+0x00E9,
+0x00A5,
+0x0020,
+0x00A9,
+0x0020,
+0x00E5,
+0x0000,
+0x0009,
+0x0000,
+0x0009,
+0x0000,
+0x0009,
+0x0000,
+0x0009,
+0x00E5,
+0x00E9,
+0x01A5,
+0x0020,
+0x00E5,
+0x00E8,
+0x00E5,
+0x00E8,
+0x00E5,
+0x00E8,
+0x0085,
+0x0000,
+0x0025,
+0x0069,
+0x0008,
+0x0018,
+0x0005,
+0x0058,
+0x0045,
+0x0000,
+0x0025,
+0x0069,
+0x0008,
+0x0058,
+0x0065,
+0x0020,
+0x0025,
+0x0069,
+0x0000,
+0x0058,
+0x00E5,
+0x0089,
+0x0058,
+0x0020,
+0x0045,
+0x0000,
+0x0025,
+0x0069,
+0x0008,
+0x0038,
+0x0000,
+0x015D,
+0x0082,
+0x00B1,
+0x0035,
+0x0014,
+0x0013,
+0x0016,
+0x0034,
+0x0013,
+0x0016,
+0x0014,
+0x00F5,
+0x001B,
+0x001C,
+0x0082,
+0x001D,
+0x0115,
+0x0014,
+0x0013,
+0x0075,
+0x0030,
+0x0055,
+0x0019,
+0x0016,
+0x0012,
+0x0155,
+0x0019,
+0x0015,
+0x0010,
+0x0135,
+0x001D,
+0x0082,
+0x0000,
+0x0122,
+0x000F,
+0x0006,
+0x0020,
+0x00AF,
+0x0059,
+0x0016,
+0x0012,
+0x0006,
+0x012F,
+0x0059,
+0x0016,
+0x0012,
+0x0000,
+0x0186,
+0x0040,
+0x0417,
+0x01C0,
+0x018C,
+0x006B,
+0x000C,
+0x004B,
+0x016C,
+0x01C0,
+0x003A,
+0x0009,
+0x007A,
+0x0009,
+0x003A,
+0x0005,
+0x0049,
+0x0025,
+0x0049,
+0x0005,
+0x001A,
+0x0009,
+0x003A,
+0x0019,
+0x0089,
+0x00BA,
+0x0009,
+0x001A,
+0x0009,
+0x001A,
+0x0009,
+0x001A,
+0x0069,
+0x001A,
+0x0005,
+0x0069,
+0x0005,
+0x0067,
+0x0005,
+0x003A,
+0x0025,
+0x0029,
+0x0099,
+0x0009,
+0x0065,
+0x001A,
+0x0019,
+0x003A,
+0x0005,
+0x001A,
+0x01EF,
+0x044E,
+0x0009,
+0x0005,
+0x006E,
+0x000F,
+0x003A,
+0x0060,
+0x0099,
+0x009A,
+0x0039,
+0x007A,
+0x0019,
+0x003A,
+0x0019,
+0x003A,
+0x0019,
+0x00DA,
+0x0019,
+0x03DA,
+0x0039,
+0x003A,
+0x0019,
+0x001A,
+0x0019,
+0x03DA,
+0x2179,
+0x00FA,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x027A,
+0x0039,
+0x00DA,
+0x0016,
+0x0012,
+0x0A1A,
+0x0019,
+0x03BA,
+0x0319,
+0x04FA,
+0x00B9,
+0x089A,
+0x0300,
+0x015A,
+0x0280,
+0x076F,
+0x09BA,
+0x02AF,
+0x16DA,
+0x0019,
+0x011A,
+0x0019,
+0x06BA,
+0x00F9,
+0x0DDA,
+0x0019,
+0x1EFA,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x03AF,
+0x057A,
+0x0099,
+0x0016,
+0x0012,
+0x03D9,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x01F9,
+0x1FFA,
+0x1059,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x07D9,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x03F9,
+0x0016,
+0x0012,
+0x2039,
+0x05FA,
+0x0299,
+0x003A,
+0x00B9,
+0x04DA,
+0x0020,
+0x03FA,
+0x0000,
+0x0D1A,
+0x05E9,
+0x05E5,
+0x0009,
+0x0005,
+0x0049,
+0x0025,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0069,
+0x0005,
+0x0009,
+0x0025,
+0x0009,
+0x00A5,
+0x0026,
+0x0049,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0025,
+0x00BA,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x004C,
+0x0009,
+0x0005,
+0x0080,
+0x0075,
+0x000F,
+0x0035,
+0x04A5,
+0x0000,
+0x0005,
+0x0080,
+0x0005,
+0x0020,
+0x06E7,
+0x00C0,
+0x0006,
+0x0015,
+0x01A0,
+0x000C,
+0x02C7,
+0x0100,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x03EC,
+0x0035,
+0x0014,
+0x0013,
+0x0014,
+0x0013,
+0x0055,
+0x0014,
+0x0013,
+0x0015,
+0x0014,
+0x0013,
+0x0115,
+0x0011,
+0x0035,
+0x0011,
+0x0015,
+0x0014,
+0x0013,
+0x0035,
+0x0014,
+0x0013,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0095,
+0x0006,
+0x0135,
+0x0031,
+0x0075,
+0x0011,
+0x0015,
+0x0016,
+0x0195,
+0x003A,
+0x0055,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0011,
+0x0420,
+0x033A,
+0x0000,
+0x0B1A,
+0x0160,
+0x1ABA,
+0x0320,
+0x01FA,
+0x001D,
+0x0055,
+0x001A,
+0x0006,
+0x0007,
+0x000E,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x003A,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0011,
+0x0016,
+0x0032,
+0x001A,
+0x010E,
+0x006C,
+0x002A,
+0x0011,
+0x0086,
+0x003A,
+0x004E,
+0x0006,
+0x0007,
+0x0015,
+0x003A,
+0x0000,
+0x0AA7,
+0x0020,
+0x002C,
+0x0038,
+0x0026,
+0x0007,
+0x0011,
+0x0B27,
+0x0015,
+0x0046,
+0x0007,
+0x0080,
+0x0547,
+0x0000,
+0x0BA7,
+0x0000,
+0x003A,
+0x006F,
+0x013A,
+0x03E7,
+0x047A,
+0x0140,
+0x001A,
+0x01E7,
+0x03DA,
+0x0000,
+0x012F,
+0x03BA,
+0x00EF,
+0x001A,
+0x01CF,
+0x03FA,
+0x012F,
+0x04DA,
+0x01CF,
+0x27FA,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0x37E7,
+0x07FA,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0x4287,
+0x0006,
+0x8EC7,
+0x0040,
+0x06DA,
+0x0100,
+0x04E7,
+0x00A6,
+0x0035,
+0x2167,
+0x0006,
+0x0055,
+0x01E7,
+0x012D,
+0x0027,
+0x0260,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0007,
+0x000C,
+0x004B,
+0x0015,
+0x012C,
+0x0015,
+0x0006,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0026,
+0x002C,
+0x08A7,
+0x012E,
+0x002C,
+0x00B5,
+0x00E0,
+0x02D8,
+0x0106,
+0x0038,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0045,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0006,
+0x00E5,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0029,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0006,
+0x0038,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0007,
+0x0009,
+0x0005,
+0x0009,
+0x0045,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0089,
+0x0005,
+0x0089,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x0069,
+0x0005,
+0x0009,
+0x0005,
+0x0080,
+0x0009,
+0x0005,
+0x0000,
+0x0005,
+0x0000,
+0x0005,
+0x0009,
+0x0005,
+0x0009,
+0x0005,
+0x02E0,
+0x0046,
+0x0009,
+0x0005,
+0x0007,
+0x0026,
+0x0005,
+0x00C7,
+0x000C,
+0x0047,
+0x000C,
+0x0067,
+0x000C,
+0x02C7,
+0x002A,
+0x002C,
+0x000A,
+0x007A,
+0x000C,
+0x0040,
+0x00AF,
+0x003A,
+0x0017,
+0x001A,
+0x00A0,
+0x0667,
+0x0075,
+0x00E0,
+0x002A,
+0x0627,
+0x01EA,
+0x002C,
+0x00E0,
+0x0035,
+0x012D,
+0x00A0,
+0x022C,
+0x00A7,
+0x0055,
+0x0007,
+0x0015,
+0x0027,
+0x000C,
+0x012D,
+0x0367,
+0x00EC,
+0x0035,
+0x02C7,
+0x014C,
+0x002A,
+0x0140,
+0x0015,
+0x0387,
+0x0040,
+0x004C,
+0x000A,
+0x05C7,
+0x000C,
+0x002A,
+0x006C,
+0x002A,
+0x002C,
+0x004A,
+0x0195,
+0x0000,
+0x0006,
+0x012D,
+0x0060,
+0x0035,
+0x0087,
+0x000C,
+0x0006,
+0x0107,
+0x012D,
+0x0087,
+0x0000,
+0x0507,
+0x00AC,
+0x002A,
+0x002C,
+0x002A,
+0x002C,
+0x0100,
+0x0047,
+0x000C,
+0x00E7,
+0x000C,
+0x000A,
+0x0020,
+0x012D,
+0x0020,
+0x0075,
+0x01E7,
+0x0006,
+0x00A7,
+0x005A,
+0x0007,
+0x000A,
+0x000C,
+0x000A,
+0x0627,
+0x000C,
+0x0007,
+0x004C,
+0x0027,
+0x002C,
+0x0087,
+0x002C,
+0x0007,
+0x000C,
+0x0007,
+0x02E0,
+0x0027,
+0x0006,
+0x0035,
+0x0147,
+0x000A,
+0x002C,
+0x002A,
+0x0035,
+0x0007,
+0x0026,
+0x000A,
+0x000C,
+0x0120,
+0x00A7,
+0x0020,
+0x00A7,
+0x0020,
+0x00A7,
+0x0100,
+0x00C7,
+0x0000,
+0x00C7,
+0x0000,
+0x0545,
+0x0018,
+0x0066,
+0x0105,
+0x0006,
+0x0038,
+0x0060,
+0x09E5,
+0x0447,
+0x002A,
+0x000C,
+0x002A,
+0x000C,
+0x002A,
+0x0015,
+0x000A,
+0x000C,
+0x0020,
+0x012D,
+0x00A0,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0x7467,
+0x0160,
+0x02C7,
+0x0060,
+0x0607,
+0x0060,
+0xFFE4,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0x1FE3,
+0x2DA7,
+0x0020,
+0x0D27,
+0x04A0,
+0x00C5,
+0x0160,
+0x0085,
+0x0080,
+0x0007,
+0x000C,
+0x0127,
+0x0019,
+0x0187,
+0x0000,
+0x0087,
+0x0000,
+0x0007,
+0x0000,
+0x0027,
+0x0000,
+0x0027,
+0x0000,
+0x0D67,
+0x0218,
+0x01E0,
+0x2D47,
+0x0012,
+0x0016,
+0x01FA,
+0x07E7,
+0x0020,
+0x06A7,
+0x00C0,
+0x001A,
+0x03E0,
+0x0167,
+0x0017,
+0x005A,
+0x01EC,
+0x00D5,
+0x0016,
+0x0012,
+0x0015,
+0x00A0,
+0x01EC,
+0x0015,
+0x0031,
+0x0030,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0035,
+0x0016,
+0x0012,
+0x0075,
+0x0050,
+0x0055,
+0x0000,
+0x0075,
+0x0011,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0016,
+0x0012,
+0x0055,
+0x0019,
+0x0011,
+0x0059,
+0x0000,
+0x0015,
+0x0017,
+0x0035,
+0x0060,
+0x0087,
+0x0000,
+0x10C7,
+0x0020,
+0x0002,
+0x0000,
+0x0055,
+0x0017,
+0x0055,
+0x0016,
+0x0012,
+0x0015,
+0x0019,
+0x0015,
+0x0011,
+0x0035,
+0x012D,
+0x0035,
+0x0059,
+0x0035,
+0x0329,
+0x0016,
+0x0015,
+0x0012,
+0x0018,
+0x0010,
+0x0018,
+0x0325,
+0x0016,
+0x0019,
+0x0012,
+0x0019,
+0x0016,
+0x0012,
+0x0015,
+0x0016,
+0x0012,
+0x0035,
+0x0127,
+0x0006,
+0x0587,
+0x0026,
+0x03C7,
+0x0040,
+0x00A7,
+0x0020,
+0x00A7,
+0x0020,
+0x00A7,
+0x0020,
+0x0047,
+0x0040,
+0x0037,
+0x0019,
+0x0018,
+0x001A,
+0x0037,
+0x0000,
+0x001A,
+0x0079,
+0x003A,
+0x0120,
+0x0042,
+0x003A,
+0x0020,
+0x0167,
+0x0000,
+0x0327,
+0x0000,
+0x0247,
+0x0000,
+0x0027,
+0x0000,
+0x01C7,
+0x0020,
+0x01A7,
+0x0420,
+0x0F47,
+0x0080,
+0x0055,
+0x0060,
+0x058F,
+0x0040,
+0x011A,
+0x068E,
+0x006F,
+0x021A,
+0x002F,
+0x005A,
+0x0000,
+0x019A,
+0x0040,
+0x001A,
+0x05C0,
+0x059A,
+0x000C,
+0x1020,
+0x0387,
+0x0040,
+0x0607,
+0x01C0,
+0x000C,
+0x034F,
+0x0060,
+0x03E7,
+0x006F,
+0x0100,
+0x0267,
+0x000E,
+0x00E7,
+0x000E,
+0x0080,
+0x04A7,
+0x008C,
+0x0080,
+0x03A7,
+0x0000,
+0x0015,
+0x0467,
+0x0060,
+0x00E7,
+0x0015,
+0x008E,
+0x0520,
+0x04E9,
+0x04E5,
+0x09A7,
+0x0020,
+0x012D,
+0x00A0,
+0x0469,
+0x0060,
+0x0465,
+0x0060,
+0x04E7,
+0x00E0,
+0x0667,
+0x0140,
+0x0015,
+0x0149,
+0x0000,
+0x01C9,
+0x0000,
+0x00C9,
+0x0000,
+0x0029,
+0x0000,
+0x0145,
+0x0000,
+0x01C5,
+0x0000,
+0x00C5,
+0x0000,
+0x0025,
+0x0840,
+0x26C7,
+0x0100,
+0x02A7,
+0x0120,
+0x00E7,
+0x02E0,
+0x00A6,
+0x0000,
+0x0526,
+0x0000,
+0x0106,
+0x0880,
+0x00A7,
+0x0020,
+0x0007,
+0x0000,
+0x0567,
+0x0000,
+0x0027,
+0x0040,
+0x0007,
+0x0020,
+0x02C7,
+0x0000,
+0x0015,
+0x00EF,
+0x02C7,
+0x003A,
+0x00CF,
+0x03C7,
+0x00E0,
+0x010F,
+0x05E0,
+0x0247,
+0x0000,
+0x0027,
+0x0080,
+0x008F,
+0x02A7,
+0x00AF,
+0x0040,
+0x0015,
+0x0327,
+0x0080,
+0x0015,
+0x07E0,
+0x06E7,
+0x0060,
+0x002F,
+0x0027,
+0x01EF,
+0x0020,
+0x05AF,
+0x0007,
+0x004C,
+0x0000,
+0x002C,
+0x0080,
+0x006C,
+0x0067,
+0x0000,
+0x0047,
+0x0000,
+0x0387,
+0x0020,
+0x004C,
+0x0060,
+0x000C,
+0x010F,
+0x00C0,
+0x0115,
+0x00C0,
+0x0387,
+0x002F,
+0x0015,
+0x0387,
+0x004F,
+0x03E0,
+0x00E7,
+0x001A,
+0x0367,
+0x002C,
+0x0060,
+0x008F,
+0x00D5,
+0x0100,
+0x06A7,
+0x0040,
+0x00D5,
+0x02A7,
+0x0020,
+0x00EF,
+0x0247,
+0x0080,
+0x00EF,
+0x0227,
+0x00C0,
+0x0075,
+0x0160,
+0x00CF,
+0x09E0,
+0x0907,
+0x06C0,
+0x0649,
+0x0180,
+0x0645,
+0x00C0,
+0x00AF,
+0x0467,
+0x006C,
+0x00E0,
+0x012D,
+0x24A0,
+0x03CF,
+0x0000,
+0x0527,
+0x0000,
+0x002C,
+0x0011,
+0x0020,
+0x0027,
+0x0940,
+0x004C,
+0x0387,
+0x012F,
+0x0007,
+0x00E0,
+0x02A7,
+0x014C,
+0x006F,
+0x0095,
+0x02A0,
+0x0227,
+0x006C,
+0x0075,
+0x04A0,
+0x0287,
+0x00CF,
+0x0260,
+0x02C7,
+0x0100,
+0x000A,
+0x000C,
+0x000A,
+0x0687,
+0x01CC,
+0x00D5,
+0x0060,
+0x026F,
+0x012D,
+0x000C,
+0x0027,
+0x002C,
+0x0007,
+0x0100,
+0x004C,
+0x000A,
+0x0587,
+0x004A,
+0x006C,
+0x002A,
+0x002C,
+0x0035,
+0x0002,
+0x0075,
+0x000C,
+0x0120,
+0x0002,
+0x0020,
+0x0307,
+0x00C0,
+0x012D,
+0x00A0,
+0x004C,
+0x0467,
+0x008C,
+0x000A,
+0x00EC,
+0x0000,
+0x012D,
+0x0075,
+0x0007,
+0x002A,
+0x0007,
+0x00E0,
+0x0447,
+0x000C,
+0x0035,
+0x0007,
+0x0100,
+0x002C,
+0x000A,
+0x05E7,
+0x004A,
+0x010C,
+0x002A,
+0x0067,
+0x0075,
+0x006C,
+0x0015,
+0x000A,
+0x000C,
+0x012D,
+0x0007,
+0x0015,
+0x0007,
+0x0055,
+0x0000,
+0x026F,
+0x0140,
+0x0227,
+0x0000,
+0x0307,
+0x004A,
+0x004C,
+0x002A,
+0x000C,
+0x000A,
+0x002C,
+0x00B5,
+0x000C,
+0x0027,
+0x000C,
+0x07A0,
+0x00C7,
+0x0000,
+0x0007,
+0x0000,
+0x0067,
+0x0000,
+0x01C7,
+0x0000,
+0x0127,
+0x0015,
+0x00A0,
+0x05C7,
+0x000C,
+0x004A,
+0x00EC,
+0x0080,
+0x012D,
+0x00A0,
+0x002C,
+0x002A,
+0x0000,
+0x00E7,
+0x0020,
+0x0027,
+0x0020,
+0x02A7,
+0x0000,
+0x00C7,
+0x0000,
+0x0027,
+0x0000,
+0x0087,
+0x0000,
+0x002C,
+0x0007,
+0x002A,
+0x000C,
+0x006A,
+0x0020,
+0x002A,
+0x0020,
+0x004A,
+0x0020,
+0x0007,
+0x00A0,
+0x000A,
+0x0080,
+0x0087,
+0x002A,
+0x0020,
+0x00CC,
+0x0040,
+0x008C,
+0x1140,
+0x0687,
+0x004A,
+0x00EC,
+0x002A,
+0x004C,
+0x000A,
+0x000C,
+0x0067,
+0x0095,
+0x012D,
+0x0035,
+0x0000,
+0x0015,
+0x000C,
+0x0047,
+0x03A0,
+0x05E7,
+0x004A,
+0x00AC,
+0x000A,
+0x000C,
+0x006A,
+0x002C,
+0x000A,
+0x002C,
+0x0027,
+0x0015,
+0x0007,
+0x00E0,
+0x012D,
+0x14A0,
+0x05C7,
+0x004A,
+0x006C,
+0x0020,
+0x006A,
+0x002C,
+0x000A,
+0x002C,
+0x02D5,
+0x0067,
+0x002C,
+0x0420,
+0x05E7,
+0x004A,
+0x00EC,
+0x002A,
+0x000C,
+0x000A,
+0x002C,
+0x0055,
+0x0007,
+0x0140,
+0x012D,
+0x00A0,
+0x0195,
+0x0240,
+0x0547,
+0x000C,
+0x000A,
+0x000C,
+0x002A,
+0x00AC,
+0x000A,
+0x000C,
+0x0007,
+0x0015,
+0x00A0,
+0x012D,
+0x06A0,
+0x0347,
+0x0020,
+0x004C,
+0x002A,
+0x006C,
+0x000A,
+0x008C,
+0x0060,
+0x012D,
+0x002F,
+0x0055,
+0x001A,
+0x00C7,
+0x1700,
+0x0567,
+0x004A,
+0x010C,
+0x000A,
+0x002C,
+0x0015,
+0x0C60,
+0x03E9,
+0x03E5,
+0x012D,
+0x010F,
+0x0160,
+0x00E7,
+0x0020,
+0x0007,
+0x0020,
+0x00E7,
+0x0000,
+0x0027,
+0x0000,
+0x02E7,
+0x00AA,
+0x0000,
+0x002A,
+0x0020,
+0x002C,
+0x000A,
+0x000C,
+0x0007,
+0x000A,
+0x0007,
+0x000A,
+0x000C,
+0x0055,
+0x0100,
+0x012D,
+0x08A0,
+0x00E7,
+0x0020,
+0x04C7,
+0x004A,
+0x006C,
+0x0020,
+0x002C,
+0x006A,
+0x000C,
+0x0007,
+0x0015,
+0x0007,
+0x000A,
+0x0340,
+0x0007,
+0x012C,
+0x04E7,
+0x00AC,
+0x000A,
+0x0007,
+0x006C,
+0x00F5,
+0x000C,
+0x00E0,
+0x0007,
+0x00AC,
+0x002A,
+0x004C,
+0x05A7,
+0x018C,
+0x000A,
+0x002C,
+0x0055,
+0x0007,
+0x0095,
+0x0180,
+0x0907,
+0x00C0,
+0x0135,
+0x1EA0,
+0x0107,
+0x0000,
+0x0487,
+0x000A,
+0x00CC,
+0x0000,
+0x00AC,
+0x000A,
+0x000C,
+0x0007,
+0x0095,
+0x0120,
+0x012D,
+0x024F,
+0x0040,
+0x0035,
+0x03A7,
+0x0020,
+0x02AC,
+0x0000,
+0x000A,
+0x00CC,
+0x000A,
+0x002C,
+0x000A,
+0x002C,
+0x0900,
+0x00C7,
+0x0000,
+0x0027,
+0x0000,
+0x04A7,
+0x00AC,
+0x0040,
+0x000C,
+0x0000,
+0x002C,
+0x0000,
+0x00CC,
+0x0007,
+0x000C,
+0x00E0,
+0x012D,
+0x00A0,
+0x00A7,
+0x0000,
+0x0027,
+0x0000,
+0x03E7,
+0x008A,
+0x0000,
+0x002C,
+0x0000,
+0x002A,
+0x000C,
+0x000A,
+0x000C,
+0x0007,
+0x00C0,
+0x012D,
+0x26A0,
+0x0247,
+0x002C,
+0x002A,
+0x0035,
+0x00C0,
+0x002C,
+0x0007,
+0x000A,
+0x0187,
+0x0000,
+0x0427,
+0x002A,
+0x008C,
+0x0040,
+0x002A,
+0x000C,
+0x000A,
+0x000C,
+0x0195,
+0x012D,
+0x0AA0,
+0x0007,
+0x01C0,
+0x028F,
+0x00FA,
+0x0077,
+0x021A,
+0x0180,
+0x0015,
+0x7327,
+0x0CA0,
+0x0DCE,
+0x0000,
+0x0095,
+0x0140,
+0x1867,
+0xFFE0,
+0x4960,
+0x0C07,
+0x0035,
+0x0180,
+0x85E7,
+0x01E2,
+0x000C,
+0x00A7,
+0x01CC,
+0xFFE0,
+0xF520,
+0x48C7,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0x3700,
+0x4707,
+0x00C0,
+0x03C7,
+0x0000,
+0x012D,
+0x0060,
+0x0035,
+0x09C7,
+0x0000,
+0x012D,
+0x00A0,
+0x03A7,
+0x0020,
+0x008C,
+0x0015,
+0x0120,
+0x05E7,
+0x00CC,
+0x0095,
+0x007A,
+0x0066,
+0x0015,
+0x001A,
+0x0120,
+0x012D,
+0x0000,
+0x00CF,
+0x0000,
+0x0287,
+0x0080,
+0x0247,
+0x55E0,
+0x03E9,
+0x03E5,
+0x02CF,
+0x0075,
+0x0C80,
+0x0947,
+0x0060,
+0x000C,
+0x0007,
+0x06CA,
+0x00C0,
+0x006C,
+0x0186,
+0x07E0,
+0x0026,
+0x0015,
+0x0006,
+0x000C,
+0x0140,
+0x002A,
+0x01A0,
+0xFFE7,
+0xFFE7,
+0xFEE7,
+0x00E0,
+0x9AA7,
+0x0520,
+0x0107,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0x5CC0,
+0x0066,
+0x0000,
+0x00C6,
+0x0000,
+0x0026,
+0x0000,
+0x2447,
+0x01C0,
+0x0007,
+0x0380,
+0x0047,
+0x0020,
+0x0007,
+0x01A0,
+0x0067,
+0x00E0,
+0x3167,
+0xFFE0,
+0x2060,
+0x0D47,
+0x0080,
+0x0187,
+0x0040,
+0x0107,
+0x00C0,
+0x0127,
+0x0020,
+0x001A,
+0x002C,
+0x0015,
+0x0062,
+0xFFE0,
+0xFFE0,
+0x4B60,
+0x05AC,
+0x0020,
+0x02CC,
+0x0100,
+0x0E7A,
+0x0760,
+0x1EBA,
+0x0120,
+0x04DA,
+0x0020,
+0x077A,
+0x002A,
+0x004C,
+0x005A,
+0x00AA,
+0x00E2,
+0x00EC,
+0x003A,
+0x00CC,
+0x03BA,
+0x006C,
+0x079A,
+0x0280,
+0x083A,
+0x004C,
+0x001A,
+0x0F20,
+0x026F,
+0x0160,
+0x026F,
+0x0160,
+0x0ADA,
+0x0100,
+0x030F,
+0x10C0,
+0x0329,
+0x0325,
+0x0329,
+0x00C5,
+0x0000,
+0x0225,
+0x0329,
+0x0325,
+0x0009,
+0x0000,
+0x0029,
+0x0020,
+0x0009,
+0x0020,
+0x0029,
+0x0020,
+0x0069,
+0x0000,
+0x00E9,
+0x0065,
+0x0000,
+0x0005,
+0x0000,
+0x00C5,
+0x0000,
+0x0145,
+0x0329,
+0x0325,
+0x0029,
+0x0000,
+0x0069,
+0x0020,
+0x00E9,
+0x0000,
+0x00C9,
+0x0000,
+0x0325,
+0x0029,
+0x0000,
+0x0069,
+0x0000,
+0x0089,
+0x0000,
+0x0009,
+0x0040,
+0x00C9,
+0x0000,
+0x0325,
+0x0329,
+0x0325,
+0x0329,
+0x0325,
+0x0329,
+0x0325,
+0x0329,
+0x0325,
+0x0329,
+0x0325,
+0x0329,
+0x0365,
+0x0020,
+0x0309,
+0x0019,
+0x0305,
+0x0019,
+0x00A5,
+0x0309,
+0x0019,
+0x0305,
+0x0019,
+0x00A5,
+0x0309,
+0x0019,
+0x0305,
+0x0019,
+0x00A5,
+0x0309,
+0x0019,
+0x0305,
+0x0019,
+0x00A5,
+0x0309,
+0x0019,
+0x0305,
+0x0019,
+0x00A5,
+0x0009,
+0x0005,
+0x0020,
+0x062D,
+0x3FFA,
+0x06CC,
+0x007A,
+0x062C,
+0x00FA,
+0x000C,
+0x01BA,
+0x000C,
+0x003A,
+0x0095,
+0x01C0,
+0x008C,
+0x0000,
+0x01CC,
+0x89E0,
+0x0125,
+0x0007,
+0x0265,
+0x00A0,
+0x00A5,
+0x1A80,
+0x00CC,
+0x0000,
+0x020C,
+0x0020,
+0x00CC,
+0x0000,
+0x002C,
+0x0000,
+0x008C,
+0x0080,
+0x07A6,
+0x0400,
+0x000C,
+0x0DE0,
+0x0587,
+0x0040,
+0x00CC,
+0x00C6,
+0x0020,
+0x012D,
+0x0060,
+0x0007,
+0x001A,
+0x27E0,
+0x03A7,
+0x000C,
+0x0200,
+0x0567,
+0x006C,
+0x012D,
+0x0080,
+0x0017,
+0x39E0,
+0x0347,
+0x0006,
+0x006C,
+0x012D,
+0x5CA0,
+0x00C7,
+0x0000,
+0x0067,
+0x0000,
+0x0027,
+0x0000,
+0x01C7,
+0x0000,
+0x1887,
+0x0020,
+0x010F,
+0x00CC,
+0x0500,
+0x0429,
+0x0425,
+0x00CC,
+0x0006,
+0x0060,
+0x012D,
+0x0060,
+0x0035,
+0x6200,
+0x074F,
+0x001A,
+0x004F,
+0x0017,
+0x006F,
+0x0960,
+0x058F,
+0x001A,
+0x01CF,
+0x1820,
+0x0067,
+0x0000,
+0x0347,
+0x0000,
+0x0027,
+0x0000,
+0x0007,
+0x0020,
+0x0007,
+0x0000,
+0x0127,
+0x0000,
+0x0067,
+0x0000,
+0x0007,
+0x0000,
+0x0007,
+0x00A0,
+0x0007,
+0x0060,
+0x0007,
+0x0000,
+0x0007,
+0x0000,
+0x0007,
+0x0000,
+0x0047,
+0x0000,
+0x0027,
+0x0000,
+0x0007,
+0x0020,
+0x0007,
+0x0000,
+0x0007,
+0x0000,
+0x0007,
+0x0000,
+0x0007,
+0x0000,
+0x0007,
+0x0000,
+0x0027,
+0x0000,
+0x0007,
+0x0020,
+0x0067,
+0x0000,
+0x00C7,
+0x0000,
+0x0067,
+0x0000,
+0x0067,
+0x0000,
+0x0007,
+0x0000,
+0x0127,
+0x0000,
+0x0207,
+0x0080,
+0x0047,
+0x0000,
+0x0087,
+0x0000,
+0x0207,
+0x0660,
+0x0039,
+0x21A0,
+0x057A,
+0x0060,
+0x0C7A,
+0x0160,
+0x01DA,
+0x0020,
+0x01DA,
+0x0000,
+0x01DA,
+0x0000,
+0x049A,
+0x0120,
+0x018F,
+0x141A,
+0x06E0,
+0x039A,
+0x0180,
+0x057A,
+0x0060,
+0x011A,
+0x00C0,
+0x003A,
+0x01A0,
+0x00BA,
+0x1320,
+0x1F5A,
+0x0098,
+0x5AFA,
+0x0060,
+0x021A,
+0x0040,
+0x019A,
+0x0040,
+0x0EDA,
+0x0060,
+0x0BDA,
+0x00A0,
+0x017A,
+0x0060,
+0x001A,
+0x01C0,
+0x017A,
+0x0060,
+0x06FA,
+0x00E0,
+0x013A,
+0x00A0,
+0x04FA,
+0x00E0,
+0x03BA,
+0x0020,
+0x003A,
+0x09A0,
+0x2A7A,
+0x0160,
+0x01BA,
+0x0020,
+0x019A,
+0x0040,
+0x011A,
+0x00C0,
+0x05BA,
+0x0000,
+0x00DA,
+0x00E0,
+0x01BA,
+0x0060,
+0x011A,
+0x00C0,
+0x011A,
+0x00C0,
+0x125A,
+0x0000,
+0x06DA,
+0x0480,
+0x012D,
+0x80A0,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xDBE7,
+0x03E0,
+0xFFE7,
+0xFFE7,
+0x0727,
+0x00A0,
+0x1BA7,
+0x0020,
+0xFFE7,
+0xFFE7,
+0xD027,
+0x01A0,
+0xFFE7,
+0xFFE7,
+0xFFE7,
+0xA607,
+0x01C0,
+0x4DA7,
+0xFFE0,
+0x3420,
+0x43A7,
+0xBC20,
+0xFFE7,
+0xFFE7,
+0x6947,
+0x0080,
+0xFFE7,
+0xFFE7,
+0x0BE7,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0x8A00,
+0x0002,
+0x03A0,
+0x0BE2,
+0x0FE0,
+0x1DEC,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xFFE0,
+0xC1E0,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFA3,
+0x0020,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFE3,
+0xFFA3,
+0x0020,
 };
 
 const std::unordered_set<uint32_t> unicode_set_whitespace = {
@@ -7030,3 +9274,4 @@ const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
 {0x02FA1C, 0x02FA1C, 0x009F3B},
 {0x02FA1D, 0x02FA1D, 0x02A600},
 };
+
diff --git a/src/unicode-data.h b/src/unicode-data.h
index e27fe1770710a..cd6a6451a278f 100644
--- a/src/unicode-data.h
+++ b/src/unicode-data.h
@@ -13,7 +13,7 @@ struct range_nfd {
 
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
-extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::vector<uint16_t> unicode_rle_codepoints_categs;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;

From 2636cb61703d5984dc94d3d757d0c506ec783846 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 23:19:42 +0200
Subject: [PATCH 05/29] Decode unicode data categories

---
 scripts/gen-unicode-data.py |  3 +-
 src/unicode.cpp             | 77 ++++++++++++++++++-------------------
 src/unicode.h               | 26 +++++++++++++
 3 files changed, 65 insertions(+), 41 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index 55ac0af12c29f..542a9edbac582 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -49,6 +49,7 @@ def unicode_data_iter():
         yield (cpt, cpt_lower, cpt_upper, categ, bidir)
 
 
+# see codepoint_categ::from_index() in unicode.h
 UNICODE_CATEGORY_TO_INDEX = {
     "Cn":  0,  # \p{Cn} Undefined
     "Cc":  1,  # \p{Cc} Control
@@ -123,7 +124,7 @@ def unicode_data_iter():
 table_nfd.sort()
 
 
-# run length encoding
+# run length encoding, see unicode_cpt_category() in unicode.cpp
 assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)
 codepoint_categs_runs = [codepoint_categs[0]]  # 5 bits categ + 11 bits length
 for cpt, categ in enumerate(codepoint_categs[1:], 1):
diff --git a/src/unicode.cpp b/src/unicode.cpp
index e05fb9d1775dd..a78c59f740348 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -113,38 +113,6 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 //    return result;
 //}
 
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
-    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
-
-    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
-    for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags[i-1];  // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags[i];    // codepoint_end, flags
-        for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
-            cpt_flags[cpt] = range_ini.second;
-        }
-    }
-
-    for (auto cpt : unicode_set_whitespace) {
-        cpt_flags[cpt].is_whitespace = true;
-    }
-
-    for (auto p : unicode_map_lowercase) {
-        cpt_flags[p.second].is_lowercase = true;
-    }
-
-    for (auto p : unicode_map_uppercase) {
-        cpt_flags[p.second].is_uppercase = true;
-    }
-
-    for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
-        cpt_flags[range.nfd].is_nfd = true;
-    }
-
-    return cpt_flags;
-}
-
 static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
     std::unordered_map<uint8_t, std::string> map;
     for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
@@ -606,19 +574,48 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     return result;
 }
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-    static const auto cpt_flags = unicode_cpt_flags_array();
-    return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+codepoint_categ unicode_cpt_category(const uint32_t cp) {  // category + flags of cp; default codepoint_categ{} when cp is out of range
+    static const std::vector<codepoint_categ> cpt_categs = [] {  // lookup table, decoded once on first call
+        std::vector<codepoint_categ> cpt_categs(MAX_CODEPOINTS, codepoint_categ::UNDEF);
+        uint32_t cpt = 0;
+        for (uint16_t rle : unicode_rle_codepoints_categs) {
+            const uint32_t index = rle & 31;  // low 5 bits: category index
+            const uint32_t count = rle >> 5;  // high 11 bits: run length minus one
+            const auto categ = codepoint_categ::from_index(index);
+            //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
+            for (uint32_t i = 0; i <= count; ++i) {
+                cpt_categs[cpt++] = categ;
+            }
+        }
+        assert (cpt == MAX_CODEPOINTS);
+        // NOTE: the outer 'cpt' is now MAX_CODEPOINTS (one past the end); never index the table with it below
+        for (auto cpt : unicode_set_whitespace) {
+            cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+        }
+        // flag the mapping target (p.second), i.e. the lowercase form
+        for (auto p : unicode_map_lowercase) {
+            cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);
+        }
+        // flag the mapping target (p.second), i.e. the uppercase form
+        for (auto p : unicode_map_uppercase) {
+            cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);
+        }
+
+        //for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
+        //    cpt_categs[range.nfd].set_flag(codepoint_categ::NORM_NFD);
+        //}
+
+        return cpt_categs;
+    }();
+    return cp < cpt_categs.size() ? cpt_categs[cp] : codepoint_categ{};
 }
 
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+codepoint_categ unicode_cpt_category(const std::string & utf8) {  // category of the codepoint decoded at offset 0 of utf8
     if (utf8.empty()) {
-        return undef;  // undefined
+        return codepoint_categ{};  // undefined
     }
     size_t offset = 0;
-    return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+    return unicode_cpt_category(unicode_cpt_from_utf8(utf8, offset));  // only the first decoded codepoint is looked up
 }
 
 std::string unicode_byte_to_utf8(uint8_t byte) {
diff --git a/src/unicode.h b/src/unicode.h
index f9f4fcc8cc7a0..e8928f261445d 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -3,6 +3,8 @@
 #include <cstdint>
 #include <string>
 #include <vector>
+#include <array>
+#include <map>
 
 struct codepoint_categ {
     enum _category : uint16_t {
@@ -59,6 +61,18 @@ struct codepoint_categ {
 
     inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
 
+    static codepoint_categ from_index(int index) {  // maps the 5-bit index stored in the RLE table to a category; see UNICODE_CATEGORY_TO_INDEX in scripts/gen-unicode-data.py
+        static const std::array<codepoint_categ, 32> table = {
+            UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF
+        };
+        return (size_t)index < table.size() ? table[index] : table[0];  // the size_t cast sends negative indices to the table[0] (UNDEF) fallback as well
+    }
+
+    inline void set_flag(_flags flags, bool value = true) {  // sets (value == true) or clears (value == false) the given flag bits in 'encoded'
+        flags = (_flags) (flags & ~SUBMASK);  // ignore category bits
+        encoded = value ? (encoded | flags) : (encoded & ~flags);
+    }
+
     inline uint8_t get_category() const { return encoded & MASK; }
     inline uint8_t get_subcategory() const { return encoded & SUBMASK; }
 
@@ -107,6 +121,18 @@ struct codepoint_categ {
     inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; }
     inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; }
 
+    const char * c_str() const {
+        static const std::map<uint16_t, const char *> map = {
+            {UNDEF, "UNDEF"}, {C, "C"}, {L, "L"}, {M, "M"}, {N, "N"}, {P, "P"}, {S, "S"}, {Z, "Z"},
+            {Cc, "Cc"}, {Cf, "Cf"}, {Co, "Co"}, {Cs, "Cs"}, {Ll, "Ll"}, {Lm, "Lm"}, {Lo, "Lo"}, {Lt, "Lt"},
+            {Lu, "Lu"}, {Mc, "Mc"}, {Me, "Me"}, {Mn, "Mn"}, {Nd, "Nd"}, {Nl, "Nl"}, {No, "No"}, {Pc, "Pc"},
+            {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"},
+            {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"},
+        };
+        const auto it = map.find(encoded & SUBMASK);
+        return it == map.end() ? "INVALID" : it->second;
+    }
+
     uint16_t encoded;
 };
 

From 23cf064e3bcd1ca6502484c47b23cb48f4cb4321 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 23:28:05 +0200
Subject: [PATCH 06/29] Replace 'codepoint_flags' with 'codepoint_categ'

---
 src/llama.cpp   | 12 +++----
 src/unicode.cpp | 84 +++++++++++++++++++++++++------------------------
 src/unicode.h   |  8 +++++
 3 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7d68ed8111873..e8dcc9ff348da 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15836,22 +15836,22 @@ struct llm_tokenizer_wpm {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto categ = unicode_cpt_category(cpt);
 
-            if (flags.is_whitespace) {
+            if (categ.is_whitespace()) {
                 if (words.back().size()) {  // finish previous word if any
                     words.emplace_back();
                 }
                 continue;
             }
 
-            assert (!flags.is_separator);
-            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+            assert (!categ.is_Z());  // was is_separator, i.e. \p{Z}; \p{S} symbols are valid here and handled below
+            if (cpt == 0 || cpt == 0xFFFD || categ.is_C()) {
                 continue;
             }
 
             const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
-            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+            if (categ.is_P() || (cpt < 0x7F && categ.is_S()) || is_chinese_char(cpt)) {
                 if (words.back().size()) {  // finish previous word if any
                     words.emplace_back();
                 }
@@ -15869,7 +15869,7 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    static bool is_chinese_char(uint32_t cpt) {
+    static bool is_chinese_char(uint32_t cpt) {  //TODO: move to unicode-data.cpp? unicode_cpt_category(cpt).is_chinese()?
         return
             (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
             (cpt >= 0x03400 && cpt <= 0x04DBF) ||
diff --git a/src/unicode.cpp b/src/unicode.cpp
index a78c59f740348..4c33359743dcc 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -203,8 +203,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+        static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
+        auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
         };
 
         size_t _prev_end = offset_ini;
@@ -226,7 +227,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
 
         for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
             const uint32_t cpt = _get_cpt(pos);
-            const auto flags = _get_flags(pos);
+            const auto categ = _get_categ(pos);
 
             // regex: 's|'t|'re|'ve|'m|'ll|'d
             if (cpt == '\'' && pos+1 < offset_end) {
@@ -246,37 +247,37 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
                 }
             }
 
-            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
+            auto categ2 = (cpt == ' ' ? _get_categ(pos+1) : categ);
             // regex: <space>?\p{L}+
-            if (flags2.is_letter) {
+            if (categ2.is_L()) {
                 pos += (cpt == ' ');
-                while (flags2.is_letter) {
-                    flags2 = _get_flags(++pos);
+                while (categ2.is_L()) {
+                    categ2 = _get_categ(++pos);
                 }
                 _add_token(pos);
                 continue;
             }
             // regex: <space>?\p{N}+
-            if (flags2.is_number) {
+            if (categ2.is_N()) {
                 pos += (cpt == ' ');
-                while (flags2.is_number) {
-                    flags2 = _get_flags(++pos);
+                while (categ2.is_N()) {
+                    categ2 = _get_categ(++pos);
                 }
                 _add_token(pos);
                 continue;
             }
             // regex: <space>?[^\s\p{L}\p{N}]+
-            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+            if (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) {
                 pos += (cpt == ' ');
-                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
-                    flags2 = _get_flags(++pos);
+                while (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) {
+                    categ2 = _get_categ(++pos);
                 }
                 _add_token(pos);
                 continue;
             }
 
             size_t num_whitespaces = 0;
-            while (_get_flags(pos+num_whitespaces).is_whitespace) {
+            while (_get_categ(pos+num_whitespaces).is_whitespace()) {
                 num_whitespaces++;
             }
 
@@ -321,8 +322,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+        static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
+        auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
         };
 
         size_t _prev_end = offset_ini;
@@ -344,7 +346,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
         for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
             const uint32_t cpt = _get_cpt(pos);
-            const auto flags = _get_flags(pos);
+            const auto categ = _get_categ(pos);
 
             // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
             if (cpt == '\'' && pos+1 < offset_end) {
@@ -365,10 +367,10 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             }
 
             // regex: [^\r\n\p{L}\p{N}]?\p{L}+
-            if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
-                if (flags.is_letter || _get_flags(pos+1).is_letter) {  // one or more letters
+            if (!(cpt == '\r' || cpt == '\n' || categ.is_N())) {
+                if (categ.is_L() || _get_categ(pos+1).is_L()) {  // one or more letters
                     pos++;
-                    while (_get_flags(pos).is_letter) {
+                    while (_get_categ(pos).is_L()) {
                         pos++;
                     }
                     _add_token(pos);
@@ -377,9 +379,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             }
 
             // regex: \p{N}{1,3}
-            if (flags.is_number) {
+            if (categ.is_N()) {
                 size_t ini = pos;
-                while (_get_flags(pos).is_number) {
+                while (_get_categ(pos).is_N()) {
                     if (++pos - ini >= 3 ) {
                         _add_token(pos);
                         ini = pos;
@@ -390,11 +392,11 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             }
 
             // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
-            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
-            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
+            auto categ2 = (cpt == ' ' ? _get_categ(pos+1) : categ);
+            if (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) {
                 pos += (cpt == ' ');
-                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
-                    flags2 = _get_flags(++pos);
+                while (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) {
+                    categ2 = _get_categ(++pos);
                 }
                 uint32_t cpt2 = _get_cpt(pos);
                 while (cpt2 == '\r' || cpt2 == '\n') {
@@ -406,7 +408,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
             size_t num_whitespaces = 0;
             size_t last_end_r_or_n = 0;
-            while (_get_flags(pos+num_whitespaces).is_whitespace) {
+            while (_get_categ(pos+num_whitespaces).is_whitespace()) {
                 uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
                 if (cpt2 == '\r' || cpt2 == '\n') {
                     last_end_r_or_n = pos + num_whitespaces + 1;
@@ -636,21 +638,21 @@ uint32_t unicode_tolower(uint32_t cp) {
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
-        { "\\p{N}", codepoint_flags::NUMBER },
-        { "\\p{L}", codepoint_flags::LETTER },
-        { "\\p{P}", codepoint_flags::PUNCTUATION },
+        { "\\p{N}", codepoint_categ::N },
+        { "\\p{L}", codepoint_categ::L },
+        { "\\p{P}", codepoint_categ::P },
     };
 
     static const std::map<int, int> k_ucat_cpt = {
-        { codepoint_flags::NUMBER,        0xD1 },
-        { codepoint_flags::LETTER,        0xD2 },
-        { codepoint_flags::PUNCTUATION,   0xD3 },
+        { codepoint_categ::N, 0xD1 },
+        { codepoint_categ::L, 0xD2 },
+        { codepoint_categ::P, 0xD3 },
     };
 
     static const std::map<int, std::string> k_ucat_map = {
-        { codepoint_flags::NUMBER,        "\x30-\x39" }, // 0-9
-        { codepoint_flags::LETTER,        "\x41-\x5A\x61-\x7A" }, // A-Za-z
-        { codepoint_flags::PUNCTUATION,   "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { codepoint_categ::N, "\x30-\x39" }, // 0-9
+        { codepoint_categ::L, "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { codepoint_categ::P, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
     };
 
     // compute collapsed codepoints only if needed by at least one regex
@@ -681,14 +683,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 continue;
             }
 
-            const auto flags = unicode_cpt_flags(cpts[i]);
+            const auto categ = unicode_cpt_category(cpts[i]);
 
-            if (flags.is_whitespace) {
+            if (categ.is_whitespace()) {
                 //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
                 //text_collapsed[i] = (char) 0x85;  // <Next Line> as whitespace fallback
                 text_collapsed[i] = (char) 0x0B;    // <vertical tab> as whitespace fallback
-            } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
-                text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
+            } else if (k_ucat_cpt.find(categ.get_category()) != k_ucat_cpt.end()) {
+                text_collapsed[i] = k_ucat_cpt.at(categ.get_category());
             } else {
                 text_collapsed[i] = (char) 0xD0; // fallback
             }
@@ -777,7 +779,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
                 std::wstring wtext(cpts.begin(), cpts.end());
                 for (size_t i = 0; i < wtext.size(); ++i) {
-                    if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+                    if (wtext[i] > 0x7F && unicode_cpt_category(wtext[i]).is_whitespace()) {
                         wtext[i] = 0x0B;
                     }
                 }
diff --git a/src/unicode.h b/src/unicode.h
index e8928f261445d..339ef2893a4e9 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -121,6 +121,14 @@ struct codepoint_categ {
     inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; }
     inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; }
 
+    inline bool operator == (const codepoint_categ other) const {
+        return encoded == other.encoded;
+    }
+
+    inline bool operator != (const codepoint_categ other) const {
+        return encoded != other.encoded;
+    }
+
     const char * c_str() const {
         static const std::map<uint16_t, const char *> map = {
             {UNDEF, "UNDEF"}, {C, "C"}, {L, "L"}, {M, "M"}, {N, "N"}, {P, "P"}, {S, "S"}, {Z, "Z"},

From ecebfc0c718c81b7f0d6d6552e81c71fe9bf2053 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Fri, 26 Jul 2024 00:16:24 +0200
Subject: [PATCH 07/29] Update unicode data: sorted whitespaces

---
 scripts/gen-unicode-data.py | 2 +-
 src/unicode-data.cpp        | 3 +--
 src/unicode-data.h          | 3 +--
 src/unicode.cpp             | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index 542a9edbac582..d774fcabe9481 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -170,7 +170,7 @@ def out(line=""):
     out("0x%04X," % rle)
 out("};\n")
 
-out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
+out("const std::vector<uint32_t> unicode_vec_whitespace = {")
 for codepoint in table_whitespace:
     out("0x%06X," % codepoint)
 out("};\n")
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 4a0c0547c7d03..2591723ce3172 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -5,7 +5,6 @@
 #include <cstdint>
 #include <vector>
 #include <unordered_map>
-#include <unordered_set>
 
 const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length encoding, 5 bits categ + 11 bits length
 0x03E1,
@@ -4527,7 +4526,7 @@ const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length enc
 0x0020,
 };
 
-const std::unordered_set<uint32_t> unicode_set_whitespace = {
+const std::vector<uint32_t> unicode_vec_whitespace = {
 0x000009,
 0x00000A,
 0x00000B,
diff --git a/src/unicode-data.h b/src/unicode-data.h
index cd6a6451a278f..682f79c373749 100644
--- a/src/unicode-data.h
+++ b/src/unicode-data.h
@@ -3,7 +3,6 @@
 #include <cstdint>
 #include <vector>
 #include <unordered_map>
-#include <unordered_set>
 
 struct range_nfd {
     uint32_t first;
@@ -14,7 +13,7 @@ struct range_nfd {
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
 extern const std::vector<uint16_t> unicode_rle_codepoints_categs;
-extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::vector<uint32_t> unicode_vec_whitespace;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
 extern const std::vector<range_nfd> unicode_ranges_nfd;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 4c33359743dcc..dd413c8092a54 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -591,7 +591,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
         }
         assert (cpt == MAX_CODEPOINTS);
 
-        for (auto cpt : unicode_set_whitespace) {
+        for (auto cpt : unicode_vec_whitespace) {
             cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
         }
 

From 8c8e1afaaece651ef50d926d8fcf997dc98c8263 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Fri, 26 Jul 2024 00:18:16 +0200
Subject: [PATCH 08/29] Fix codepoint_categ return types

---
 src/unicode.h | 86 +++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/src/unicode.h b/src/unicode.h
index 339ef2893a4e9..4ea8f19472ebc 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -73,53 +73,53 @@ struct codepoint_categ {
         encoded = value ? (encoded | flags) : (encoded & ~flags);
     }
 
-    inline uint8_t get_category() const { return encoded & MASK; }
-    inline uint8_t get_subcategory() const { return encoded & SUBMASK; }
+    inline uint16_t get_category() const { return encoded & MASK; }
+    inline uint16_t get_subcategory() const { return encoded & SUBMASK; }
 
     inline bool is_undefined() const { return !encoded; }
     inline bool is_defined() const { return encoded; }
 
-    inline auto is_whitespace() const { return encoded & WHITESPACE; }
-    inline auto is_lowercase()  const { return encoded & LOWERCASE; }
-    inline auto is_uppercase()  const { return encoded & UPPERCASE; }
-
-    inline auto is_C() const { return encoded & C; }
-    inline auto is_L() const { return encoded & L; }
-    inline auto is_M() const { return encoded & M; }
-    inline auto is_N() const { return encoded & N; }
-    inline auto is_P() const { return encoded & P; }
-    inline auto is_S() const { return encoded & S; }
-    inline auto is_Z() const { return encoded & Z; }
-
-    inline auto is_Cc() const { return (encoded & SUBMASK) == Cc; }
-    inline auto is_Cf() const { return (encoded & SUBMASK) == Cf; }
-    inline auto is_Co() const { return (encoded & SUBMASK) == Co; }
-    inline auto is_Cs() const { return (encoded & SUBMASK) == Cs; }
-    inline auto is_Ll() const { return (encoded & SUBMASK) == Ll; }
-    inline auto is_Lm() const { return (encoded & SUBMASK) == Lm; }
-    inline auto is_Lo() const { return (encoded & SUBMASK) == Lo; }
-    inline auto is_Lt() const { return (encoded & SUBMASK) == Lt; }
-    inline auto is_Lu() const { return (encoded & SUBMASK) == Lu; }
-    inline auto is_Mc() const { return (encoded & SUBMASK) == Mc; }
-    inline auto is_Me() const { return (encoded & SUBMASK) == Me; }
-    inline auto is_Mn() const { return (encoded & SUBMASK) == Mn; }
-    inline auto is_Nd() const { return (encoded & SUBMASK) == Nd; }
-    inline auto is_Nl() const { return (encoded & SUBMASK) == Nl; }
-    inline auto is_No() const { return (encoded & SUBMASK) == No; }
-    inline auto is_Pc() const { return (encoded & SUBMASK) == Pc; }
-    inline auto is_Pd() const { return (encoded & SUBMASK) == Pd; }
-    inline auto is_Pe() const { return (encoded & SUBMASK) == Pe; }
-    inline auto is_Pf() const { return (encoded & SUBMASK) == Pf; }
-    inline auto is_Pi() const { return (encoded & SUBMASK) == Pi; }
-    inline auto is_Po() const { return (encoded & SUBMASK) == Po; }
-    inline auto is_Ps() const { return (encoded & SUBMASK) == Ps; }
-    inline auto is_Sc() const { return (encoded & SUBMASK) == Sc; }
-    inline auto is_Sk() const { return (encoded & SUBMASK) == Sk; }
-    inline auto is_Sm() const { return (encoded & SUBMASK) == Sm; }
-    inline auto is_So() const { return (encoded & SUBMASK) == So; }
-    inline auto is_Zl() const { return (encoded & SUBMASK) == Zl; }
-    inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; }
-    inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; }
+    inline uint16_t is_whitespace() const { return encoded & WHITESPACE; }
+    inline uint16_t is_lowercase()  const { return encoded & LOWERCASE; }
+    inline uint16_t is_uppercase()  const { return encoded & UPPERCASE; }
+
+    inline uint16_t is_C() const { return encoded & C; }
+    inline uint16_t is_L() const { return encoded & L; }
+    inline uint16_t is_M() const { return encoded & M; }
+    inline uint16_t is_N() const { return encoded & N; }
+    inline uint16_t is_P() const { return encoded & P; }
+    inline uint16_t is_S() const { return encoded & S; }
+    inline uint16_t is_Z() const { return encoded & Z; }
+
+    inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; }
+    inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; }
+    inline bool is_Co() const { return (encoded & SUBMASK) == Co; }
+    inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; }
+    inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; }
+    inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; }
+    inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; }
+    inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; }
+    inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; }
+    inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; }
+    inline bool is_Me() const { return (encoded & SUBMASK) == Me; }
+    inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; }
+    inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; }
+    inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; }
+    inline bool is_No() const { return (encoded & SUBMASK) == No; }
+    inline bool is_Pc() const { return (encoded & SUBMASK) == Pc; }
+    inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; }
+    inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; }
+    inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; }
+    inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; }
+    inline bool is_Po() const { return (encoded & SUBMASK) == Po; }
+    inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; }
+    inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; }
+    inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; }
+    inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; }
+    inline bool is_So() const { return (encoded & SUBMASK) == So; }
+    inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; }
+    inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; }
+    inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; }
 
     inline bool operator == (const codepoint_categ other) const {
         return encoded == other.encoded;

From 8f7d56ec5b0c4fc5c42265f6071f6fb6977ec856 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Fri, 26 Jul 2024 00:26:42 +0200
Subject: [PATCH 09/29] Add unicode_data helper functions

---
 src/unicode.h | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/src/unicode.h b/src/unicode.h
index 4ea8f19472ebc..0b8243ccd4ac9 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -1,6 +1,8 @@
 #pragma once
 
 #include <cstdint>
+#include <cassert>
+#include <cstring>
 #include <string>
 #include <vector>
 #include <array>
@@ -61,13 +63,6 @@ struct codepoint_categ {
 
     inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
 
-    static codepoint_categ from_index(int index) {
-        static const std::array<codepoint_categ, 32> table = {
-            UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF
-        };
-        return (size_t)index < table.size() ? table[index] : table[0];
-    }
-
     inline void set_flag(_flags flags, bool value = true) {
         flags = (_flags) (flags & ~SUBMASK);  // ignore category bits
         encoded = value ? (encoded | flags) : (encoded & ~flags);
@@ -141,6 +136,34 @@ struct codepoint_categ {
         return it == map.end() ? "INVALID" : it->second;
     }
 
+    static codepoint_categ from_index(int index) {
+        static const std::array<codepoint_categ, 32> table = {
+            UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF
+        };
+        return (size_t)index < table.size() ? table[index] : table[0];
+    }
+
+    static codepoint_categ from_chars(const char categ, const char subcateg = '\0') {
+        auto _subindex = [] (const char subcateg, const char subcategs[]) -> uint16_t {
+            if (!subcateg) {
+                return 0;
+            }
+            const char * p = strchr(subcategs, subcateg);
+            return p ? (p - subcategs + 1) : 0;
+        };
+        switch(categ) {
+            case 'C':  if(subcateg == 'n') return 0;  // undefined
+                       return C | (_subindex(subcateg, "cfos"   ) << 7);
+            case 'L':  return L | (_subindex(subcateg, "lmotu"  ) << 7);
+            case 'M':  return M | (_subindex(subcateg, "cen"    ) << 7);
+            case 'N':  return N | (_subindex(subcateg, "dlo"    ) << 7);
+            case 'P':  return P | (_subindex(subcateg, "cdefios") << 7);
+            case 'S':  return S | (_subindex(subcateg, "ckmo"   ) << 7);
+            case 'Z':  return Z | (_subindex(subcateg, "lps"    ) << 7);
+            default:   assert (false);  return 0;
+        }
+    };
+
     uint16_t encoded;
 };
 

From 1cd7ac090b08765133b5de1bacf0b58b66738d0f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Fri, 26 Jul 2024 00:43:43 +0200
Subject: [PATCH 10/29] Reimplement 'collapsed' unicode categories:

- Add all unicode categories.
- Fix \s with non-ASCII problem.
---
 src/unicode.cpp | 395 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 274 insertions(+), 121 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index dd413c8092a54..68cadf0c49075 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -636,67 +636,39 @@ uint32_t unicode_tolower(uint32_t cp) {
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
-    // unicode categories
-    static const std::map<std::string, int> k_ucat_enum = {
-        { "\\p{N}", codepoint_categ::N },
-        { "\\p{L}", codepoint_categ::L },
-        { "\\p{P}", codepoint_categ::P },
-    };
+    //TODO: update and add more comments
+    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
 
-    static const std::map<int, int> k_ucat_cpt = {
-        { codepoint_categ::N, 0xD1 },
-        { codepoint_categ::L, 0xD2 },
-        { codepoint_categ::P, 0xD3 },
+    // 0xDB80 to 0xDBFF: Private Use High Surrogate (128 range values)
+    static const uint32_t COLLAPSE_CPT_RANGE_FIRST = 0xDB80;
+    static const uint32_t COLLAPSE_CPT_RANGE_LAST  = 0xDBFF;
+    auto category_to_collapsed_cpt = [] (const codepoint_categ categ) {
+        const uint16_t subindex = categ.get_subcategory() >> 7;  // subcategory stored in 3 bits
+        switch(categ.get_category()) {                           // category fits in other 3 bits
+            case codepoint_categ::UNDEF: return COLLAPSE_CPT_RANGE_FIRST + ((0 << 3) | subindex);
+            case codepoint_categ::C:     return COLLAPSE_CPT_RANGE_FIRST + ((1 << 3) | subindex);
+            case codepoint_categ::L:     return COLLAPSE_CPT_RANGE_FIRST + ((2 << 3) | subindex);
+            case codepoint_categ::M:     return COLLAPSE_CPT_RANGE_FIRST + ((3 << 3) | subindex);
+            case codepoint_categ::N:     return COLLAPSE_CPT_RANGE_FIRST + ((4 << 3) | subindex);
+            case codepoint_categ::P:     return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
+            case codepoint_categ::S:     return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
+            case codepoint_categ::Z:     return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
+            default:  assert (false);    return COLLAPSE_CPT_RANGE_FIRST;
+        }
     };
-
-    static const std::map<int, std::string> k_ucat_map = {
-        { codepoint_categ::N, "\x30-\x39" }, // 0-9
-        { codepoint_categ::L, "\x41-\x5A\x61-\x7A" }, // A-Za-z
-        { codepoint_categ::P, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+    auto category_to_collapsed_range = [&] (const codepoint_categ categ) {
+        // \p{Ll} --> \p{Ll} to \p{Ll}  // has subcategory ? yes
+        // \p{Lu} --> \p{Lu} to \p{Lu}  // has subcategory ? yes
+        // \p{L}  --> \p{Ll} to \p{Lu}  // has subcategory ? no
+        assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
+        const uint32_t collapsed = category_to_collapsed_cpt(categ);
+        const uint32_t range = (collapsed & 0b111) ? 0 : 0b111;  // has subcategory ?
+        return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
     };
 
-    // compute collapsed codepoints only if needed by at least one regex
-    bool need_collapse = false;
-    for (auto & regex_expr : regex_exprs) {
-        // search for unicode categories
-        for (const auto & ucat : k_ucat_enum) {
-            if (std::string::npos != regex_expr.find(ucat.first)) {
-                need_collapse = true;
-                break;
-            }
-        }
-    }
-
     const auto cpts = unicode_cpts_from_utf8(text);
 
-    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
-    std::string text_collapsed;
-    if (need_collapse) {
-        // collapse all unicode categories
-        text_collapsed.resize(cpts.size());
-
-        for (size_t i = 0; i < cpts.size(); ++i) {
-            // keep single-byte codepoints as is
-            if (cpts[i] < 128) {
-                text_collapsed[i] = cpts[i];
-                continue;
-            }
-
-            const auto categ = unicode_cpt_category(cpts[i]);
-
-            if (categ.is_whitespace()) {
-                //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
-                //text_collapsed[i] = (char) 0x85;  // <Next Line> as whitespace fallback
-                text_collapsed[i] = (char) 0x0B;    // <vertical tab> as whitespace fallback
-            } else if (k_ucat_cpt.find(categ.get_category()) != k_ucat_cpt.end()) {
-                text_collapsed[i] = k_ucat_cpt.at(categ.get_category());
-            } else {
-                text_collapsed[i] = (char) 0xD0; // fallback
-            }
-        }
-    }
-
     std::vector<size_t> bpe_offsets = { cpts.size() };
 
     for (auto & regex_expr : regex_exprs) {
@@ -708,91 +680,272 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             continue;
         }
 
-        // fallback to general-purpose std::regex / std::wregex
-        try {
-            // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
-            // with the corresponding collapsed representation
-            bool use_collapsed = false;
-            for (auto & ucat : k_ucat_enum) {
-                if (std::string::npos != regex_expr.find(ucat.first)) {
-                    use_collapsed = true;
-                    break;
-                }
+        std::vector<std::pair<uint32_t, uint32_t>> regex_expr_ranges;        // start codepoint, last codepoint
+        std::vector<std::pair<uint32_t, codepoint_categ>> regex_expr_categs; // offset, codepoint category
+        std::map<uint16_t, std::wstring> map_categ_wregex;                   // categ --> regex utf32 string
+        std::wstring wregex_collapsed;
+        std::wstring wtext_collapsed;
+        bool inside_square = false;
+        bool is_cpt_range  = false;
+
+        // common ranges: \w \d
+        regex_expr_ranges.emplace_back('a', 'z');
+        regex_expr_ranges.emplace_back('A', 'Z');
+        regex_expr_ranges.emplace_back('0', '9');
+        regex_expr_ranges.emplace_back('_', '_');
+        // common ranges: \s
+        for (uint32_t cpt : unicode_vec_whitespace) {
+            const auto categ_prev = unicode_cpt_category(regex_expr_ranges.back().second);
+            const auto categ_last = unicode_cpt_category(cpt);
+            if (categ_prev == categ_last && regex_expr_ranges.back().second + 1 == cpt) {
+                regex_expr_ranges.back().second = cpt;
+            } else {
+                regex_expr_ranges.emplace_back(cpt, cpt);
             }
+        }
 
-            if (use_collapsed) {
-                // sanity-check that the original regex does not contain any non-ASCII characters
-                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
-                for (size_t i = 0; i < cpts_regex.size(); ++i) {
-                    if (cpts_regex[i] >= 128) {
-                        throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
+        // std::wregex \s does not match non-ASCII whitespaces
+        static const codepoint_categ categ_whitespace(codepoint_categ::MASK + 1);  // UNDEF category, subcategory 1
+        std::wstring & wregex_whitespaces = map_categ_wregex[categ_whitespace.get_subcategory()];
+        wregex_whitespaces += L"\\s";
+        for (uint32_t cpt : unicode_vec_whitespace) {
+            if (cpt >= 0x80) {  // non-ASCII whitespaces
+                if (wregex_whitespaces.back() + 1 == cpt) {
+                    if (*(wregex_whitespaces.end() - 2) == '-') {
+                        wregex_whitespaces.back() = cpt;
+                    } else {
+                        wregex_whitespaces += '-';
+                        wregex_whitespaces += cpt;
                     }
+                } else {
+                    wregex_whitespaces += cpt;
                 }
+            }
+        }
 
-                // generate a collapsed representation of the regex
-                std::string regex_expr_collapsed;
+        const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
 
-                // track if we are inside [], because nested [] are not allowed
-                bool inside = false;
-                for (size_t i = 0; i < regex_expr.size(); ++i) {
-                    if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
-                        regex_expr_collapsed += '[';
-                        inside = true;
-                        continue;
-                    }
+        for (size_t i = 0; i < cpts_regex.size(); ++i) {
+            uint32_t cpt = cpts_regex[i];
 
-                    if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
-                        regex_expr_collapsed += ']';
-                        inside = false;
+            if (inside_square) {
+                switch(cpt) {
+                    case '^':
+                        if (cpts_regex[i - 1] != '[') {
+                            break;
+                        }
                         continue;
-                    }
-
-                    if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
-                        regex_expr[i + 1] == 'p' &&
-                        regex_expr[i + 2] == '{' &&
-                        regex_expr[i + 4] == '}') {
-                        const std::string pat = regex_expr.substr(i, 5);
-                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
-                            if (!inside) {
-                                regex_expr_collapsed += '[';
-                            }
-                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
-                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
-                            if (!inside) {
-                                regex_expr_collapsed += ']';
+                    case ']':
+                        inside_square = false;
+                        continue;
+                    case '-':
+                        is_cpt_range = true;
+                        continue;
+                }
+            } else {
+                switch(cpt) {
+                    case '^':
+                        if (i > 0) {
+                            break;
+                        }
+                        continue;
+                    case '$':
+                        if (i + 1 < cpts_regex.size()) {
+                            break;
+                        }
+                        continue;
+                    case '[':
+                        inside_square = true;
+                        continue;
+                    case '{':
+                        while (cpt && cpt != '}') {
+                            cpt = cpts_regex[++i];
+                        }
+                        continue;
+                    case '}':
+                    case ']':
+                        assert (false);
+                    case '(':
+                        if (cpts_regex[i + 1] == '?') {  // (?: (?i: (?= (?! (?<= (?<!
+                            if (cpts_regex[i + 2] == ':') {
+                                i += 2;
+                            } else if (cpts_regex[i + 2] == 'i') {
+                                i += 3;
+                                assert (cpts_regex[i] == ':');
+                            } else {
+                                i += 2 + (cpts_regex[i + 2] == '<');
+                                assert (cpts_regex[i] == '=' || cpts_regex[i] == '!');
                             }
-                            i += 4;
-                            continue;
                         }
-                    }
+                        continue;
+                    case ')':
+                    case '|':
+                    case '.':
+                    case '?':
+                    case '+':
+                    case '*':
+                        continue;
+                }
+            }
+
+            if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
+                assert (cpts_regex[i + 3] && cpts_regex[i + 4]);
+                codepoint_categ categ = {};
+                if (cpts_regex[i + 4] == '}') {
+                    categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]);
+                } else {
+                    categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
+                    assert (cpts_regex[i + 5] == '}');
+                }
+                categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
+                regex_expr_categs.emplace_back(i, categ);
+                i += cpts_regex[i + 4] == '}' ? 4 : 5;
+                continue;
+            }
+
+            if (cpt == '\\') {
+                if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') {  // \s \S
+                    regex_expr_categs.emplace_back(i, categ_whitespace);
+                    //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
+                    regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
+                    i += 1;
+                    continue;
+                }
+            }
+
+            if (cpt == '\\') {  
+                switch (cpts_regex[i + 1]) {
+                    case 's':  ++i;  continue;  // \s whitespaces
+                    case 'w':  ++i;  continue;  // \w words
+                    case 'd':  ++i;  continue;  // \d digits
+                    case 'S':  ++i;  continue;  // \S no whitespaces
+                    case 'W':  ++i;  continue;  // \W no words
+                    case 'D':  ++i;  continue;  // \D no digits
+                    case 't':  ++i;  cpt = '\t';  break;
+                    case 'r':  ++i;  cpt = '\r';  break;
+                    case 'n':  ++i;  cpt = '\n';  break;
+                    case 'x':  assert (false);    break;  //TODO: hex values
+                    case 'u':  assert (false);    break;  //TODO: unicode values
+                    case 'U':  assert (false);    break;  //TODO: unicode values
+                    default:  // escaped character
+                        assert (!is_cpt_range);
+                        cpt = cpts_regex[++i];
+                        assert (cpt < 0x80);
+                    break;
+                }
+            }
+
+            assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
+
+            if (is_cpt_range) {
+                is_cpt_range = false;
+                regex_expr_ranges.back().second = cpt;
+            } else {
+                regex_expr_ranges.emplace_back(cpt, cpt);
+            }
+        }
 
-                    regex_expr_collapsed += regex_expr[i];
+        // assign collapsed codepoint to each category regex \p{...}
+        for (auto offset_categ : regex_expr_categs) {
+            const uint16_t subcateg = offset_categ.second.get_subcategory();
+            auto it = map_categ_wregex.find(subcateg);
+            if (it == map_categ_wregex.end()) {
+                const auto collapsed_range = category_to_collapsed_range(offset_categ.second);
+                map_categ_wregex[subcateg] = (wchar_t) collapsed_range.first;
+                if (collapsed_range.first < collapsed_range.second) {
+                    map_categ_wregex[subcateg] += (wchar_t) '-';
+                    map_categ_wregex[subcateg] += (wchar_t) collapsed_range.second;
                 }
+            }
+        }
 
-                //printf("text_collapsed: %s\n", text_collapsed.c_str());
-                //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
-                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
+        // copy found regex ranges to each category regex
+        uint32_t regex_expr_ranges_uniques = 0;
+        std::pair<uint32_t, uint32_t> prev_range = {0, -1};
+        std::sort(regex_expr_ranges.begin(), regex_expr_ranges.end());
+        for (auto range : regex_expr_ranges) {
+            range.first = std::max(range.first, prev_range.second + 1);  // prevent overlapping  //TODO: as error?
+            if (range.first > range.second) {  // skip overlapping and repetitions
+                continue;
+            }
+            codepoint_categ categ = unicode_cpt_category(range.first);
+            assert (categ == unicode_cpt_category(range.second));
+            auto it0 = map_categ_wregex.find(categ.get_category());
+            auto it1 = map_categ_wregex.find(categ.get_subcategory());
+            for (const auto & it : {it0, it1}) {
+                if (it != map_categ_wregex.end()) {
+                    it->second += (wchar_t) range.first;
+                    if (range.first < range.second) {
+                        it->second += (wchar_t) '-';
+                        it->second += (wchar_t) range.second;
+                    } 
+                }
+            }
+            prev_range = range;
+            regex_expr_ranges[regex_expr_ranges_uniques++] = range;
+        }
+        regex_expr_ranges.resize(regex_expr_ranges_uniques);
+
+        // replace categories with respective collapsed codepoint and ranges
+        uint32_t i = 0;
+        wregex_collapsed.reserve(regex_expr.size());
+        for (auto offset_categ : regex_expr_categs) {
+            while (i < offset_categ.first) {  // copy original regex until reaching the category
+                wregex_collapsed += (wchar_t) cpts_regex[i];
+                i++;
+            }
+            assert (cpts_regex[i] == '\\');
+            const uint32_t cpt_next = cpts_regex[i + 1];
+            const bool is_negated = cpt_next < 'a';  // is uppercase
+            if (cpt_next == 'p' || cpt_next == 'P') {
+                assert (cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
+                i += cpts_regex[i + 4] == '}' ? 5 : 6;
+                assert (cpts_regex[i - 1] == '}');
             } else {
-                // no unicode category used, we can use std::wregex directly
-                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
-
-                // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
-                std::wstring wtext(cpts.begin(), cpts.end());
-                for (size_t i = 0; i < wtext.size(); ++i) {
-                    if (wtext[i] > 0x7F && unicode_cpt_category(wtext[i]).is_whitespace()) {
-                        wtext[i] = 0x0B;
+                assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' ||  // \s \w \d
+                        cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D');   // \S \W \D
+                i += 2;
+            }
+            const codepoint_categ categ = offset_categ.second;
+            auto it = map_categ_wregex.find(categ.get_subcategory());
+            assert (it != map_categ_wregex.end());
+            if (it != map_categ_wregex.end()) {
+                if (categ.is_whitespace()) {  // inside square brackets  //NOTE: reusing flag WHITESPACE
+                    assert (is_negated == false);
+                    wregex_collapsed += it->second;
+                } else if(it->second.size() == 1 && !is_negated) {
+                    wregex_collapsed += it->second;
+                } else {
+                    wregex_collapsed += '[';
+                    if (is_negated) {
+                        wregex_collapsed += '^';
                     }
+                    wregex_collapsed += it->second;
+                    wregex_collapsed += ']';
                 }
+            }
+        }
+        while (i < (uint32_t)cpts_regex.size()) {
+            wregex_collapsed += cpts_regex[i];
+            i++;
+        }
 
-                //printf("text: %s\n", text.c_str());
-                //printf("regex_expr: %s\n", regex_expr.c_str());
-                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
+        // collapse text codepoints not included in 'regex_expr_ranges'
+        wtext_collapsed.reserve(cpts.size());
+        for (uint32_t cpt : cpts) {
+            const codepoint_categ categ = unicode_cpt_category(cpt);
+            auto it = std::lower_bound(regex_expr_ranges.begin(), regex_expr_ranges.end(), cpt,
+                [] (const std::pair<uint32_t, uint32_t> range, const uint32_t cpt) {
+                    return range.second < cpt;
+                }
+            );
+            if (it == regex_expr_ranges.end() || cpt < it->first || it->second < cpt) {
+                cpt = category_to_collapsed_cpt(categ);  // not found, collapse to category codepoint
             }
-        } catch (std::regex_error & e) {
-            fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
-            fprintf(stderr, "Regex error: %s\n", e.what());
-            throw std::runtime_error("Failed to process regex");
+            wtext_collapsed += (wchar_t) cpt;
         }
+
+        bpe_offsets = unicode_regex_split_stl(wtext_collapsed, wregex_collapsed, bpe_offsets);
     }
 
     std::vector<std::string> bpe_words;

From aeac3421329ef2332fc08b2805a629a42f1bde6b Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sun, 4 Aug 2024 23:22:56 +0200
Subject: [PATCH 11/29] Add more comments

---
 src/unicode.cpp | 59 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 68cadf0c49075..f5d1496488a12 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -636,13 +636,47 @@ uint32_t unicode_tolower(uint32_t cp) {
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
-    //TODO: update and add more comments
-    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // std::regex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
+    // std::regex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
+    // Generate a "collapsed" representation of the regex, where all unicode categories are replaced by codepoints ranges.
+    // Generate a "collapsed" representation of the text, where all codepoints are forced to fall into generated category ranges.
+    //  Text codepoints not found in generated category ranges are replaced by a "collapsed" codepoint.
+    // This implementation generalizes the original implementation adding support to unicode subcategories:
+    //  https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+
+    // Definitions:
+    // - Unicode category: high unicode categories, \p{C}, \p{L}, \p{M}, \p{N}, \p{P}, \p{S}, \p{Z}.
+    // - Unicode subcategory: including all unicode categories, \p{Cc}, \p{Cf}, \p{Co}, \p{Cs}, ..., \p{Zs}.
+    // - Collapsed codepoint: unused codepoint representing a unicode subcategory.
+    // - Collapsed range: sequence of "collapsed" codepoints, representing one unicode category.
+    // - Collapsed regex: original regex including "collapsed" codepoints and ranges.
+
+    // (1)     Build the "collapsed" regex:
+    // (1.1)     Generate a replacement list of codepoint ranges:
+    // (1.1.1)     For each unicode category.
+    // (1.1.2)     For each unicode subcategory.
+    // (1.1.3)     Expand \s adding unicode whitespaces.
+    // (1.2)     Each list includes its respective "collapsed" codepoint/range.
+    // (1.3)     [Optimization] Only build lists of categories present in the regex.
+    // (1.4)     Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
+    // (2)     Build a list of codepoint ranges.
+    // (2.1)     If a codepoint is not found in this list, then it is "collapsable".
+    // (2.2)     [Optimization] Only build lists of ranges present in the regex.
+    // (3)     For each input text:
+    // (3.1)     Search codepoints in the regex codepoint ranges.
+    // (3.2)     If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy.
+    // (3.3)     If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it.
+
+    //TODO: Refactor optimizations
+    // Steps (1) and (2) only depends on the regex expression text.
+    // Step (3) needs 'regex_expr_ranges' for text "collapsing" and 'wregex_collapsed'.
+    // Optimization: store and reuse 'wregex_collapsed' and 'regex_expr_ranges'.
 
     // 0xDB80 to 0xDBFF: Private Use High Surrogate (128 range values)
     static const uint32_t COLLAPSE_CPT_RANGE_FIRST = 0xDB80;
     static const uint32_t COLLAPSE_CPT_RANGE_LAST  = 0xDBFF;
+
+    // return the collapsed codepoint of an unicode category or subcategory
     auto category_to_collapsed_cpt = [] (const codepoint_categ categ) {
         const uint16_t subindex = categ.get_subcategory() >> 7;  // subcategory stored in 3 bits
         switch(categ.get_category()) {                           // category fits in other 3 bits
@@ -657,6 +691,8 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             default:  assert (false);    return COLLAPSE_CPT_RANGE_FIRST;
         }
     };
+
+    // return the collapsed range of an unicode category (range including all subcategories)
     auto category_to_collapsed_range = [&] (const codepoint_categ categ) {
         // \p{Ll} --> \p{Ll} to \p{Ll}  // has subcategory ? yes
         // \p{Lu} --> \p{Lu} to \p{Lu}  // has subcategory ? yes
@@ -688,11 +724,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         bool inside_square = false;
         bool is_cpt_range  = false;
 
+        // (2) Build a list of codepoint ranges
         // common ranges: \w \d
         regex_expr_ranges.emplace_back('a', 'z');
         regex_expr_ranges.emplace_back('A', 'Z');
         regex_expr_ranges.emplace_back('0', '9');
         regex_expr_ranges.emplace_back('_', '_');
+
+        // (2) Build a list of codepoint ranges
         // common ranges: \s
         for (uint32_t cpt : unicode_vec_whitespace) {
             const auto categ_prev = unicode_cpt_category(regex_expr_ranges.back().second);
@@ -704,6 +743,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             }
         }
 
+        // (1.1.3) Expand \s adding unicode whitespaces.
         // std::wregex \s does not match non-ASCII whitespaces
         static const codepoint_categ categ_whitespace(codepoint_categ::MASK + 1);  // UNDEF category, subcategory 1
         std::wstring & wregex_whitespaces = map_categ_wregex[categ_whitespace.get_subcategory()];
@@ -728,6 +768,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         for (size_t i = 0; i < cpts_regex.size(); ++i) {
             uint32_t cpt = cpts_regex[i];
 
+            // skip regex metacharacters
             if (inside_square) {
                 switch(cpt) {
                     case '^':
@@ -788,6 +829,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 }
             }
 
+            // parse unicode categories and subcategories
             if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
                 assert (cpts_regex[i + 3] && cpts_regex[i + 4]);
                 codepoint_categ categ = {};
@@ -797,6 +839,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
                     assert (cpts_regex[i + 5] == '}');
                 }
+                // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
                 categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
                 regex_expr_categs.emplace_back(i, categ);
                 i += cpts_regex[i + 4] == '}' ? 4 : 5;
@@ -805,6 +848,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
             if (cpt == '\\') {
                 if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') {  // \s \S
+                    // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
                     regex_expr_categs.emplace_back(i, categ_whitespace);
                     //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
                     regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
@@ -813,6 +857,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 }
             }
 
+            // parse more metcharacters and espaped characters
             if (cpt == '\\') {  
                 switch (cpts_regex[i + 1]) {
                     case 's':  ++i;  continue;  // \s whitespaces
@@ -835,8 +880,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 }
             }
 
+            // ensure there is not a collission with any "collapsed" codepoints
             assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
 
+            // (2) Build a list of codepoint ranges
             if (is_cpt_range) {
                 is_cpt_range = false;
                 regex_expr_ranges.back().second = cpt;
@@ -850,6 +897,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             const uint16_t subcateg = offset_categ.second.get_subcategory();
             auto it = map_categ_wregex.find(subcateg);
             if (it == map_categ_wregex.end()) {
+                // (1.2) Each list includes its respective "collapsed" codepoint/range.
                 const auto collapsed_range = category_to_collapsed_range(offset_categ.second);
                 map_categ_wregex[subcateg] = (wchar_t) collapsed_range.first;
                 if (collapsed_range.first < collapsed_range.second) {
@@ -868,6 +916,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             if (range.first > range.second) {  // skip overlapping and repetitions
                 continue;
             }
+            // (1.1) Generate a replacement list of codepoint ranges
             codepoint_categ categ = unicode_cpt_category(range.first);
             assert (categ == unicode_cpt_category(range.second));
             auto it0 = map_categ_wregex.find(categ.get_category());
@@ -906,6 +955,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                         cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D');   // \S \W \D
                 i += 2;
             }
+            // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
             const codepoint_categ categ = offset_categ.second;
             auto it = map_categ_wregex.find(categ.get_subcategory());
             assert (it != map_categ_wregex.end());
@@ -934,14 +984,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         wtext_collapsed.reserve(cpts.size());
         for (uint32_t cpt : cpts) {
             const codepoint_categ categ = unicode_cpt_category(cpt);
+            // (3.1) Search codepoints in the regex codepoint ranges.
             auto it = std::lower_bound(regex_expr_ranges.begin(), regex_expr_ranges.end(), cpt,
                 [] (const std::pair<uint32_t, uint32_t> range, const uint32_t cpt) {
                     return range.second < cpt;
                 }
             );
             if (it == regex_expr_ranges.end() || cpt < it->first || it->second < cpt) {
+                // (3.3) If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it.
                 cpt = category_to_collapsed_cpt(categ);  // not found, collapse to category codepoint
             }
+            // (3.2) If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy.
             wtext_collapsed += (wchar_t) cpt;
         }
 

From 85c59df9ce6c560fbd01b42be91b366f6e79a9e3 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 20:52:25 +0200
Subject: [PATCH 12/29] minor: remove trailing whitespaces and extra semicolons

---
 src/unicode.cpp | 4 ++--
 src/unicode.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 5a2c9bb8a4e51..7cd479450b8af 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -864,7 +864,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             }
 
             // parse more metcharacters and espaped characters
-            if (cpt == '\\') {  
+            if (cpt == '\\') {
                 switch (cpts_regex[i + 1]) {
                     case 's':  ++i;  continue;  // \s whitespaces
                     case 'w':  ++i;  continue;  // \w words
@@ -933,7 +933,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     if (range.first < range.second) {
                         it->second += (wchar_t) '-';
                         it->second += (wchar_t) range.second;
-                    } 
+                    }
                 }
             }
             prev_range = range;
diff --git a/src/unicode.h b/src/unicode.h
index 536e80ef16693..75cdb3f4a596f 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -162,7 +162,7 @@ struct codepoint_categ {
             case 'Z':  return Z | (_subindex(subcateg, "lps"    ) << 7);
             default:   assert (false);  return 0;
         }
-    };
+    }
 
     uint16_t encoded;
 };

From 735105edf9290566d50c194f76408f424b4dad6f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 20:54:30 +0200
Subject: [PATCH 13/29] Use GGML_ASSERT and GGML_ABORT

---
 src/unicode.cpp | 54 ++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 7cd479450b8af..a5a377b39f567 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,10 +2,10 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#include "ggml.h"
 #include "unicode.h"
 #include "unicode-data.h"
 
-#include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <map>
@@ -201,7 +201,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     for (auto offset : offsets) {
         const size_t offset_ini = start;
         const size_t offset_end = start + offset;
-        assert(offset_end <= cpts.size());
+        GGML_ASSERT(offset_end <= cpts.size());
         start = offset_end;
 
         static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
@@ -216,7 +216,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
 
         size_t _prev_end = offset_ini;
         auto _add_token = [&] (const size_t end) -> size_t {
-            assert(_prev_end <= end && end <= offset_end);
+            GGML_ASSERT(_prev_end <= end && end <= offset_end);
             size_t len = end - _prev_end;
             if (len > 0) {
                 bpe_offsets.push_back(len);
@@ -320,7 +320,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     for (auto offset : offsets) {
         const size_t offset_ini = start;
         const size_t offset_end = start + offset;
-        assert(offset_end <= cpts.size());
+        GGML_ASSERT(offset_end <= cpts.size());
         start = offset_end;
 
         static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
@@ -335,7 +335,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
         size_t _prev_end = offset_ini;
         auto _add_token = [&] (const size_t end) -> size_t {
-            assert(_prev_end <= end && end <= offset_end);
+            GGML_ASSERT(_prev_end <= end && end <= offset_end);
             size_t len = end - _prev_end;
             if (len > 0) {
                 bpe_offsets.push_back(len);
@@ -595,7 +595,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
                 cpt_categs[cpt++] = categ;
             }
         }
-        assert (cpt == MAX_CODEPOINTS);
+        GGML_ASSERT(cpt == MAX_CODEPOINTS);
 
         for (auto cpt : unicode_vec_whitespace) {
             cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
@@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             case codepoint_categ::P:     return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
             case codepoint_categ::S:     return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
             case codepoint_categ::Z:     return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
-            default:  assert (false);    return COLLAPSE_CPT_RANGE_FIRST;
+            default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
         }
     };
 
@@ -703,7 +703,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         // \p{Ll} --> \p{Ll} to \p{Ll}  // has subcategory ? yes
         // \p{Lu} --> \p{Lu} to \p{Lu}  // has subcategory ? yes
         // \p{L}  --> \p{Ll} to \p{Lu}  // has subcategory ? no
-        assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
+        GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
         const uint32_t collapsed = category_to_collapsed_cpt(categ);
         const uint32_t range = (collapsed & 0b111) ? 0 : 0b111;  // has subcategory ?
         return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
@@ -811,17 +811,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                         continue;
                     case '}':
                     case ']':
-                        assert (false);
+                        GGML_ABORT("invalid regex");
                     case '(':
                         if (cpts_regex[i + 1] == '?') {  // (?: (?i: (?= (?! (?<= (?<!
                             if (cpts_regex[i + 2] == ':') {
                                 i += 2;
                             } else if (cpts_regex[i + 2] == 'i') {
                                 i += 3;
-                                assert (cpts_regex[i] == ':');
+                                GGML_ASSERT(cpts_regex[i] == ':');
                             } else {
                                 i += 2 + (cpts_regex[i + 2] == '<');
-                                assert (cpts_regex[i] == '=' || cpts_regex[i] == '!');
+                                GGML_ASSERT(cpts_regex[i] == '=' || cpts_regex[i] == '!');
                             }
                         }
                         continue;
@@ -837,13 +837,13 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
             // parse unicode categories and subcategories
             if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
-                assert (cpts_regex[i + 3] && cpts_regex[i + 4]);
+                GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]);
                 codepoint_categ categ = {};
                 if (cpts_regex[i + 4] == '}') {
                     categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]);
                 } else {
                     categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
-                    assert (cpts_regex[i + 5] == '}');
+                    GGML_ASSERT(cpts_regex[i + 5] == '}');
                 }
                 // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
                 categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
@@ -875,19 +875,19 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     case 't':  ++i;  cpt = '\t';  break;
                     case 'r':  ++i;  cpt = '\r';  break;
                     case 'n':  ++i;  cpt = '\n';  break;
-                    case 'x':  assert (false);    break;  //TODO: hex values
-                    case 'u':  assert (false);    break;  //TODO: unicode values
-                    case 'U':  assert (false);    break;  //TODO: unicode values
+                    case 'x':  GGML_ABORT("TODO");  break;  //TODO: hex values
+                    case 'u':  GGML_ABORT("TODO");  break;  //TODO: unicode values
+                    case 'U':  GGML_ABORT("TODO");  break;  //TODO: unicode values
                     default:  // escaped character
-                        assert (!is_cpt_range);
+                        GGML_ASSERT(!is_cpt_range);
                         cpt = cpts_regex[++i];
-                        assert (cpt < 0x80);
+                        GGML_ASSERT(cpt < 0x80);
                     break;
                 }
             }
 
             // ensure there is not a collission with any "collapsed" codepoints
-            assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
+            GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
 
             // (2) Build a list of codepoint ranges
             if (is_cpt_range) {
@@ -924,7 +924,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             }
             // (1.1) Generate a replacement list of codepoint ranges
             codepoint_categ categ = unicode_cpt_category(range.first);
-            assert (categ == unicode_cpt_category(range.second));
+            GGML_ASSERT(categ == unicode_cpt_category(range.second));
             auto it0 = map_categ_wregex.find(categ.get_category());
             auto it1 = map_categ_wregex.find(categ.get_subcategory());
             for (const auto & it : {it0, it1}) {
@@ -949,25 +949,25 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 wregex_collapsed += (wchar_t) cpts_regex[i];
                 i++;
             }
-            assert (cpts_regex[i] == '\\');
+            GGML_ASSERT(cpts_regex[i] == '\\');
             const uint32_t cpt_next = cpts_regex[i + 1];
             const bool is_negated = cpt_next < 'a';  // is uppercase
             if (cpt_next == 'p' || cpt_next == 'P') {
-                assert (cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
+                GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
                 i += cpts_regex[i + 4] == '}' ? 5 : 6;
-                assert (cpts_regex[i - 1] == '}');
+                GGML_ASSERT(cpts_regex[i - 1] == '}');
             } else {
-                assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' ||  // \s \w \d
-                        cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D');   // \S \W \D
+                GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' ||  // \s \w \d
+                            cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D');   // \S \W \D
                 i += 2;
             }
             // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
             const codepoint_categ categ = offset_categ.second;
             auto it = map_categ_wregex.find(categ.get_subcategory());
-            assert (it != map_categ_wregex.end());
+            GGML_ASSERT(it != map_categ_wregex.end());
             if (it != map_categ_wregex.end()) {
                 if (categ.is_whitespace()) {  // inside square brackets  //NOTE: reusing flag WHITESPACE
-                    assert (is_negated == false);
+                    GGML_ASSERT(is_negated == false);
                     wregex_collapsed += it->second;
                 } else if(it->second.size() == 1 && !is_negated) {
                     wregex_collapsed += it->second;

From fd6d9b9e6a918a1f08f4718cb61fb299b92dd390 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 20:58:15 +0200
Subject: [PATCH 14/29] Update bruteforce test: fix pyright complaints

---
 tests/test-tokenizer-random.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index c17a1cfbd85a7..f3447d482b989 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -124,8 +124,7 @@ def get_vocab(self, detokenize=False) -> list[str]:
                 text = self.detokenize([id], remove_special=False, unparse_special=True)
             else:
                 text = self.lib.llama_token_get_text(self.model, id)
-                text = self.ffi.string(text)
-                text = str(text, encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
+                text = str(cast(bytes, self.ffi.string(text)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
             vocab.append(text)
         return vocab
 
@@ -162,12 +161,13 @@ def __init__(self, dir_tokenizer: str):
         self.eos_token = self.model.eos_token
 
     def get_vocab(self, detokenize=False) -> list[str]:
+        vocab: list[str] = []
         max_token_id = max(self.model.get_vocab().values())
         if detokenize:
             ids = list(range(max_token_id + 1))
             vocab = self.model.batch_decode(ids, skip_special_tokens=False)
         else:
-            vocab = [None] * (max_token_id + 1)
+            vocab = [""] * (max_token_id + 1)
             for text, id in self.model.get_vocab().items():
                 vocab[id] = text
         return vocab
@@ -455,14 +455,6 @@ def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100
 
 def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):
 
-    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
-        for i, (a, b) in enumerate(zip(ids1, ids2)):
-            if a != b:
-                return i
-        if len(ids1) == len(ids2):
-            return -1
-        return min(len(ids1), len(ids2))
-
     def check_detokenizer(text: str, text1: str, text2: str) -> bool:
         if text1 == text2:  # equal to TokenizerGroundtruth?
             return True

From 3b36703c8afd32966ab80dd861b1985a929df79f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 21:10:45 +0200
Subject: [PATCH 15/29] Update bruteforce test: - Faster failing text range
 selection. - Show unique failing-text differences. - Add more recent models.

---
 tests/test-tokenizer-random.py | 82 ++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index f3447d482b989..4a5773fa535a9 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -472,10 +472,11 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
     t_decode1 = 0
     t_decode2 = 0
     t_start = time.perf_counter()
+    total_tests = 0
+    failing_texts = set()
     encode_errors = 0
     decode_errors = 0
-    total_tests = 0
-    MAX_ERRORS = 10
+    MAX_ERRORS = 5
 
     logger.info("%s: %s" % (generator.__qualname__, "ini"))
     for text in generator:
@@ -494,13 +495,11 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
         t_encode2 += t2 - t1
         t_decode1 += t3 - t2
         t_decode2 += t4 - t3
+        total_tests += 1
         # compare
         encode_ok = ids1 == ids2
         decode_ok = check_detokenizer(text, text1, text2)
-        encode_errors += not encode_ok
-        decode_errors += not decode_ok
-        total_tests += 1
-        if (encode_errors < MAX_ERRORS and not encode_ok) or (decode_errors < MAX_ERRORS and not decode_ok):
+        if not (encode_ok and decode_ok):
             def _compare(text: str):
                 ids1  = tokenizer1.encode(text)
                 ids2  = tokenizer2.encode(text)
@@ -510,33 +509,42 @@ def _compare(text: str):
                 decode_ok = check_detokenizer(text, text1, text2)
                 ok = encode_ok and decode_ok
                 return ok, ids1, ids2, text1, text2
+            # binary search upper and lower failing range
             a, b = 0, len(text)
-            for step in [64, 32, 16, 8, 4, 2, 1]:
-                while a < b:
-                    t = max(a, b - step)
-                    if _compare(text[a : t])[0]:
-                        break
-                    b = t
-            for step in [64, 32, 16, 8, 4, 2, 1]:
-                while a < b:
-                    t = min(a + step, b)
-                    if _compare(text[t : b])[0]:
-                        break
-                    a = t
+            step = b
+            while step > 1:
+                step = step // 2
+                if not _compare(text[a : b - step])[0]:
+                    b = b - step
+            step = b
+            while step > 1:
+                step = step // 2
+                if not _compare(text[a + step : b])[0]:
+                    a = a + step
             ok, ids1, ids2, text1, text2 = _compare(text[a : b])
             assert a <= b and not ok
-            logger.error(" Text:" + repr(text[a : b]))
-            logger.error("  " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text[a : b]))
-            logger.error(" Expected: " + str(ids1))
-            logger.error("   Result: " + str(ids2))
-            logger.error(" Expected: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text1))
-            logger.error("   Result: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text2))
-            logger.error(f" {encode_errors=}")
-            logger.error(f" {decode_errors=}")
-        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
-            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
-            # raise Exception()
-            break
+            # show unique failing texts differences
+            failing_text = text[a : b]
+            if failing_text not in failing_texts:
+                failing_texts.add(failing_text)
+                if encode_errors < MAX_ERRORS and not encode_ok:
+                    encode_errors += 1
+                    logger.error(f" {encode_errors=}")
+                    logger.error(" Text:" + repr(failing_text))
+                    logger.error("  " + " ".join(repr(x) + ":" + hex(ord(x)) for x in failing_text))
+                    logger.error(" Expected: " + str(ids1))
+                    logger.error("   Result: " + str(ids2))
+                if decode_errors < MAX_ERRORS and not decode_ok:
+                    decode_errors += 1
+                    logger.error(f" {decode_errors=}")
+                    logger.error(" Text:" + repr(failing_text))
+                    logger.error("  " + " ".join(repr(x) + ":" + hex(ord(x)) for x in failing_text))
+                    logger.error(" Expected: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text1))
+                    logger.error("   Result: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text2))
+                if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
+                    logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
+                    # raise Exception()
+                    break
 
     t_total = time.perf_counter() - t_start
     logger.info(f"{generator.__qualname__}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
@@ -635,21 +643,19 @@ def main(argv: list[str] | None = None):
         "phi-3",          # SPM
         "gemma",          # SPM
         "gemma-2",        # SPM
-        "baichuan",       # SPM
+        # "baichuan",       # SPM
         "bert-bge",       # WPM
         "jina-v2-en",     # WPM
+        # "t5",             # UGM
         "llama-bpe",      # BPE
         "phi-2",          # BPE
         "deepseek-llm",   # BPE
         "deepseek-coder", # BPE
         "falcon",         # BPE
-        "mpt",            # BPE
         "starcoder",      # BPE
         "gpt-2",          # BPE
         "stablelm2",      # BPE
         "refact",         # BPE
-        "qwen2",          # BPE
-        "olmo",           # BPE
         "jina-v2-es",     # BPE
         "jina-v2-de",     # BPE
         "smaug-bpe",      # BPE
@@ -657,6 +663,14 @@ def main(argv: list[str] | None = None):
         "jina-v2-code",   # BPE
         "viking",         # BPE
         "jais",           # BPE
+        "codeshell",      # BPE
+        "tekken",         # BPE
+        "smollm",         # BPE
+        "mpt",            # BPE NFC
+        "command-r",      # BPE NFC
+        "qwen2",          # BPE NFC
+        "olmo",           # BPE NFC
+        "gpt-neox",       # BPE NFC
     ]
 
     logger.info("=" * 50)

From d558c736fd691adea0a1d36b99d4a802240d3679 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 21:24:13 +0200
Subject: [PATCH 16/29] Binary constants are a C++14 feature

---
 src/unicode.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index a5a377b39f567..19f55145e117b 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -703,9 +703,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         // \p{Ll} --> \p{Ll} to \p{Ll}  // has subcategory ? yes
         // \p{Lu} --> \p{Lu} to \p{Lu}  // has subcategory ? yes
         // \p{L}  --> \p{Ll} to \p{Lu}  // has subcategory ? no
-        GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
+        GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0x7) == 0);
         const uint32_t collapsed = category_to_collapsed_cpt(categ);
-        const uint32_t range = (collapsed & 0b111) ? 0 : 0b111;  // has subcategory ?
+        const uint32_t range = (collapsed & 0x7) ? 0 : 0x7;  // has subcategory ?
         return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
     };
 

From 674f0faa74dc483ed3a1bac1f1f3ec8d0eb7e792 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 21:43:32 +0200
Subject: [PATCH 17/29] Fix wrong variable left over from copy/paste

---
 src/unicode.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 19f55145e117b..ae36d2b43c828 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -602,11 +602,11 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
         }
 
         for (auto p : unicode_map_lowercase) {
-            cpt_categs[cpt].set_flag(codepoint_categ::LOWERCASE);
+            cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);
         }
 
         for (auto p : unicode_map_uppercase) {
-            cpt_categs[cpt].set_flag(codepoint_categ::UPPERCASE);
+            cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);
         }
 
         //for (auto &range : unicode_ranges_nfd) {  // start, last, nfd

From 2ca313830e56d293719040d09733bc574f59923e Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 23:55:17 +0200
Subject: [PATCH 18/29] Fix compiler complaints

---
 src/unicode.cpp | 18 ++++++++++--------
 src/unicode.h   |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index ae36d2b43c828..725476600f2ff 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             case codepoint_categ::P:     return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
             case codepoint_categ::S:     return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
             case codepoint_categ::Z:     return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
-            default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
+            default:                     GGML_ABORT("invalid category");
         }
     };
 
@@ -709,6 +709,8 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
     };
 
+    GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t));
+
     const auto cpts = unicode_cpts_from_utf8(text);
 
     std::vector<size_t> bpe_offsets = { cpts.size() };
@@ -756,7 +758,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         wregex_whitespaces += L"\\s";
         for (uint32_t cpt : unicode_vec_whitespace) {
             if (cpt >= 0x80) {  // non-ASCII whitespaces
-                if (wregex_whitespaces.back() + 1 == cpt) {
+                if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) {
                     if (*(wregex_whitespaces.end() - 2) == '-') {
                         wregex_whitespaces.back() = cpt;
                     } else {
@@ -764,7 +766,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                         wregex_whitespaces += cpt;
                     }
                 } else {
-                    wregex_whitespaces += cpt;
+                    wregex_whitespaces += (wchar_t) cpt;
                 }
             }
         }
@@ -847,7 +849,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 }
                 // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
                 categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
-                regex_expr_categs.emplace_back(i, categ);
+                regex_expr_categs.emplace_back((uint32_t)i, categ);
                 i += cpts_regex[i + 4] == '}' ? 4 : 5;
                 continue;
             }
@@ -855,7 +857,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             if (cpt == '\\') {
                 if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') {  // \s \S
                     // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
-                    regex_expr_categs.emplace_back(i, categ_whitespace);
+                    regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace);
                     //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
                     regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
                     i += 1;
@@ -875,9 +877,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     case 't':  ++i;  cpt = '\t';  break;
                     case 'r':  ++i;  cpt = '\r';  break;
                     case 'n':  ++i;  cpt = '\n';  break;
-                    case 'x':  GGML_ABORT("TODO");  break;  //TODO: hex values
-                    case 'u':  GGML_ABORT("TODO");  break;  //TODO: unicode values
-                    case 'U':  GGML_ABORT("TODO");  break;  //TODO: unicode values
+                    case 'x':  GGML_ABORT("TODO");  //TODO: hex values
+                    case 'u':  GGML_ABORT("TODO");  //TODO: unicode values
+                    case 'U':  GGML_ABORT("TODO");  //TODO: unicode values
                     default:  // escaped character
                         GGML_ASSERT(!is_cpt_range);
                         cpt = cpts_regex[++i];
diff --git a/src/unicode.h b/src/unicode.h
index 75cdb3f4a596f..8a3f4078ca79b 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -149,7 +149,7 @@ struct codepoint_categ {
                 return 0;
             }
             const char * p = strchr(subcategs, subcateg);
-            return p ? (p - subcategs + 1) : 0;
+            return (uint16_t) (p ? (p - subcategs + 1) : 0);
         };
         switch(categ) {
             case 'C':  if(subcateg == 'n') return 0;  // undefined

From 80f41234e40d4368960c85d7383c7b6a70cb5eac Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Wed, 7 Aug 2024 23:08:04 +0200
Subject: [PATCH 19/29] Update bruteforce test: fix binary search

---
 tests/test-tokenizer-random.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 4a5773fa535a9..f7c3b140776c8 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -513,14 +513,16 @@ def _compare(text: str):
             a, b = 0, len(text)
             step = b
             while step > 1:
-                step = step // 2
-                if not _compare(text[a : b - step])[0]:
-                    b = b - step
+                step = (step + 1) // 2
+                t = max(a, b - step)
+                if not _compare(text[a : t])[0]:
+                    b = t
             step = b
             while step > 1:
-                step = step // 2
-                if not _compare(text[a + step : b])[0]:
-                    a = a + step
+                step = (step + 1) // 2
+                t = min(a + step, b)
+                if not _compare(text[t : b])[0]:
+                    a = t
             ok, ids1, ids2, text1, text2 = _compare(text[a : b])
             assert a <= b and not ok
             # show unique failing texts differences

From 7afe6df6a2290fea257414938a2d39110b9e7a33 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Wed, 7 Aug 2024 23:14:36 +0200
Subject: [PATCH 20/29] Unicode data whitespaces as ranges

---
 scripts/gen-unicode-data.py | 23 +++++++++++-----------
 src/unicode-data.cpp        | 38 ++++++++++++-------------------------
 src/unicode-data.h          |  2 +-
 src/unicode.cpp             |  6 ++++--
 4 files changed, 28 insertions(+), 41 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index d774fcabe9481..1528a13db4c80 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -85,7 +85,6 @@ def unicode_data_iter():
 
 
 codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS  # Undefined
-table_whitespace = []
 table_lowercase = []
 table_uppercase = []
 table_nfd = []
@@ -111,19 +110,20 @@ def unicode_data_iter():
         table_nfd.append((cpt, norm))
 
 
-# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
-table_whitespace.extend(range(0x0009, 0x000D + 1))
-table_whitespace.extend(range(0x2000, 0x200A + 1))
-table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
-
-
 # sort by codepoint
-table_whitespace.sort()
 table_lowercase.sort()
 table_uppercase.sort()
 table_nfd.sort()
 
 
+# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+whitespace_ranges: list[tuple[int, int]] = []  # start, last
+whitespace_ranges.append((0x0009, 0x000D))
+whitespace_ranges.append((0x2000, 0x200A))
+for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]:
+    whitespace_ranges.append((whitespace, whitespace))
+
+
 # run length encoding, see unicode_cpt_category() in unicode.cpp
 assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)
 codepoint_categs_runs = [codepoint_categs[0]]  # 5 bits categ + 11 bits length
@@ -162,7 +162,6 @@ def out(line=""):
 #include <cstdint>
 #include <vector>
 #include <unordered_map>
-#include <unordered_set>
 """)
 
 out("const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length encoding, 5 bits categ + 11 bits length")
@@ -170,9 +169,9 @@ def out(line=""):
     out("0x%04X," % rle)
 out("};\n")
 
-out("const std::vector<uint32_t> unicode_vec_whitespace = {")
-for codepoint in table_whitespace:
-    out("0x%06X," % codepoint)
+out("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {")
+for (start, last) in whitespace_ranges:
+    out("{0x%06X, 0x%06X}," % (start, last))
 out("};\n")
 
 out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 2591723ce3172..1a2ceb01739b8 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -4526,32 +4526,18 @@ const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length enc
 0x0020,
 };
 
-const std::vector<uint32_t> unicode_vec_whitespace = {
-0x000009,
-0x00000A,
-0x00000B,
-0x00000C,
-0x00000D,
-0x000020,
-0x000085,
-0x0000A0,
-0x001680,
-0x002000,
-0x002001,
-0x002002,
-0x002003,
-0x002004,
-0x002005,
-0x002006,
-0x002007,
-0x002008,
-0x002009,
-0x00200A,
-0x002028,
-0x002029,
-0x00202F,
-0x00205F,
-0x003000,
+const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {
+{0x000009, 0x00000D},
+{0x002000, 0x00200A},
+{0x000020, 0x000020},
+{0x000085, 0x000085},
+{0x0000A0, 0x0000A0},
+{0x001680, 0x001680},
+{0x002028, 0x002028},
+{0x002029, 0x002029},
+{0x00202F, 0x00202F},
+{0x00205F, 0x00205F},
+{0x003000, 0x003000},
 };
 
 const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
diff --git a/src/unicode-data.h b/src/unicode-data.h
index 682f79c373749..447826879eaee 100644
--- a/src/unicode-data.h
+++ b/src/unicode-data.h
@@ -13,7 +13,7 @@ struct range_nfd {
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
 extern const std::vector<uint16_t> unicode_rle_codepoints_categs;
-extern const std::vector<uint32_t> unicode_vec_whitespace;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
 extern const std::vector<range_nfd> unicode_ranges_nfd;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 725476600f2ff..6ebef0ec96e02 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -597,8 +597,10 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
         }
         GGML_ASSERT(cpt == MAX_CODEPOINTS);
 
-        for (auto cpt : unicode_vec_whitespace) {
-            cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+        for (auto p : unicode_ranges_whitespace) {
+            for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
+                cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+            }
         }
 
         for (auto p : unicode_map_lowercase) {

From c2406383749aec4c3617c90054cd1d64134fcf15 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Thu, 8 Aug 2024 01:35:20 +0200
Subject: [PATCH 21/29] Reimplement unicode_regex_split()

---
 src/unicode.cpp | 425 +++++++++++++++++++-----------------------------
 1 file changed, 171 insertions(+), 254 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 6ebef0ec96e02..4a5728ed6fd88 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -644,141 +644,128 @@ uint32_t unicode_tolower(uint32_t cp) {
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
-    // std::regex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
-    // std::regex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
-    // Generate a "collapsed" representation of the regex, where all unicode categories are replaced by codepoints ranges.
-    // Generate a "collapsed" representation of the text, where all codepoints are forced to fall into generated category ranges.
-    //  Text codepoints not found in generated category ranges are replaced by a "collapsed" codepoint.
-    // This implementation generalizes the original implementation adding support to unicode subcategories:
-    //  https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
-
-    // Definitions:
-    // - Unicode cagegory: high unicode categories, \p{C}, \p{L}, \p{M}, \p{N}, \p{P}, \p{S}, \p{Z}.
-    // - Unicode subcagegory: including all unicode categories, \p{Cc}, \p{Cf}, \p{Co}, \p{Cs}, ..., \p{Zs}.
-    // - Collapsed codepoint: unused codepoint representing a unicode subcategory.
-    // - Collapsed range: sequence of "collapsed" codepoint, representing one unicode category.
-    // - Collapsed regex: original regex including "collapsed" codepoints and ranges.
-
-    // (1)     Build the "collapsed" regex:
-    // (1.1)     Generate a replacement list of codepoint ranges:
-    // (1.1.1)     For each unicode category.
-    // (1.1.2)     For each unicode subcategory.
-    // (1.1.3)     Expand \s adding unicode whitespaces.
-    // (1.2)     Each list includes its respective "collaped" codepoint/range.
-    // (1.3)     [Optimization] Only build lists of categories present in the regex.
-    // (1.4)     Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
-    // (2)     Build a list of codepoint ranges.
-    // (2.1)     If a codepoint is not found in this list, then it is "collapsable".
-    // (2.2)     [Optimization] Only build lists of ranges present in the regex.
-    // (3)     For each input text:
-    // (3.1)     Search codepoints in the regex codepoint ranges.
-    // (3.2)     If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy.
-    // (3.3)     If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it.
-
-    //TODO: Refactor optimizations
-    // Steps (1) and (2) only depends on the regex expression text.
-    // Step (3) needs 'regex_expr_ranges' for text "collapsing" and 'wregex_collapsed'.
-    // Optimization: store and reuse 'wregex_collapsed' and 'regex_expr_ranges'.
-
-    // 0xDB80 to 0xDBFF: Private Use High Surrogate (128 range values)
-    static const uint32_t COLLAPSE_CPT_RANGE_FIRST = 0xDB80;
-    static const uint32_t COLLAPSE_CPT_RANGE_LAST  = 0xDBFF;
-
-    // return the collapsed codepoint of an unicode category or subcategory
-    auto category_to_collapsed_cpt = [] (const codepoint_categ categ) {
-        const uint16_t subindex = categ.get_subcategory() >> 7;  // subcategory stored in 3 bits
-        switch(categ.get_category()) {                           // category fits in other 3 bits
-            case codepoint_categ::UNDEF: return COLLAPSE_CPT_RANGE_FIRST + ((0 << 3) | subindex);
-            case codepoint_categ::C:     return COLLAPSE_CPT_RANGE_FIRST + ((1 << 3) | subindex);
-            case codepoint_categ::L:     return COLLAPSE_CPT_RANGE_FIRST + ((2 << 3) | subindex);
-            case codepoint_categ::M:     return COLLAPSE_CPT_RANGE_FIRST + ((3 << 3) | subindex);
-            case codepoint_categ::N:     return COLLAPSE_CPT_RANGE_FIRST + ((4 << 3) | subindex);
-            case codepoint_categ::P:     return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
-            case codepoint_categ::S:     return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
-            case codepoint_categ::Z:     return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
-            default:                     GGML_ABORT("invalid category");
+    // std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
+    // std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
+    // std::wregex allows full wchar_t 32 bit codepoints, not limited to standard max 0x110000.
+    // The main idea is to insert unicode category bits into all regex and text codepoints.
+    //   Max unicode codepoint 0x10FFFF fits in 21 bits.
+    //   Store unicode category and subcategory in 10 bits.
+    //   Set the high bit to zero to keep wchar_t positive (uint32_t codepoints).
+    //   Categorized codepoint:
+    //     1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint
+    //     0b0'XXXXXXX'xxx'ccccccccccccccccccccc
+    // A "categorized codepoint" re-defines the ordering keeping category hierarchy.
+    //   All high category codepoints \p{X} fall into the range:
+    //     0b0'XXXXXXX'000'000000000000000000000
+    //     0b0'XXXXXXX'111'111111111111111111111
+    //   All subcategory codepoints \p{Xx} fall into the range:
+    //     0b0'XXXXXXX'xxx'000000000000000000000
+    //     0b0'XXXXXXX'xxx'111111111111111111111
+    // Processing steps:
+    //   Build lists of "categorized codepoints/ranges" for replacing regex \s \w and \d.
+    //   Replace all regex codepoints/ranges with respective "categorized codepoints/ranges".
+    //   Replace all text codepoints with respective "categorized codepoints".
+    // Caveats:
+    //   Some regex ranges start and end with a different category/subcategory.
+    //   Split such ranges into sub-ranges so each has a single category, maintaining the new hierarchy.
+    //   This forces iterating all ranges and could produce long sub-range sequences.
+
+    //TODO: Regex processing can be cached.
+
+    // insert unicode category and subcategory before codepoint bits
+    // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits zero
+    static const auto categorized_prefix = [] (const codepoint_categ categ) -> wchar_t {
+        static const uint32_t MASK    = codepoint_categ::MASK;  // category mask
+        static const uint32_t SUBMASK = codepoint_categ::SUBMASK & ~codepoint_categ::MASK;  // subcategory mask
+        return (wchar_t) (((categ.encoded & MASK) << (21+3)) | ((categ.encoded & SUBMASK) << (21-7)));
+    };
+
+    // insert unicode category and subcategory before codepoint bits
+    // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint
+    static const auto categorize_codepoint = [] (const uint32_t cpt) -> wchar_t {
+        GGML_ASSERT(cpt < (1 << 21));
+        return categorized_prefix(unicode_cpt_category(cpt)) | (wchar_t)cpt;
+    };
+
+    // remove the categorized prefix bits and restore original codepoint bits
+    static const auto decategorize_codepoint = [] (const wchar_t cpt) -> uint32_t {
+        return (uint32_t) cpt & ((1 << 21) - 1);
+    };
+
+    // returns the respective categorized codepoint range of the category/subcategory
+    static const auto categorize_range_from_chars = [] (const char categ, const char subcateg) {
+        const wchar_t range_ini = categorized_prefix(codepoint_categ::from_chars(categ, subcateg));
+        const wchar_t range_end = (wchar_t) (range_ini | (subcateg ? (1<<21)-1 : (1<<24)-1));
+        return std::pair<wchar_t, wchar_t>(range_ini, range_end);
+    };
+
+    // helper function to append/concat regex expressions
+    auto wregex_append_subregex = [] (std::wstring & wregex, const std::wstring & subregex, const bool add_squares, const bool negated) {
+        if (add_squares) {
+            wregex += '[';
+            if (negated) {
+                wregex += '^';
+            }
+            wregex += subregex;
+            wregex += ']';
+        } else {
+            GGML_ASSERT(!negated);  //TODO: negation inside square brackets: \S \W \D
+            wregex += subregex;
         }
     };
 
-    // return the collapsed range of an unicode category (range including all subcategories)
-    auto category_to_collapsed_range = [&] (const codepoint_categ categ) {
-        // \p{Ll} --> \p{Ll} to \p{Ll}  // has subcategory ? yes
-        // \p{Lu} --> \p{Lu} to \p{Lu}  // has subcategory ? yes
-        // \p{L}  --> \p{Ll} to \p{Lu}  // has subcategory ? no
-        GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0x7) == 0);
-        const uint32_t collapsed = category_to_collapsed_cpt(categ);
-        const uint32_t range = (collapsed & 0x7) ? 0 : 0x7;  // has subcategory ?
-        return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
+    // \d digits replacement
+    static const std::wstring wregex_digits = {
+        categorize_codepoint('0'), '-', categorize_codepoint('9'),
     };
 
-    GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t));
+    // \w words replacement
+    static const std::wstring wregex_words = {
+        categorize_codepoint('_'),
+        categorize_codepoint('0'), '-', categorize_codepoint('9'),
+        categorize_codepoint('A'), '-', categorize_codepoint('Z'),
+        categorize_codepoint('a'), '-', categorize_codepoint('z'),
+    };
 
-    const auto cpts = unicode_cpts_from_utf8(text);
+    // \s whitespaces replacement
+    static const std::wstring wregex_whitespaces = [] {
+        std::wstring wregex_whitespaces;
+        for (const auto & range : unicode_ranges_whitespace) {
+            wregex_whitespaces += categorize_codepoint(range.first);
+            if (range.second > range.first) {
+                wregex_whitespaces += '-';
+                wregex_whitespaces += categorize_codepoint(range.second);
+            }
+        }
+        return wregex_whitespaces;
+    }();
+
+    GGML_ASSERT(sizeof(wchar_t) == sizeof(uint32_t));
+    std::wstring wtext = unicode_wstring_from_utf8(text);
 
-    std::vector<size_t> bpe_offsets = { cpts.size() };
+    std::vector<size_t> offsets = { wtext.size() };
 
     for (auto & regex_expr : regex_exprs) {
         // first, see if we have an efficient custom regex implementation
-        auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
+        auto tmp = unicode_regex_split_custom(text, regex_expr, offsets);
 
         if (!tmp.empty()) {
-            bpe_offsets = std::move(tmp);
+            offsets = std::move(tmp);
             continue;
         }
 
-        std::vector<std::pair<uint32_t, uint32_t>> regex_expr_ranges;        // start codepoint, last codepoint
-        std::vector<std::pair<uint32_t, codepoint_categ>> regex_expr_categs; // offset, codepoint category
-        std::map<uint16_t, std::wstring> map_categ_wregex;                   // categ --> regex utf32 string
-        std::wstring wregex_collapsed;
-        std::wstring wtext_collapsed;
+        std::wstring wregex;
         bool inside_square = false;
         bool is_cpt_range  = false;
 
-        // (2) Build a list of codepoint ranges
-        // common ranges: \w \d
-        regex_expr_ranges.emplace_back('a', 'z');
-        regex_expr_ranges.emplace_back('A', 'Z');
-        regex_expr_ranges.emplace_back('0', '9');
-        regex_expr_ranges.emplace_back('_', '_');
-
-        // (2) Build a list of codepoint ranges
-        // common ranges: \s
-        for (uint32_t cpt : unicode_vec_whitespace) {
-            const auto categ_prev = unicode_cpt_category(regex_expr_ranges.back().second);
-            const auto categ_last = unicode_cpt_category(cpt);
-            if (categ_prev == categ_last && regex_expr_ranges.back().second + 1 == cpt) {
-                regex_expr_ranges.back().second = cpt;
-            } else {
-                regex_expr_ranges.emplace_back(cpt, cpt);
-            }
-        }
-
-        // (1.1.3) Expand \s adding unicode whitespaces.
-        // std::wregex \s does not match non-ASCII whitespaces
-        static const codepoint_categ categ_whitespace(codepoint_categ::MASK + 1);  // UNDEF category, subcategory 1
-        std::wstring & wregex_whitespaces = map_categ_wregex[categ_whitespace.get_subcategory()];
-        wregex_whitespaces += L"\\s";
-        for (uint32_t cpt : unicode_vec_whitespace) {
-            if (cpt >= 0x80) {  // non-ASCII whitespaces
-                if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) {
-                    if (*(wregex_whitespaces.end() - 2) == '-') {
-                        wregex_whitespaces.back() = cpt;
-                    } else {
-                        wregex_whitespaces += '-';
-                        wregex_whitespaces += cpt;
-                    }
-                } else {
-                    wregex_whitespaces += (wchar_t) cpt;
-                }
-            }
-        }
-
         const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
+        wregex.reserve(2 * cpts_regex.size());
 
         for (size_t i = 0; i < cpts_regex.size(); ++i) {
             uint32_t cpt = cpts_regex[i];
 
-            // skip regex metacharacters
+            // parse regex metacharacters
+            wregex += (wchar_t) cpt;
             if (inside_square) {
                 switch(cpt) {
                     case '^':
@@ -811,6 +798,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     case '{':
                         while (cpt && cpt != '}') {
                             cpt = cpts_regex[++i];
+                            wregex += (wchar_t) cpt;
                         }
                         continue;
                     case '}':
@@ -819,12 +807,19 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     case '(':
                         if (cpts_regex[i + 1] == '?') {  // (?: (?i: (?= (?! (?<= (?<!
                             if (cpts_regex[i + 2] == ':') {
-                                i += 2;
+                                wregex += (wchar_t) cpts_regex[++i];
+                                wregex += (wchar_t) cpts_regex[++i];
                             } else if (cpts_regex[i + 2] == 'i') {
-                                i += 3;
+                                wregex += (wchar_t) cpts_regex[++i];
+                                wregex += (wchar_t) cpts_regex[++i];
+                                wregex += (wchar_t) cpts_regex[++i];
                                 GGML_ASSERT(cpts_regex[i] == ':');
                             } else {
-                                i += 2 + (cpts_regex[i + 2] == '<');
+                                wregex += (wchar_t) cpts_regex[++i];
+                                wregex += (wchar_t) cpts_regex[++i];
+                                if (cpts_regex[i] == '<') {
+                                    wregex += (wchar_t) cpts_regex[++i];
+                                }
                                 GGML_ASSERT(cpts_regex[i] == '=' || cpts_regex[i] == '!');
                             }
                         }
@@ -838,44 +833,40 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                         continue;
                 }
             }
+            wregex.pop_back();
 
-            // parse unicode categories and subcategories
+            // parse unicode categories and subcategories, replace category with the categorized range
             if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
                 GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]);
-                codepoint_categ categ = {};
+                std::pair<wchar_t, wchar_t> range;
                 if (cpts_regex[i + 4] == '}') {
-                    categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]);
+                    range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)'\0');
+                    i += 4;
                 } else {
-                    categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
-                    GGML_ASSERT(cpts_regex[i + 5] == '}');
+                    range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
+                    i += 5;
                 }
-                // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
-                categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
-                regex_expr_categs.emplace_back((uint32_t)i, categ);
-                i += cpts_regex[i + 4] == '}' ? 4 : 5;
+                GGML_ASSERT(cpts_regex[i] == '}');
+                const std::wstring subregex = {range.first, '-', range.second};
+                wregex_append_subregex(wregex, subregex, !inside_square, false);
                 continue;
             }
 
-            if (cpt == '\\') {
-                if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') {  // \s \S
-                    // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
-                    regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace);
-                    //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
-                    regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
-                    i += 1;
-                    continue;
-                }
-            }
-
             // parse more metcharacters and espaped characters
             if (cpt == '\\') {
                 switch (cpts_regex[i + 1]) {
-                    case 's':  ++i;  continue;  // \s whitespaces
-                    case 'w':  ++i;  continue;  // \w words
-                    case 'd':  ++i;  continue;  // \d digits
-                    case 'S':  ++i;  continue;  // \S no whitespaces
-                    case 'W':  ++i;  continue;  // \W no words
-                    case 'D':  ++i;  continue;  // \D no digits
+                    case 's':  // \s whitespaces
+                    case 'S':  // \S no whitespaces
+                        wregex_append_subregex(wregex, wregex_whitespaces, !inside_square, cpts_regex[++i] == 'S');
+                        continue;
+                    case 'w':  // \w words
+                    case 'W':  // \W no words
+                        wregex_append_subregex(wregex, wregex_words, !inside_square, cpts_regex[++i] == 'W');
+                        continue;
+                    case 'd':  // \d digits
+                    case 'D':  // \D no digits
+                        wregex_append_subregex(wregex, wregex_digits, !inside_square, cpts_regex[++i] == 'D');
+                        continue;
                     case 't':  ++i;  cpt = '\t';  break;
                     case 'r':  ++i;  cpt = '\r';  break;
                     case 'n':  ++i;  cpt = '\n';  break;
@@ -886,139 +877,65 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                         GGML_ASSERT(!is_cpt_range);
                         cpt = cpts_regex[++i];
                         GGML_ASSERT(cpt < 0x80);
-                    break;
+                        break;
                 }
             }
 
-            // ensure there is not a collission with any "collapsed" codepoints
-            GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
-
-            // (2) Build a list of codepoint ranges
             if (is_cpt_range) {
-                is_cpt_range = false;
-                regex_expr_ranges.back().second = cpt;
-            } else {
-                regex_expr_ranges.emplace_back(cpt, cpt);
-            }
-        }
-
-        // assign collapsed codepoint to each category regex \p{...}
-        for (auto offset_categ : regex_expr_categs) {
-            const uint16_t subcateg = offset_categ.second.get_subcategory();
-            auto it = map_categ_wregex.find(subcateg);
-            if (it == map_categ_wregex.end()) {
-                // (1.2) Each list includes its respective "collaped" codepoint/range.
-                const auto collapsed_range = category_to_collapsed_range(offset_categ.second);
-                map_categ_wregex[subcateg] = (wchar_t) collapsed_range.first;
-                if (collapsed_range.first < collapsed_range.second) {
-                    map_categ_wregex[subcateg] += (wchar_t) '-';
-                    map_categ_wregex[subcateg] += (wchar_t) collapsed_range.second;
-                }
-            }
-        }
-
-        // copy found regex ranges to each category regex
-        uint32_t regex_expr_ranges_uniques = 0;
-        std::pair<uint32_t, uint32_t> prev_range = {0, -1};
-        std::sort(regex_expr_ranges.begin(), regex_expr_ranges.end());
-        for (auto range : regex_expr_ranges) {
-            range.first = std::max(range.first, prev_range.second + 1);  // prevent overlapping  //TODO: as error?
-            if (range.first > range.second) {  // skip overlapping and repetitions
-                continue;
-            }
-            // (1.1) Generate a replacement list of codepoint ranges
-            codepoint_categ categ = unicode_cpt_category(range.first);
-            GGML_ASSERT(categ == unicode_cpt_category(range.second));
-            auto it0 = map_categ_wregex.find(categ.get_category());
-            auto it1 = map_categ_wregex.find(categ.get_subcategory());
-            for (const auto & it : {it0, it1}) {
-                if (it != map_categ_wregex.end()) {
-                    it->second += (wchar_t) range.first;
-                    if (range.first < range.second) {
-                        it->second += (wchar_t) '-';
-                        it->second += (wchar_t) range.second;
+                // Some regex ranges start and end with a different category/subcategory.
+                // Split such ranges into sub-ranges so each has a single category, maintaining the new hierarchy.
+                // Warning: This forces iterating all ranges and could produce long sub-range sequences.
+                GGML_ASSERT(wregex.size() && wregex.back() == '-');
+                wregex.pop_back();
+                wchar_t categorized = wregex.back();
+                uint32_t range_ini = decategorize_codepoint(categorized);
+                const uint32_t range_end = cpt;
+                GGML_ASSERT(range_ini <= range_end);
+                codepoint_categ range_categ = unicode_cpt_category(range_ini);
+                for (cpt = range_ini + 1; cpt <= range_end; ++cpt) {
+                    codepoint_categ categ = unicode_cpt_category(cpt);
+                    if (categ == range_categ) {  // still same range category ?
+                        ++categorized;
+                        if (cpt == range_ini + 1) {  // single step, no need range
+                            wregex += categorized;
+                        } else if (cpt == range_ini + 2) {  // need range if +2 step
+                            wregex.back() = '-';
+                            wregex += categorized;
+                        } else {
+                            wregex.back() = categorized;  // keep range growing
+                        }
+                    } else {  // new range category
+                        categorized = categorize_codepoint(cpt);
+                        wregex += categorized;
+                        range_categ = categ;
+                        range_ini = cpt;
                     }
                 }
-            }
-            prev_range = range;
-            regex_expr_ranges[regex_expr_ranges_uniques++] = range;
-        }
-        regex_expr_ranges.resize(regex_expr_ranges_uniques);
-
-        // replace categories with respective collapsed codepoint and ranges
-        uint32_t i = 0;
-        wregex_collapsed.reserve(regex_expr.size());
-        for (auto offset_categ : regex_expr_categs) {
-            while (i < offset_categ.first) {  // copy original regex until reaching the category
-                wregex_collapsed += (wchar_t) cpts_regex[i];
-                i++;
-            }
-            GGML_ASSERT(cpts_regex[i] == '\\');
-            const uint32_t cpt_next = cpts_regex[i + 1];
-            const bool is_negated = cpt_next < 'a';  // is uppercase
-            if (cpt_next == 'p' || cpt_next == 'P') {
-                GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
-                i += cpts_regex[i + 4] == '}' ? 5 : 6;
-                GGML_ASSERT(cpts_regex[i - 1] == '}');
+                is_cpt_range = false;
             } else {
-                GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' ||  // \s \w \d
-                            cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D');   // \S \W \D
-                i += 2;
+                wregex += categorize_codepoint(cpt);
             }
-            // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
-            const codepoint_categ categ = offset_categ.second;
-            auto it = map_categ_wregex.find(categ.get_subcategory());
-            GGML_ASSERT(it != map_categ_wregex.end());
-            if (it != map_categ_wregex.end()) {
-                if (categ.is_whitespace()) {  // inside square brackets  //NOTE: reusing flag WHITESPACE
-                    GGML_ASSERT(is_negated == false);
-                    wregex_collapsed += it->second;
-                } else if(it->second.size() == 1 && !is_negated) {
-                    wregex_collapsed += it->second;
-                } else {
-                    wregex_collapsed += '[';
-                    if (is_negated) {
-                        wregex_collapsed += '^';
-                    }
-                    wregex_collapsed += it->second;
-                    wregex_collapsed += ']';
-                }
-            }
-        }
-        while (i < (uint32_t)cpts_regex.size()) {
-            wregex_collapsed += cpts_regex[i];
-            i++;
         }
 
-        // collapse text codepoints not included in 'regex_expr_ranges'
-        wtext_collapsed.reserve(cpts.size());
-        for (uint32_t cpt : cpts) {
-            const codepoint_categ categ = unicode_cpt_category(cpt);
-            // (3.1) Search codepoints in the regex codepoint ranges.
-            auto it = std::lower_bound(regex_expr_ranges.begin(), regex_expr_ranges.end(), cpt,
-                [] (const std::pair<uint32_t, uint32_t> range, const uint32_t cpt) {
-                    return range.second < cpt;
-                }
-            );
-            if (it == regex_expr_ranges.end() || cpt < it->first || it->second < cpt) {
-                // (3.3) If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it.
-                cpt = category_to_collapsed_cpt(categ);  // not found, collapse to category codepoint
+        // categorize all wtext codepoints; strip any prefix left by a previous regex pass so this is idempotent
+        if (wtext.size()) {  // NOTE: testing wtext[0] < MAX_CODEPOINTS is unreliable, a zero category prefix (UNDEF) looks uncategorized
+            for (size_t i = 0; i < wtext.size(); ++i) {
+                wtext[i] = categorize_codepoint(decategorize_codepoint(wtext[i]));
+            }
-            // (3.2) If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy.
-            wtext_collapsed += (wchar_t) cpt;
         }
 
-        bpe_offsets = unicode_regex_split_stl(wtext_collapsed, wregex_collapsed, bpe_offsets);
+        offsets = unicode_regex_split_stl(wtext, wregex, offsets);
     }
 
     std::vector<std::string> bpe_words;
-    bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size
+    bpe_words.reserve(offsets.size()); // reserve memory for the approximate size
 
     size_t start = 0;
-    for (size_t & offset : bpe_offsets) {
+    for (size_t & offset : offsets) {
         bpe_words.emplace_back();
         for (size_t i = start; i < start + offset; ++i) {
-            bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
+            const uint32_t cpt = decategorize_codepoint(wtext[i]);
+            bpe_words.back() += unicode_cpt_to_utf8(cpt);
         }
         start += offset;
     }

From 312c4322cc4d607fa4a81724c859672605e03c33 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 16:30:30 +0200
Subject: [PATCH 22/29] Remove invalid assert

---
 src/llama-vocab.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 5eeae05858ebb..6192fd195746f 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -710,7 +710,6 @@ struct llm_tokenizer_wpm {
                 continue;
             }
 
-            assert (!categ.is_S());
             if (cpt == 0 || cpt == 0xFFFD || categ.is_C()) {
                 continue;
             }

From b565148cb43b732327a4c515b6b484d55dc53f9b Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 16:42:33 +0200
Subject: [PATCH 23/29] Update codepoint_categ:

- Reorganize category/subcategory bits.
- Regex flags for \s \w \d.
---
 src/unicode.cpp |  22 +++---
 src/unicode.h   | 176 ++++++++++++++++++++++++------------------------
 2 files changed, 96 insertions(+), 102 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 4a5728ed6fd88..20c1287c43199 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -209,7 +209,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
+        static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
         auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
             return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
         };
@@ -328,7 +328,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
+        static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
         auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
             return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
         };
@@ -589,28 +589,24 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
         for (uint16_t rle : unicode_rle_codepoints_categs) {
             const uint32_t index = rle & 31;
             const uint32_t count = rle >> 5;
-            const auto categ = codepoint_categ::from_index(index);
-            //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
+            auto categ = codepoint_categ::from_index(index);
+            //printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
+            categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd());               // \d --> \p{Nd}
+            categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N());  // \w --> \p{L} \p{N} _
             for (uint32_t i = 0; i <= count; ++i) {
                 cpt_categs[cpt++] = categ;
             }
         }
         GGML_ASSERT(cpt == MAX_CODEPOINTS);
 
+        cpt_categs['_'].set_flag(codepoint_categ::WORDS);  // \w --> \p{L} \p{N} _
+
         for (auto p : unicode_ranges_whitespace) {
             for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
-                cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+                cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES);
             }
         }
 
-        for (auto p : unicode_map_lowercase) {
-            cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);
-        }
-
-        for (auto p : unicode_map_uppercase) {
-            cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);
-        }
-
         //for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
         //    cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD);
         //}
diff --git a/src/unicode.h b/src/unicode.h
index 8a3f4078ca79b..3aeb74771eb7f 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -9,74 +9,71 @@
 #include <map>
 
 struct codepoint_categ {
+    // 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory
     enum _category : uint16_t {
-        UNDEF = 0,   // \p{Cn} Undefined
-        C = 1 << 0,  // \p{C}  Control
-        L = 1 << 1,  // \p{L}  Letter
-        M = 1 << 2,  // \p{M}  Mark
-        N = 1 << 3,  // \p{N}  Number
-        P = 1 << 4,  // \p{P}  Punctuation
-        S = 1 << 5,  // \p{S}  Symbol
-        Z = 1 << 6,  // \p{Z}  Separator
-        MASK = (1 << 7) - 1  // 7 bits
-    };
-
-    enum _subcategory : uint16_t {
-        Cc = C | (1 << 7),  // \p{Cc} Control
-        Cf = C | (2 << 7),  // \p{Cf} Format
-        Co = C | (3 << 7),  // \p{Co} Private Use
-        Cs = C | (4 << 7),  // \p{Cs} Surrrogate
-        Ll = L | (1 << 7),  // \p{Ll} Lowercase Letter
-        Lm = L | (2 << 7),  // \p{Lm} Modifier Letter
-        Lo = L | (3 << 7),  // \p{Lo} Other Letter
-        Lt = L | (4 << 7),  // \p{Lt} Titlecase Letter
-        Lu = L | (5 << 7),  // \p{Lu} Uppercase Letter
-        Mc = M | (1 << 7),  // \p{Mc} Spacing Mark
-        Me = M | (2 << 7),  // \p{Me} Enclosing Mark
-        Mn = M | (3 << 7),  // \p{Mn} Nonspacing Mark
-        Nd = N | (1 << 7),  // \p{Nd} Decimal Number
-        Nl = N | (2 << 7),  // \p{Nl} Letter Number
-        No = N | (3 << 7),  // \p{No} Other Number
-        Pc = P | (1 << 7),  // \p{Pc} Connector Punctuation
-        Pd = P | (2 << 7),  // \p{Pd} Dash Punctuation
-        Pe = P | (3 << 7),  // \p{Pe} Close Punctuation
-        Pf = P | (4 << 7),  // \p{Pf} Final Punctuation
-        Pi = P | (5 << 7),  // \p{Pi} Initial Punctuation
-        Po = P | (6 << 7),  // \p{Po} Other Punctuation
-        Ps = P | (7 << 7),  // \p{Ps} Open Punctuation
-        Sc = S | (1 << 7),  // \p{Sc} Currency Symbol
-        Sk = S | (2 << 7),  // \p{Sk} Modifier Symbol
-        Sm = S | (3 << 7),  // \p{Sm} Math Symbol
-        So = S | (4 << 7),  // \p{So} Other Symbol
-        Zl = Z | (1 << 7),  // \p{Zl} Line Separator
-        Zp = Z | (2 << 7),  // \p{Zp} Paragraph Separator
-        Zs = Z | (3 << 7),  // \p{Zs} Space Separator
-        SUBMASK = (1 << 10) - 1  // 7+3 bits
+        UNDEF = 0,         // \p{Cn} Undefined
+        C = 1 << (0 + 3),  // \p{C}  Control
+        L = 1 << (1 + 3),  // \p{L}  Letter
+        M = 1 << (2 + 3),  // \p{M}  Mark
+        N = 1 << (3 + 3),  // \p{N}  Number
+        P = 1 << (4 + 3),  // \p{P}  Punctuation
+        S = 1 << (5 + 3),  // \p{S}  Symbol
+        Z = 1 << (6 + 3),  // \p{Z}  Separator
+        Cc = C | 1,  // \p{Cc} Control
+        Cf = C | 2,  // \p{Cf} Format
+        Co = C | 3,  // \p{Co} Private Use
+        Cs = C | 4,  // \p{Cs} Surrrogate
+        Ll = L | 1,  // \p{Ll} Lowercase Letter
+        Lm = L | 2,  // \p{Lm} Modifier Letter
+        Lo = L | 3,  // \p{Lo} Other Letter
+        Lt = L | 4,  // \p{Lt} Titlecase Letter
+        Lu = L | 5,  // \p{Lu} Uppercase Letter
+        Mc = M | 1,  // \p{Mc} Spacing Mark
+        Me = M | 2,  // \p{Me} Enclosing Mark
+        Mn = M | 3,  // \p{Mn} Nonspacing Mark
+        Nd = N | 1,  // \p{Nd} Decimal Number
+        Nl = N | 2,  // \p{Nl} Letter Number
+        No = N | 3,  // \p{No} Other Number
+        Pc = P | 1,  // \p{Pc} Connector Punctuation
+        Pd = P | 2,  // \p{Pd} Dash Punctuation
+        Pe = P | 3,  // \p{Pe} Close Punctuation
+        Pf = P | 4,  // \p{Pf} Final Punctuation
+        Pi = P | 5,  // \p{Pi} Initial Punctuation
+        Po = P | 6,  // \p{Po} Other Punctuation
+        Ps = P | 7,  // \p{Ps} Open Punctuation
+        Sc = S | 1,  // \p{Sc} Currency Symbol
+        Sk = S | 2,  // \p{Sk} Modifier Symbol
+        Sm = S | 3,  // \p{Sm} Math Symbol
+        So = S | 4,  // \p{So} Other Symbol
+        Zl = Z | 1,  // \p{Zl} Line Separator
+        Zp = Z | 2,  // \p{Zp} Paragraph Separator
+        Zs = Z | 3,  // \p{Zs} Space Separator
+        SUBMASK = (1 <<  3) - 1,  // 3 bits   0b000000'0000000'111
+        MASK    = (1 << 10) - 1,  // 7+3 bits 0b000000'1111111'111
     };
 
     enum _flags : uint16_t {
-        WHITESPACE = (1 << 10),  // regex: \s
-        LOWERCASE  = (1 << 11),
-        UPPERCASE  = (1 << 12),
+        WHITESPACES = (1 << 10),  // regex: \s
+        WORDS       = (1 << 11),  // regex: \w
+        DIGITS      = (1 << 12),  // regex: \d
         //Norm NFD/NFC  = ...,
     };
 
     inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
 
     inline void set_flag(_flags flags, bool value = true) {
-        flags = (_flags) (flags & ~SUBMASK);  // ignore category bits
+        flags = (_flags) (flags & ~MASK);  // do not modify category bits
         encoded = value ? (encoded | flags) : (encoded & ~flags);
     }
 
     inline uint16_t get_category() const { return encoded & MASK; }
-    inline uint16_t get_subcategory() const { return encoded & SUBMASK; }
 
     inline bool is_undefined() const { return !encoded; }
     inline bool is_defined() const { return encoded; }
 
-    inline uint16_t is_whitespace() const { return encoded & WHITESPACE; }
-    inline uint16_t is_lowercase()  const { return encoded & LOWERCASE; }
-    inline uint16_t is_uppercase()  const { return encoded & UPPERCASE; }
+    inline uint16_t is_whitespace() const { return encoded & WHITESPACES; }
+    inline uint16_t is_word()       const { return encoded & WORDS;  }
+    inline uint16_t is_digit()      const { return encoded & DIGITS; }
 
     inline uint16_t is_C() const { return encoded & C; }
     inline uint16_t is_L() const { return encoded & L; }
@@ -86,35 +83,35 @@ struct codepoint_categ {
     inline uint16_t is_S() const { return encoded & S; }
     inline uint16_t is_Z() const { return encoded & Z; }
 
-    inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; }
-    inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; }
-    inline bool is_Co() const { return (encoded & SUBMASK) == Co; }
-    inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; }
-    inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; }
-    inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; }
-    inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; }
-    inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; }
-    inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; }
-    inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; }
-    inline bool is_Me() const { return (encoded & SUBMASK) == Me; }
-    inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; }
-    inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; }
-    inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; }
-    inline bool is_No() const { return (encoded & SUBMASK) == No; }
-    inline bool is_Pc() const { return (encoded & SUBMASK) == Pc; }
-    inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; }
-    inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; }
-    inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; }
-    inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; }
-    inline bool is_Po() const { return (encoded & SUBMASK) == Po; }
-    inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; }
-    inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; }
-    inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; }
-    inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; }
-    inline bool is_So() const { return (encoded & SUBMASK) == So; }
-    inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; }
-    inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; }
-    inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; }
+    inline bool is_Cc() const { return (encoded & MASK) == Cc; }
+    inline bool is_Cf() const { return (encoded & MASK) == Cf; }
+    inline bool is_Co() const { return (encoded & MASK) == Co; }
+    inline bool is_Cs() const { return (encoded & MASK) == Cs; }
+    inline bool is_Ll() const { return (encoded & MASK) == Ll; }
+    inline bool is_Lm() const { return (encoded & MASK) == Lm; }
+    inline bool is_Lo() const { return (encoded & MASK) == Lo; }
+    inline bool is_Lt() const { return (encoded & MASK) == Lt; }
+    inline bool is_Lu() const { return (encoded & MASK) == Lu; }
+    inline bool is_Mc() const { return (encoded & MASK) == Mc; }
+    inline bool is_Me() const { return (encoded & MASK) == Me; }
+    inline bool is_Mn() const { return (encoded & MASK) == Mn; }
+    inline bool is_Nd() const { return (encoded & MASK) == Nd; }
+    inline bool is_Nl() const { return (encoded & MASK) == Nl; }
+    inline bool is_No() const { return (encoded & MASK) == No; }
+    inline bool is_Pc() const { return (encoded & MASK) == Pc; }
+    inline bool is_Pd() const { return (encoded & MASK) == Pd; }
+    inline bool is_Pe() const { return (encoded & MASK) == Pe; }
+    inline bool is_Pf() const { return (encoded & MASK) == Pf; }
+    inline bool is_Pi() const { return (encoded & MASK) == Pi; }
+    inline bool is_Po() const { return (encoded & MASK) == Po; }
+    inline bool is_Ps() const { return (encoded & MASK) == Ps; }
+    inline bool is_Sc() const { return (encoded & MASK) == Sc; }
+    inline bool is_Sk() const { return (encoded & MASK) == Sk; }
+    inline bool is_Sm() const { return (encoded & MASK) == Sm; }
+    inline bool is_So() const { return (encoded & MASK) == So; }
+    inline bool is_Zl() const { return (encoded & MASK) == Zl; }
+    inline bool is_Zp() const { return (encoded & MASK) == Zp; }
+    inline bool is_Zs() const { return (encoded & MASK) == Zs; }
 
     inline bool operator == (const codepoint_categ other) const {
         return encoded == other.encoded;
@@ -132,7 +129,7 @@ struct codepoint_categ {
             {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"},
             {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"},
         };
-        const auto it = map.find(encoded & SUBMASK);
+        const auto it = map.find(encoded & MASK);
         return it == map.end() ? "INVALID" : it->second;
     }
 
@@ -149,18 +146,19 @@ struct codepoint_categ {
                 return 0;
             }
             const char * p = strchr(subcategs, subcateg);
-            return (uint16_t) (p ? (p - subcategs + 1) : 0);
+            GGML_ASSERT(p);
+            return (uint16_t) (p - subcategs + 1);
         };
         switch(categ) {
             case 'C':  if(subcateg == 'n') return 0;  // undefined
-                       return C | (_subindex(subcateg, "cfos"   ) << 7);
-            case 'L':  return L | (_subindex(subcateg, "lmotu"  ) << 7);
-            case 'M':  return M | (_subindex(subcateg, "cen"    ) << 7);
-            case 'N':  return N | (_subindex(subcateg, "dlo"    ) << 7);
-            case 'P':  return P | (_subindex(subcateg, "cdefios") << 7);
-            case 'S':  return S | (_subindex(subcateg, "ckmo"   ) << 7);
-            case 'Z':  return Z | (_subindex(subcateg, "lps"    ) << 7);
-            default:   assert (false);  return 0;
+                       return C | _subindex(subcateg, "cfos"   );
+            case 'L':  return L | _subindex(subcateg, "lmotu"  );
+            case 'M':  return M | _subindex(subcateg, "cen"    );
+            case 'N':  return N | _subindex(subcateg, "dlo"    );
+            case 'P':  return P | _subindex(subcateg, "cdefios");
+            case 'S':  return S | _subindex(subcateg, "ckmo"   );
+            case 'Z':  return Z | _subindex(subcateg, "lps"    );
+            default:   GGML_ABORT("invalid category character");
         }
     }
 

From 5a93d2ec504c649ccd7cfb6ff8c23a5bd3105894 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 17:38:46 +0200
Subject: [PATCH 24/29] Reimplement unicode_regex_split():

- Using std::basic_regex.
- Custom std::ctype specialization for 32-bit codepoints.
- Custom std::regex_traits specialization for 32-bit codepoints.
- Implementing custom 'character class expression' for \p{Xx}.
- Single pass regex preparation.
---
 src/unicode.cpp | 577 +++++++++++++++++++++---------------------------
 src/unicode.h   |  17 ++
 2 files changed, 269 insertions(+), 325 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 20c1287c43199..988dd35e4d3c2 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -451,76 +451,271 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     return bpe_offsets;
 }
 
-// use std::wregex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
-    std::vector<size_t> bpe_offsets; // store the offset of each word
-    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
-    size_t start = 0;
-    for (auto offset : offsets) {
-        std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
-        std::wcregex_iterator end;
+static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
 
-        int64_t start_idx = 0;
-        while (it != end) {
-            std::wcmatch match = *it;
-            if (match.position() > start_idx) {
-                bpe_offsets.emplace_back(match.position() - start_idx);
+    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    } else if (
+            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
+            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+
+        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+    }
+
+    return bpe_offsets;
+}
+
+// Custom std::regex specializations for 32bit unicode codepoints
+//   std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
+//   std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
+//   std::wregex supports full 32 bit codepoints, not limited to standard max 0x110000.
+namespace std {
+    using codepoint = uint32_t;  // codepoint type for all template specializations
+
+    // Minimal required implementation for std::regex string processing
+    template<>  // custom specialized std::ctype<codepoint>
+    class ctype<codepoint> {
+        public:
+
+        using CharT = codepoint;
+        using char_type = CharT;
+
+        using mask = uint8_t;          //NOTE: see std::ctype_base
+        static const mask digit  = 1;  // requiered variable names
+        static const mask xdigit = 2;  // user defined values
+        static const mask alpha  = 3;  // used to be a bitmask
+        static const mask upper  = 4;  // we do not need a bitmask
+        static const mask lower  = 5;  // using a sequence instead
+
+        static locale::id id;  // required by std::locale::facet
+
+        bool is(mask m, char_type c) const {
+            switch (m) {
+                case digit:  return ('0' <= c && c <= '9');
+                case xdigit: return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F');
+                case alpha:  return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
+                case upper:  return ('A' <= c && c <= 'Z');
+                case lower:  return ('a' <= c && c <= 'z');
+                default:     return false;
             }
-            bpe_offsets.emplace_back(match.length());
-            start_idx = match.position() + match.length();
-            ++it;
         }
 
-        if (start_idx < (int64_t) offset) {
-            bpe_offsets.emplace_back(offset - start_idx);
+        char_type toupper(char_type c) const {
+            return ('a' <= c && c <= 'z') ? c - ('a' - 'A') : c;
         }
-        start += offset;
+
+        char_type tolower(char_type c) const {
+            return ('A' <= c && c <= 'Z') ? c + ('a' - 'A') : c;
+        }
+
+        char_type widen(char c) const {  // char to codepoint
+            return (char_type) c;
+        }
+
+        char narrow(char_type c, char dfault) const {  // codepoint to char
+            return (c < 0x80 ? (char)c : dfault);
+        }
+    };
+
+    locale::id ctype<codepoint>::id = {};
+
+    template<>  // specialization to use our custom specialized std::ctype<codepoint>
+    const std::ctype<codepoint> & use_facet<std::ctype<codepoint>>(const std::locale &) {
+        static std::ctype<codepoint> ctype_uint32 = {};
+        return ctype_uint32;
     }
 
-    return bpe_offsets;
+    template<>  // specialization to use our custom specialized std::ctype<codepoint>
+    const std::ctype<codepoint> & use_facet<const std::ctype<codepoint>>(const std::locale & loc) {
+        return use_facet<std::ctype<codepoint>>(loc);
+    }
+
+    // Minimal required implementation for std::regex string processing
+    template<>  // custom specialized std::regex_traits<codepoint>
+    class regex_traits<codepoint> {
+    public:
+
+        using CharT       = codepoint;
+        using char_type   = codepoint;
+        using size_type   = size_t;
+        using string_type = std::basic_string<CharT>;
+        using locale_type = std::locale;
+        using char_class_type = uint64_t;
+
+        #if (defined(_WIN32) || defined(_WIN64))  // MSVC class _Regex_traits
+            using _Uelem = CharT;
+            static const auto _Ch_upper = std::ctype<CharT>::upper;
+            static const auto _Ch_alpha = std::ctype<CharT>::alpha;
+        #endif
+
+        static size_type length(const CharT * str) {
+            return std::char_traits<CharT>::length(str);
+        }
+
+        CharT translate(CharT c) const {
+            return c;
+        }
+
+        CharT translate_nocase(CharT c) const {
+            return unicode_tolower(c);
+        }
+
+        template<typename It>
+        string_type transform(It first, It last) const {
+            GGML_ASSERT(false);   //TODO: not needed ?
+            return {first, last}; //TODO: not tested
+        }
+
+        template<typename It>
+        string_type transform_primary(It first, It last) const {
+            (void) first;
+            (void) last;
+            GGML_ASSERT(*first < MAX_CODEPOINTS);  // valid codepoint
+            return {};
+        }
+
+        template<typename It>
+        string_type lookup_collatename(It first, It last) const {
+            (void) last;
+            GGML_ASSERT(*first & (1 << 31));
+            return {*first};
+        }
+
+        template<typename It>
+        char_class_type lookup_classname(It first, It last, bool icase = false) const {
+            (void) last;
+            (void) icase;
+            const uint32_t encoded = *first;
+            codepoint_categ categ = {};
+            switch(encoded) {
+                case 's':
+                case 'S':  // negation is internally tracked
+                    categ.set_flag(codepoint_categ::WHITESPACES);
+                    return categ.expand_bits();
+                case 'w':
+                case 'W':  // negation is internally tracked
+                    categ.set_flag(codepoint_categ::WORDS);
+                    return categ.expand_bits();
+                case 'd':
+                case 'D':  // negation is internally tracked
+                    categ.set_flag(codepoint_categ::DIGITS);
+                    return categ.expand_bits();
+                default: {  // unicode category \p{Xx} encoded in codepoint
+                    GGML_ASSERT(encoded & (1 << 31));  // make sure its our custom codepoint encoding the category
+                    const bool negated = encoded & (1 << 30);  // negation of 'character class expression' are not internally tracked
+                    categ = {(uint16_t) encoded};
+                    return ((uint64_t) negated << 63) | categ.expand_bits(false);
+                }
+            }
+        }
+
+        bool isctype(CharT c, char_class_type mask) const {
+            const bool negated = mask & (1llu << 63);
+            mask &= unicode_cpt_category(c).expand_bits();
+            return negated ^ (bool) mask;
+        }
+
+        int value(CharT c, int radix) const {  // char to int value
+            switch (radix) {
+                case 8:  return ('0' <= c && c <= '7') ? (int)c - '0' : -1;
+                case 10: return ('0' <= c && c <= '9') ? (int)c - '0' : -1;
+                case 16: return ('0' <= c && c <= '9') ? (int)c - '0' : (('A' <= c && c <= 'F') ? (int)c - 'A' + 10 : -1);
+                default: return -1;
+            }
+        }
+
+        const locale_type & imbue(const locale_type &) {  // set locale  //NOTE: ignoring locales
+            return std::locale::classic();
+        }
+
+        const locale_type & getloc() const {  // get locale  //NOTE: ignoring locales
+            return std::locale::classic();
+        }
+    };
+}
+
+static std::vector<uint32_t> unicode_regex_prepare(const std::string & regex) {
+    std::vector<uint32_t> regex_cpts;
+    regex_cpts.reserve(regex.size() * 12 / 10);  // estimate +20%
+
+    size_t offset = 0;
+    int inside_square = 0;
+    bool any_positive = false;
+    bool any_negative = false;
+
+    const size_t size = regex.size();
+    while (offset < size) {
+        inside_square += regex[offset] == '[';
+        inside_square -= regex[offset] == ']';
+        GGML_ASSERT(inside_square >= 0);
+        if (!inside_square) {
+            any_positive = false;
+            any_negative = false;
+        }
+
+        if (regex[offset] == '\\') {
+            const size_t i = offset + 1;
+            if (regex[i] == 'p' || regex[i] == 'P') {
+                // convert \p{Xx} to custom 'character class expression' [:Xy:]
+                if (regex[i + 1] == '{' && regex[i + 2] && regex[i + 3]) {
+                    codepoint_categ categ = {};
+                    if (regex[i + 3] == '}') {
+                        categ = codepoint_categ::from_chars(regex[i + 2]);
+                        offset += 5;
+                    } else if (regex[i + 3] != '}' && regex[i + 4] == '}') {
+                        categ = codepoint_categ::from_chars(regex[i + 2], regex[i + 3]);
+                        offset += 6;
+                    }
+                    bool negated = regex[i] == 'P';
+                    any_positive |= !negated;
+                    any_negative |= negated;
+                    GGML_ASSERT(any_positive != any_negative);  //BUG: can not mix 'p' and 'P' inside []
+                    GGML_ASSERT(sizeof(categ) <= 2);
+                    // encoded category in 32 bits codepoint
+                    uint32_t cpt_categ = (1 << 31) | (negated << 30) | categ.encoded;
+                    if (inside_square) {
+                        regex_cpts.insert(regex_cpts.end(), {'[', ':', cpt_categ, ':', ']'});
+                    } else {
+                        regex_cpts.insert(regex_cpts.end(), {'[', '[', ':', cpt_categ, ':', ']', ']'});
+                    }
+                    continue;
+                }
+            }
+        }
+
+        regex_cpts.push_back(unicode_cpt_from_utf8(regex, offset));
+    }
+
+    return regex_cpts;
 }
 
-// use std::regex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+// use std::basic_regex<uint32_t> to split the text codepoints
+static std::vector<size_t> unicode_regex_split_stl(const std::vector<uint32_t> & text_cpts, const std::vector<uint32_t> & regex_cpts, const std::vector<size_t> & offsets) {
+    using regex_type = std::basic_regex<uint32_t>;
+    using iter_type = std::regex_iterator<const uint32_t *>;
+    regex_type regex(regex_cpts.begin(), regex_cpts.end());
+    const iter_type end;
+
     std::vector<size_t> bpe_offsets; // store the offset of each word
-    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
-    size_t start = 0;
+    bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size
+    const uint32_t * text_data = text_cpts.data();
     for (auto offset : offsets) {
-        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
-        std::cregex_iterator end;
-
+        iter_type it(text_data, text_data + offset, regex);
         int64_t start_idx = 0;
         while (it != end) {
-            std::cmatch match = *it;
-            if (match.position() > start_idx) {
-                bpe_offsets.emplace_back(match.position() - start_idx);
+            if (it->position() > start_idx) {
+                bpe_offsets.emplace_back(it->position() - start_idx);
             }
-            bpe_offsets.emplace_back(match.length());
-            start_idx = match.position() + match.length();
+            bpe_offsets.emplace_back(it->length());
+            start_idx = it->position() + it->length();
             ++it;
         }
 
         if (start_idx < (int64_t) offset) {
             bpe_offsets.emplace_back(offset - start_idx);
         }
-        start += offset;
-    }
-
-    return bpe_offsets;
-}
-
-static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::vector<size_t> bpe_offsets;
-
-    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
-        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    } else if (
-            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
-            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
-
-        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+        text_data += offset;
     }
 
     return bpe_offsets;
@@ -639,288 +834,21 @@ uint32_t unicode_tolower(uint32_t cp) {
     return it == unicode_map_lowercase.end() ? cp : it->second;
 }
 
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
-    // std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
-    // std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
-    // std::wregex allows full wchar_t 32 bit codepoints, not limited to standard max 0x110000.
-    // The main idea is to insert unicode category bits into all regex and text codepoints.
-    //   Max unicode codepoint 0x110000 fits in 21 bits.
-    //   Store unicode category and subcategory in 10 bits.
-    //   Set the high bit to zero to keep wchar_t positive (uint32_t codepoints).
-    //   Categorized codepoint:
-    //     1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint
-    //     0b0'XXXXXXX'xxx'ccccccccccccccccccccc
-    // A "categorized codepoint" re-defines the ordering keeping category hierarchy.
-    //   All high category codepoints \p{X} fall into the range:
-    //     0b0'XXXXXXX'000'000000000000000000000
-    //     0b0'XXXXXXX'111'111111111111111111111
-    //   All subcategory codepoints \p{Xx} fall into the range:
-    //     0b0'XXXXXXX'xxx'000000000000000000000
-    //     0b0'XXXXXXX'xxx'111111111111111111111
-    // Processing steps:
-    //   Build a lists of "categorized codepoints/ranges" for replacing regex \s \w and \d.
-    //   Replace all regex codepoints/ranges with respective "categorized codepoints/ranges".
-    //   Replace all text codepoints with respective "categorized codepoints".
-    // Caveats:
-    //   Some regex ranges starts and ends with different category/subcategory.
-    //   Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy.
-    //   This forces iterating all ranges and could produce long sub-range sequences.
-
-    //TODO: Regex processing can be cached.
-
-    // insert unicode category and subcategory before codepoint bits
-    // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits zero
-    static const auto categorized_prefix = [] (const codepoint_categ categ) -> wchar_t {
-        static const uint32_t MASK    = codepoint_categ::MASK;  // category mask
-        static const uint32_t SUBMASK = codepoint_categ::SUBMASK & ~codepoint_categ::MASK;  // subcategory mask
-        return (wchar_t) (((categ.encoded & MASK) << (21+3)) | ((categ.encoded & SUBMASK) << (21-7)));
-    };
-
-    // insert unicode category and subcategory before codepoint bits
-    // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint
-    static const auto categorize_codepoint = [] (const uint32_t cpt) -> wchar_t {
-        GGML_ASSERT(cpt < (1 << 21));
-        return categorized_prefix(unicode_cpt_category(cpt)) | (wchar_t)cpt;
-    };
-
-    // remove the categorized prefix bits and restore original codepoint bits
-    static const auto decategorize_codepoint = [] (const wchar_t cpt) -> uint32_t {
-        return (uint32_t) cpt & ((1 << 21) - 1);
-    };
-
-    // returns the respective categorized codepoint range of the category/subcategory
-    static const auto categorize_range_from_chars = [] (const char categ, const char subcateg) {
-        const wchar_t range_ini = categorized_prefix(codepoint_categ::from_chars(categ, subcateg));
-        const wchar_t range_end = (wchar_t) (range_ini | (subcateg ? (1<<21)-1 : (1<<24)-1));
-        return std::pair<wchar_t, wchar_t>(range_ini, range_end);
-    };
-
-    // helper function to append/concat regex expressions
-    auto wregex_append_subregex = [] (std::wstring & wregex, const std::wstring & subregex, const bool add_squares, const bool negated) {
-        if (add_squares) {
-            wregex += '[';
-            if (negated) {
-                wregex += '^';
-            }
-            wregex += subregex;
-            wregex += ']';
-        } else {
-            GGML_ASSERT(!negated);  //TODO: negation inside square brackets: \S \W \D
-            wregex += subregex;
-        }
-    };
-
-    // \d digits replacement
-    static const std::wstring wregex_digits = {
-        categorize_codepoint('0'), '-', categorize_codepoint('9'),
-    };
-
-    // \w words replacement
-    static const std::wstring wregex_words = {
-        categorize_codepoint('_'),
-        categorize_codepoint('0'), '-', categorize_codepoint('9'),
-        categorize_codepoint('A'), '-', categorize_codepoint('Z'),
-        categorize_codepoint('a'), '-', categorize_codepoint('z'),
-    };
-
-    // \s whitespaces replacement
-    static const std::wstring wregex_whitespaces = [] {
-        std::wstring wregex_whitespaces;
-        for (const auto & range : unicode_ranges_whitespace) {
-            wregex_whitespaces += categorize_codepoint(range.first);
-            if (range.second > range.first) {
-                wregex_whitespaces += '-';
-                wregex_whitespaces += categorize_codepoint(range.second);
-            }
-        }
-        return wregex_whitespaces;
-    }();
-
-    GGML_ASSERT(sizeof(wchar_t) == sizeof(uint32_t));
-    std::wstring wtext = unicode_wstring_from_utf8(text);
-
-    std::vector<size_t> offsets = { wtext.size() };
+std::vector<std::string> unicode_regex_split(const std::string & text_utf8, const std::vector<std::string> & regex_exprs) {
+    const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text_utf8);
+    std::vector<size_t> offsets = { cpts.size() };
 
     for (auto & regex_expr : regex_exprs) {
         // first, see if we have an efficient custom regex implementation
-        auto tmp = unicode_regex_split_custom(text, regex_expr, offsets);
+        auto tmp = unicode_regex_split_custom(text_utf8, regex_expr, offsets);
 
         if (!tmp.empty()) {
             offsets = std::move(tmp);
             continue;
         }
 
-        std::wstring wregex;
-        bool inside_square = false;
-        bool is_cpt_range  = false;
-
-        const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
-        wregex.reserve(2 * cpts_regex.size());
-
-        for (size_t i = 0; i < cpts_regex.size(); ++i) {
-            uint32_t cpt = cpts_regex[i];
-
-            // parse regex metacharacters
-            wregex += (wchar_t) cpt;
-            if (inside_square) {
-                switch(cpt) {
-                    case '^':
-                        if (cpts_regex[i - 1] != '[') {
-                            break;
-                        }
-                        continue;
-                    case ']':
-                        inside_square = false;
-                        continue;
-                    case '-':
-                        is_cpt_range = true;
-                        continue;
-                }
-            } else {
-                switch(cpt) {
-                    case '^':
-                        if (i > 0) {
-                            break;
-                        }
-                        continue;
-                    case '$':
-                        if (i + 1 < cpts_regex.size()) {
-                            break;
-                        }
-                        continue;
-                    case '[':
-                        inside_square = true;
-                        continue;
-                    case '{':
-                        while (cpt && cpt != '}') {
-                            cpt = cpts_regex[++i];
-                            wregex += (wchar_t) cpt;
-                        }
-                        continue;
-                    case '}':
-                    case ']':
-                        GGML_ABORT("invalid regex");
-                    case '(':
-                        if (cpts_regex[i + 1] == '?') {  // (?: (?i: (?= (?! (?<= (?<!
-                            if (cpts_regex[i + 2] == ':') {
-                                wregex += (wchar_t) cpts_regex[++i];
-                                wregex += (wchar_t) cpts_regex[++i];
-                            } else if (cpts_regex[i + 2] == 'i') {
-                                wregex += (wchar_t) cpts_regex[++i];
-                                wregex += (wchar_t) cpts_regex[++i];
-                                wregex += (wchar_t) cpts_regex[++i];
-                                GGML_ASSERT(cpts_regex[i] == ':');
-                            } else {
-                                wregex += (wchar_t) cpts_regex[++i];
-                                wregex += (wchar_t) cpts_regex[++i];
-                                if (cpts_regex[i] == '<') {
-                                    wregex += (wchar_t) cpts_regex[++i];
-                                }
-                                GGML_ASSERT(cpts_regex[i] == '=' || cpts_regex[i] == '!');
-                            }
-                        }
-                        continue;
-                    case ')':
-                    case '|':
-                    case '.':
-                    case '?':
-                    case '+':
-                    case '*':
-                        continue;
-                }
-            }
-            wregex.pop_back();
-
-            // parse unicode categories and subcategories, replace category with the categorized range
-            if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
-                GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]);
-                std::pair<wchar_t, wchar_t> range;
-                if (cpts_regex[i + 4] == '}') {
-                    range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)'\0');
-                    i += 4;
-                } else {
-                    range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
-                    i += 5;
-                }
-                GGML_ASSERT(cpts_regex[i] == '}');
-                const std::wstring subregex = {range.first, '-', range.second};
-                wregex_append_subregex(wregex, subregex, !inside_square, false);
-                continue;
-            }
-
-            // parse more metcharacters and espaped characters
-            if (cpt == '\\') {
-                switch (cpts_regex[i + 1]) {
-                    case 's':  // \s whitespaces
-                    case 'S':  // \S no whitespaces
-                        wregex_append_subregex(wregex, wregex_whitespaces, !inside_square, cpts_regex[++i] == 'S');
-                        continue;
-                    case 'w':  // \w words
-                    case 'W':  // \W no words
-                        wregex_append_subregex(wregex, wregex_words, !inside_square, cpts_regex[++i] == 'W');
-                        continue;
-                    case 'd':  // \d digits
-                    case 'D':  // \D no digits
-                        wregex_append_subregex(wregex, wregex_digits, !inside_square, cpts_regex[++i] == 'D');
-                        continue;
-                    case 't':  ++i;  cpt = '\t';  break;
-                    case 'r':  ++i;  cpt = '\r';  break;
-                    case 'n':  ++i;  cpt = '\n';  break;
-                    case 'x':  GGML_ABORT("TODO");  //TODO: hex values
-                    case 'u':  GGML_ABORT("TODO");  //TODO: unicode values
-                    case 'U':  GGML_ABORT("TODO");  //TODO: unicode values
-                    default:  // escaped character
-                        GGML_ASSERT(!is_cpt_range);
-                        cpt = cpts_regex[++i];
-                        GGML_ASSERT(cpt < 0x80);
-                        break;
-                }
-            }
-
-            if (is_cpt_range) {
-                // Some regex ranges starts and ends with different category/subcategory.
-                // Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy.
-                // Warning: This forces iterating all ranges and could produce long sub-range sequences.
-                GGML_ASSERT(wregex.size() && wregex.back() == '-');
-                wregex.pop_back();
-                wchar_t categorized = wregex.back();
-                uint32_t range_ini = decategorize_codepoint(categorized);
-                const uint32_t range_end = cpt;
-                GGML_ASSERT(range_ini <= range_end);
-                codepoint_categ range_categ = unicode_cpt_category(range_ini);
-                for (cpt = range_ini + 1; cpt <= range_end; ++cpt) {
-                    codepoint_categ categ = unicode_cpt_category(cpt);
-                    if (categ == range_categ) {  // still same range category ?
-                        ++categorized;
-                        if (cpt == range_ini + 1) {  // single step, no need range
-                            wregex += categorized;
-                        } else if (cpt == range_ini + 2) {  // need range if +2 step
-                            wregex.back() = '-';
-                            wregex += categorized;
-                        } else {
-                            wregex.back() = categorized;  // keep range growing
-                        }
-                    } else {  // new range category
-                        categorized = categorize_codepoint(cpt);
-                        wregex += categorized;
-                        range_categ = categ;
-                        range_ini = cpt;
-                    }
-                }
-                is_cpt_range = false;
-            } else {
-                wregex += categorize_codepoint(cpt);
-            }
-        }
-
-        // categorize all wtext codepoints
-        if (wtext.size() && wtext[0] < MAX_CODEPOINTS) {  // if not already categorized
-            for (size_t i = 0; i < wtext.size(); ++i) {
-                wtext[i] = categorize_codepoint((uint32_t) wtext[i]);
-            }
-        }
-
-        offsets = unicode_regex_split_stl(wtext, wregex, offsets);
+        const auto regex_cpts = unicode_regex_prepare(regex_expr);
+        offsets = unicode_regex_split_stl(cpts, regex_cpts, offsets);
     }
 
     std::vector<std::string> bpe_words;
@@ -930,8 +858,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     for (size_t & offset : offsets) {
         bpe_words.emplace_back();
         for (size_t i = start; i < start + offset; ++i) {
-            const uint32_t cpt = decategorize_codepoint(wtext[i]);
-            bpe_words.back() += unicode_cpt_to_utf8(cpt);
+            bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
         }
         start += offset;
     }
diff --git a/src/unicode.h b/src/unicode.h
index 3aeb74771eb7f..f2c3e71479975 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -113,6 +113,23 @@ struct codepoint_categ {
     inline bool is_Zp() const { return (encoded & MASK) == Zp; }
     inline bool is_Zs() const { return (encoded & MASK) == Zs; }
 
+    inline uint64_t expand_bits(const bool add_categ=true) const {  // one bit for each category/subcategory and flags
+        const uint32_t subindex = encoded & SUBMASK;
+        const uint64_t bits = (encoded & MASK) >> 3;
+        const uint64_t flags = encoded >> 10;
+        return (flags << (7 * 8)) | (bits << (7 * subindex)) | (bits * add_categ);
+    }
+
+    inline bool is_in_range(const codepoint_categ other) const {  // this.first <= other <= this.last
+        if (encoded & SUBMASK) {
+            return encoded == other.encoded;  // no range
+        }
+        if (encoded & MASK) {
+            return encoded == (other.encoded & ~SUBMASK);  // from 0bffffff'ccccccc'000 to 0bffffff'ccccccc'111
+        }
+        return encoded == (other.encoded & ~MASK);  // from 0bffffff'0000000'000 to 0bffffff'1111111'111
+    }
+
     inline bool operator == (const codepoint_categ other) const {
         return encoded == other.encoded;
     }

From 7ff916eae812b1a116fa50a71d61620f77087971 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 17:39:41 +0200
Subject: [PATCH 25/29] Original regex for 'tekken'

---
 src/llama-vocab.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 6192fd195746f..e4a1cbb29296d 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -440,10 +440,8 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
-                // original regex from tokenizer.json
-                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                 regex_exprs = {
-                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
             default:

From 50e1b1e36d2981d98e335a2065ebf945ee414998 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 19:55:12 +0200
Subject: [PATCH 26/29] Remove unused function

---
 src/unicode.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 988dd35e4d3c2..73b757a795514 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -549,10 +549,6 @@ namespace std {
             static const auto _Ch_alpha = std::ctype<CharT>::alpha;
         #endif
 
-        static size_type length(const CharT * str) {
-            return std::char_traits<CharT>::length(str);
-        }
-
         CharT translate(CharT c) const {
             return c;
         }

From dcac74792b036176927c0715f924bf6135620cef Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 19:58:36 +0200
Subject: [PATCH 27/29] Using 32bit wchar_t by default, uint32_t on Windows

---
 src/unicode.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 73b757a795514..b7c0fc549653c 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -471,8 +471,16 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
 //   std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
 //   std::wregex supports full 32 bit codepoints, not limited to standard max 0x110000.
 namespace std {
-    using codepoint = uint32_t;  // codepoint type for all template specializations
 
+// codepoint type for all template specializations
+#if (WCHAR_MAX > 0xFFFF)
+    using codepoint = wchar_t;  // sizeof(wchar_t) == 4
+#else
+    using codepoint = uint32_t;  // Windows: sizeof(wchar_t) == 2
+    #define CUSTOM_CTYPE_CODEPOINT
+#endif
+
+#ifdef CUSTOM_CTYPE_CODEPOINT
     // Minimal required implementation for std::regex string processing
     template<>  // custom specialized std::ctype<codepoint>
     class ctype<codepoint> {
@@ -530,6 +538,7 @@ namespace std {
     const std::ctype<codepoint> & use_facet<const std::ctype<codepoint>>(const std::locale & loc) {
         return use_facet<std::ctype<codepoint>>(loc);
     }
+#endif
 
     // Minimal required implementation for std::regex string processing
     template<>  // custom specialized std::regex_traits<codepoint>

From b67c81d1fab3608099e229b916da4dfd2c81d57e Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 20:25:45 +0200
Subject: [PATCH 28/29] Fix previous commit: use std::codepoint in unicode_regex_split_stl

---
 src/unicode.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index b7c0fc549653c..2c98676a869b4 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -697,14 +697,17 @@ static std::vector<uint32_t> unicode_regex_prepare(const std::string & regex) {
 
 // use std::basic_regex<uint32_t> to split the text codepoints
 static std::vector<size_t> unicode_regex_split_stl(const std::vector<uint32_t> & text_cpts, const std::vector<uint32_t> & regex_cpts, const std::vector<size_t> & offsets) {
-    using regex_type = std::basic_regex<uint32_t>;
-    using iter_type = std::regex_iterator<const uint32_t *>;
-    regex_type regex(regex_cpts.begin(), regex_cpts.end());
+    GGML_ASSERT(sizeof(std::codepoint) == sizeof(uint32_t));
+    using regex_type = std::basic_regex<std::codepoint>;
+    using iter_type = std::regex_iterator<const std::codepoint *>;
+
+    const std::codepoint * text_data  = (const std::codepoint *) text_cpts.data();
+    const std::codepoint * regex_data = (const std::codepoint *) regex_cpts.data();
+    regex_type regex(regex_data, regex_data+regex_cpts.size());
     const iter_type end;
 
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size
-    const uint32_t * text_data = text_cpts.data();
     for (auto offset : offsets) {
         iter_type it(text_data, text_data + offset, regex);
         int64_t start_idx = 0;

From db78320b4d20185e1a2155d056d9aa32c93940f8 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 21:19:18 +0200
Subject: [PATCH 29/29] Fix compiler complaints: cast to uint32_t in transform_primary assert

---
 src/unicode.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 2c98676a869b4..7bd10f50bcf14 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -576,7 +576,7 @@ namespace std {
         string_type transform_primary(It first, It last) const {
             (void) first;
             (void) last;
-            GGML_ASSERT(*first < MAX_CODEPOINTS);  // valid codepoint
+            GGML_ASSERT((uint32_t) *first < MAX_CODEPOINTS);  // check valid codepoint
             return {};
         }