From 3d16f647d10c2a11080d4a7f16ab615727e11004 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 20 Jul 2024 22:57:59 +0200 Subject: [PATCH 01/29] Update bruteforce test: - Compare tokenizer vocab tokens. - Bruteforce byte token generator. - Find minimal mismatched substring. --- tests/test-tokenizer-random.py | 162 +++++++++++++++++++++++++++------ 1 file changed, 136 insertions(+), 26 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 9ebe6c89185a3..c17a1cfbd85a7 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -116,9 +116,25 @@ def detokenize(self, ids: list[int], remove_special: bool = False, unparse_speci num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + def get_vocab(self, detokenize=False) -> list[str]: + vocab: list[str] = [] + num_tokens = self.lib.llama_n_vocab(self.model) + for id in range(num_tokens): + if detokenize: + text = self.detokenize([id], remove_special=False, unparse_special=True) + else: + text = self.lib.llama_token_get_text(self.model, id) + text = self.ffi.string(text) + text = str(text, encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + vocab.append(text) + return vocab + class Tokenizer: + def get_vocab(self, detokenize=False) -> list[str]: + raise NotImplementedError + def encode(self, text: str) -> list[int]: raise NotImplementedError @@ -129,7 +145,7 @@ def decode(self, ids: list[int]) -> str: class TokenizerGroundtruth (Tokenizer): def __init__(self, dir_tokenizer: str): - self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer, trust_remote_code=False) # guess BOS and EOS ids = self.encode("a") assert 1 <= 
len(ids) <= 3 @@ -138,15 +154,24 @@ def __init__(self, dir_tokenizer: str): self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token) self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token) # build vocab - tokens = list(self.model.get_vocab().values()) - self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True) - self.vocab = list(sorted(self.vocab)) + self.vocab = self.get_vocab(detokenize=True) # tokens and lists - self.special_tokens = list(self.model.all_special_tokens) - self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False) + self.special_tokens = [self.vocab[i] for i in sorted(self.model.all_special_ids)] + self.added_tokens = [self.vocab[i] for i in sorted(self.model.added_tokens_encoder.values())] self.bos_token = self.model.bos_token self.eos_token = self.model.eos_token + def get_vocab(self, detokenize=False) -> list[str]: + max_token_id = max(self.model.get_vocab().values()) + if detokenize: + ids = list(range(max_token_id + 1)) + vocab = self.model.batch_decode(ids, skip_special_tokens=False) + else: + vocab = [None] * (max_token_id + 1) + for text, id in self.model.get_vocab().items(): + vocab[id] = text + return vocab + def encode(self, text: str) -> list[int]: return self.model.encode(text, add_special_tokens=True) @@ -163,6 +188,9 @@ def __init__(self, vocab_file: str): self.libllama = LibLlama() self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096)) + def get_vocab(self, detokenize=False) -> list[str]: + return self.model.get_vocab(detokenize) + def encode(self, text: str) -> list[int]: return self.model.tokenize(text, add_special=True, parse_special=True) @@ -253,6 +281,23 @@ def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]: yield from tokenizer.vocab +def generator_byte_tokens() -> Iterator[str]: + """Brute force check common byte encoding""" + for a, b in ["<>", 
"[]", "()", ("\\", "")]: + yield from [f"{a}{i}{b}" for i in range(256)] + yield from [f"{a}{i:x}{b}" for i in range(256)] + yield from [f"{a}{i:X}{b}" for i in range(256)] + yield from [f"{a}x{i:x}{b}" for i in range(256)] + yield from [f"{a}x{i:X}{b}" for i in range(256)] + yield from [f"{a}x{i:02x}{b}" for i in range(256)] + yield from [f"{a}x{i:02X}{b}" for i in range(256)] + yield from [f"{a}0x{i:x}{b}" for i in range(256)] + yield from [f"{a}0x{i:X}{b}" for i in range(256)] + yield from [f"{a}0x{i:02x}{b}" for i in range(256)] + yield from [f"{a}0x{i:02X}{b}" for i in range(256)] + yield from [f"{a}{chr(i)}{b}" for i in range(256)] + + def generator_ascii_lr_strip() -> Iterator[str]: WHITESPACES = ["", " ", " "] CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""] @@ -275,10 +320,11 @@ def generator_apostrophe() -> Iterator[str]: yield char1 + lstrip + "'" + rstrip + char2 yield char1 + char2 + lstrip + "'" + rstrip + "z" yield "a" + lstrip + "'" + rstrip + char1 + char2 + yield "a" + lstrip + "'" + char1 + char2 + rstrip + "z" def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]: - WHITESPACES = ["", " ", " ", "\n", "\r\n", "\n\n", "\t", "\t\t"] + WHITESPACES = ["", " ", " ", "\n", "\r\n", "\n\n", "\t", "\t\t", " "] all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens))) for token in all_tokens: for lstrip in WHITESPACES: @@ -436,6 +482,7 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_start = time.perf_counter() encode_errors = 0 decode_errors = 0 + total_tests = 0 MAX_ERRORS = 10 logger.info("%s: %s" % (generator.__qualname__, "ini")) @@ -455,21 +502,44 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_encode2 += t2 - t1 t_decode1 += t3 - t2 t_decode2 += t4 - t3 - if encode_errors < MAX_ERRORS and ids1 != ids2: - i = find_first_mismatch(ids1, ids2) - ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] - ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] + # compare + 
encode_ok = ids1 == ids2 + decode_ok = check_detokenizer(text, text1, text2) + encode_errors += not encode_ok + decode_errors += not decode_ok + total_tests += 1 + if (encode_errors < MAX_ERRORS and not encode_ok) or (decode_errors < MAX_ERRORS and not decode_ok): + def _compare(text: str): + ids1 = tokenizer1.encode(text) + ids2 = tokenizer2.encode(text) + text1 = tokenizer1.decode(ids1) + text2 = tokenizer2.decode(ids1) + encode_ok = ids1 == ids2 + decode_ok = check_detokenizer(text, text1, text2) + ok = encode_ok and decode_ok + return ok, ids1, ids2, text1, text2 + a, b = 0, len(text) + for step in [64, 32, 16, 8, 4, 2, 1]: + while a < b: + t = max(a, b - step) + if _compare(text[a : t])[0]: + break + b = t + for step in [64, 32, 16, 8, 4, 2, 1]: + while a < b: + t = min(a + step, b) + if _compare(text[t : b])[0]: + break + a = t + ok, ids1, ids2, text1, text2 = _compare(text[a : b]) + assert a <= b and not ok + logger.error(" Text:" + repr(text[a : b])) + logger.error(" " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text[a : b])) logger.error(" Expected: " + str(ids1)) logger.error(" Result: " + str(ids2)) - encode_errors += 1 + logger.error(" Expected: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text1)) + logger.error(" Result: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text2)) logger.error(f" {encode_errors=}") - if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2): - i = find_first_mismatch(text1, text2) - text1 = list(text1[max(0, i - 2) : i + 5 + 1]) - text2 = list(text2[max(0, i - 2) : i + 5 + 1]) - logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1)) - logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2)) - decode_errors += 1 logger.error(f" {decode_errors=}") if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: logger.error(f" EXIT: {encode_errors=} {decode_errors=}") @@ -480,6 +550,43 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: 
logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") +def compare_vocabs(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp): + + MAX_PRINT_ERRORS = 10 + + logger.info("compare_vocabs: ini") + + t_start = time.perf_counter() + + for detokenize in (False, True): + vocab1 = tokenizer1.get_vocab(detokenize) + vocab2 = tokenizer2.get_vocab(detokenize) + if vocab1 != vocab2: + num_errors = 0 + for i in range(max(len(vocab1), len(vocab2))): + text1 = vocab1[i] if i < len(vocab1) else None + text2 = vocab2[i] if i < len(vocab2) else None + if text1 != text2: + # is "[UNUSED_TOKEN_" and "[PAD" valid for all models ? #TODO: use toktypes + if text1 is not None: + text1 = text1.replace("[UNUSED_TOKEN_", "[PAD") + if text2 is not None: + text2 = text2.replace("[UNUSED_TOKEN_", "[PAD") + if text1 is None and (text2 or "").startswith('[PAD'): + text2 = None + if text2 is None and (text1 or "").startswith('[PAD'): + text1 = None + if text1 != text2: + num_errors += 1 + if num_errors < MAX_PRINT_ERRORS: + logger.error(f" {detokenize=} id={i} expected={repr(text1)} result={repr(text2)}") + if num_errors: + logger.error(f" {num_errors=}") + + t_total = time.perf_counter() - t_start + logger.info(f"compare_vocabs: end, {t_total=:.3f}") + + def main(argv: list[str] | None = None): parser = argparse.ArgumentParser() parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file") @@ -493,18 +600,21 @@ def main(argv: list[str] | None = None): tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer) tokenizer2 = TokenizerLlamaCpp(args.vocab_file) - # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text()) - # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases()) + compare_vocabs(tokenizer1, tokenizer2) + + compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text()) + compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases()) 
+ compare_tokenizers(tokenizer1, tokenizer2, generator_byte_tokens()) compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip()) compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe()) compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes()) compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1)) compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000)) tokenizer2.model.free() From 5ceab90b4d69dcbea2e006e77af5c8bd8eb15645 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 20 Jul 2024 23:04:23 +0200 Subject: [PATCH 02/29] Store all unicode codepoint categories --- scripts/gen-unicode-data.py | 102 +++++++++++++++++------------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 2d9bde01c3ca7..55ac0af12c29f 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -49,52 +49,41 @@ def unicode_data_iter(): yield (cpt, cpt_lower, cpt_upper, categ, bidir) -# see definition in unicode.h -CODEPOINT_FLAG_UNDEFINED = 
0x0001 # -CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N} -CODEPOINT_FLAG_LETTER = 0x0004 # \p{L} -CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z} -CODEPOINT_FLAG_MARK = 0x0010 # \p{M} -CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P} -CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S} -CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C} - -UNICODE_CATEGORY_TO_FLAG = { - "Cn": CODEPOINT_FLAG_UNDEFINED, # Undefined - "Cc": CODEPOINT_FLAG_CONTROL, # Control - "Cf": CODEPOINT_FLAG_CONTROL, # Format - "Co": CODEPOINT_FLAG_CONTROL, # Private Use - "Cs": CODEPOINT_FLAG_CONTROL, # Surrrogate - "Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter - "Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter - "Lo": CODEPOINT_FLAG_LETTER, # Other Letter - "Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter - "Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter - "L&": CODEPOINT_FLAG_LETTER, # Cased Letter - "Mc": CODEPOINT_FLAG_MARK, # Spacing Mark - "Me": CODEPOINT_FLAG_MARK, # Enclosing Mark - "Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark - "Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number - "Nl": CODEPOINT_FLAG_NUMBER, # Letter Number - "No": CODEPOINT_FLAG_NUMBER, # Other Number - "Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation - "Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation - "Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation - "Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation - "Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation - "Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation - "Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation - "Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol - "Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol - "Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol - "So": CODEPOINT_FLAG_SYMBOL, # Other Symbol - "Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator - "Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator - "Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator +UNICODE_CATEGORY_TO_INDEX = { + "Cn": 0, # \p{Cn} Undefined + "Cc": 1, # \p{Cc} Control + "Cf": 2, # \p{Cf} Format + "Co": 3, # \p{Co} 
Private Use + "Cs": 4, # \p{Cs} Surrrogate + "Ll": 5, # \p{Ll} Lowercase Letter + "Lm": 6, # \p{Lm} Modifier Letter + "Lo": 7, # \p{Lo} Other Letter + "Lt": 8, # \p{Lt} Titlecase Letter + "Lu": 9, # \p{Lu} Uppercase Letter + "Mc": 10, # \p{Mc} Spacing Mark + "Me": 11, # \p{Me} Enclosing Mark + "Mn": 12, # \p{Mn} Nonspacing Mark + "Nd": 13, # \p{Nd} Decimal Number + "Nl": 14, # \p{Nl} Letter Number + "No": 15, # \p{No} Other Number + "Pc": 16, # \p{Pc} Connector Punctuation + "Pd": 17, # \p{Pd} Dash Punctuation + "Pe": 18, # \p{Pe} Close Punctuation + "Pf": 19, # \p{Pf} Final Punctuation + "Pi": 20, # \p{Pi} Initial Punctuation + "Po": 21, # \p{Po} Other Punctuation + "Ps": 22, # \p{Ps} Open Punctuation + "Sc": 23, # \p{Sc} Currency Symbol + "Sk": 24, # \p{Sk} Modifier Symbol + "Sm": 25, # \p{Sm} Math Symbol + "So": 26, # \p{So} Other Symbol + "Zl": 27, # \p{Zl} Line Separator + "Zp": 28, # \p{Zp} Paragraph Separator + "Zs": 29, # \p{Zs} Space Separator } -codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS +codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined table_whitespace = [] table_lowercase = [] table_uppercase = [] @@ -105,7 +94,7 @@ def unicode_data_iter(): char = chr(cpt) # codepoint category flags - codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ] + codepoint_categs[cpt] = UNICODE_CATEGORY_TO_INDEX[categ] # lowercase conversion if cpt_lower: @@ -134,12 +123,17 @@ def unicode_data_iter(): table_nfd.sort() -# group ranges with same flags -ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])] # start, flags -for codepoint, flags in enumerate(codepoint_flags): - if flags != ranges_flags[-1][1]: - ranges_flags.append((codepoint, flags)) -ranges_flags.append((MAX_CODEPOINTS, 0x0000)) +# run length encoding +assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32) +codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length +for cpt, categ in enumerate(codepoint_categs[1:], 1): + prev = 
codepoint_categs_runs[-1] + if prev <= (0xFFFF - 32) and (prev & 31) == categ: + codepoint_categs_runs[-1] += 32 # increment run length + else: + codepoint_categs_runs.append(categ) # new run value + assert (codepoint_categs_runs[-1] < 0xFFFF) +assert (MAX_CODEPOINTS == sum((rle >> 5) + 1 for rle in codepoint_categs_runs)) # group ranges with same nfd @@ -153,7 +147,7 @@ def unicode_data_iter(): # Generate 'unicode-data.cpp': -# python ./scripts//gen-unicode-data.py > unicode-data.cpp +# python ./scripts//gen-unicode-data.py > ./src/unicode-data.cpp def out(line=""): print(line, end='\n') # noqa @@ -170,9 +164,9 @@ def out(line=""): #include """) -out("const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1") -for codepoint, flags in ranges_flags: - out("{0x%06X, 0x%04X}," % (codepoint, flags)) +out("const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length") +for rle in codepoint_categs_runs: + out("0x%04X," % rle) out("};\n") out("const std::unordered_set unicode_set_whitespace = {") From ba4bbbd1ad7692d60f38fb9a65cde7ec6f86158f Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 20 Jul 2024 23:09:33 +0200 Subject: [PATCH 03/29] Reimplement 'codepoint_flags' as 'codepoint_categ' --- src/unicode.h | 144 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 40 deletions(-) diff --git a/src/unicode.h b/src/unicode.h index 30b07ba7fa493..f9f4fcc8cc7a0 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -4,46 +4,110 @@ #include #include -struct codepoint_flags { - enum { - UNDEFINED = 0x0001, - NUMBER = 0x0002, // regex: \p{N} - LETTER = 0x0004, // regex: \p{L} - SEPARATOR = 0x0008, // regex: \p{Z} - ACCENT_MARK = 0x0010, // regex: \p{M} - PUNCTUATION = 0x0020, // regex: \p{P} - SYMBOL = 0x0040, // regex: \p{S} - CONTROL = 0x0080, // regex: \p{C} - MASK_CATEGORIES = 0x00FF, +struct codepoint_categ { + enum _category : uint16_t { + UNDEF = 0, // \p{Cn} Undefined + 
C = 1 << 0, // \p{C} Control + L = 1 << 1, // \p{L} Letter + M = 1 << 2, // \p{M} Mark + N = 1 << 3, // \p{N} Number + P = 1 << 4, // \p{P} Punctuation + S = 1 << 5, // \p{S} Symbol + Z = 1 << 6, // \p{Z} Separator + MASK = (1 << 7) - 1 // 7 bits }; - // codepoint type - uint16_t is_undefined : 1; - uint16_t is_number : 1; // regex: \p{N} - uint16_t is_letter : 1; // regex: \p{L} - uint16_t is_separator : 1; // regex: \p{Z} - uint16_t is_accent_mark : 1; // regex: \p{M} - uint16_t is_punctuation : 1; // regex: \p{P} - uint16_t is_symbol : 1; // regex: \p{S} - uint16_t is_control : 1; // regex: \p{C} - // helper flags - uint16_t is_whitespace : 1; // regex: \s - uint16_t is_lowercase : 1; - uint16_t is_uppercase : 1; - uint16_t is_nfd : 1; - - // decode from uint16 - inline codepoint_flags(const uint16_t flags=0) { - *reinterpret_cast(this) = flags; - } - - inline uint16_t as_uint() const { - return *reinterpret_cast(this); - } - - inline uint16_t category_flag() const { - return this->as_uint() & MASK_CATEGORIES; - } + enum _subcategory : uint16_t { + Cc = C | (1 << 7), // \p{Cc} Control + Cf = C | (2 << 7), // \p{Cf} Format + Co = C | (3 << 7), // \p{Co} Private Use + Cs = C | (4 << 7), // \p{Cs} Surrrogate + Ll = L | (1 << 7), // \p{Ll} Lowercase Letter + Lm = L | (2 << 7), // \p{Lm} Modifier Letter + Lo = L | (3 << 7), // \p{Lo} Other Letter + Lt = L | (4 << 7), // \p{Lt} Titlecase Letter + Lu = L | (5 << 7), // \p{Lu} Uppercase Letter + Mc = M | (1 << 7), // \p{Mc} Spacing Mark + Me = M | (2 << 7), // \p{Me} Enclosing Mark + Mn = M | (3 << 7), // \p{Mn} Nonspacing Mark + Nd = N | (1 << 7), // \p{Nd} Decimal Number + Nl = N | (2 << 7), // \p{Nl} Letter Number + No = N | (3 << 7), // \p{No} Other Number + Pc = P | (1 << 7), // \p{Pc} Connector Punctuation + Pd = P | (2 << 7), // \p{Pd} Dash Punctuation + Pe = P | (3 << 7), // \p{Pe} Close Punctuation + Pf = P | (4 << 7), // \p{Pf} Final Punctuation + Pi = P | (5 << 7), // \p{Pi} Initial Punctuation + Po = P | (6 
<< 7), // \p{Po} Other Punctuation + Ps = P | (7 << 7), // \p{Ps} Open Punctuation + Sc = S | (1 << 7), // \p{Sc} Currency Symbol + Sk = S | (2 << 7), // \p{Sk} Modifier Symbol + Sm = S | (3 << 7), // \p{Sm} Math Symbol + So = S | (4 << 7), // \p{So} Other Symbol + Zl = Z | (1 << 7), // \p{Zl} Line Separator + Zp = Z | (2 << 7), // \p{Zp} Paragraph Separator + Zs = Z | (3 << 7), // \p{Zs} Space Separator + SUBMASK = (1 << 10) - 1 // 7+3 bits + }; + + enum _flags : uint16_t { + WHITESPACE = (1 << 10), // regex: \s + LOWERCASE = (1 << 11), + UPPERCASE = (1 << 12), + //Norm NFD/NFC = ..., + }; + + inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {} + + inline uint8_t get_category() const { return encoded & MASK; } + inline uint8_t get_subcategory() const { return encoded & SUBMASK; } + + inline bool is_undefined() const { return !encoded; } + inline bool is_defined() const { return encoded; } + + inline auto is_whitespace() const { return encoded & WHITESPACE; } + inline auto is_lowercase() const { return encoded & LOWERCASE; } + inline auto is_uppercase() const { return encoded & UPPERCASE; } + + inline auto is_C() const { return encoded & C; } + inline auto is_L() const { return encoded & L; } + inline auto is_M() const { return encoded & M; } + inline auto is_N() const { return encoded & N; } + inline auto is_P() const { return encoded & P; } + inline auto is_S() const { return encoded & S; } + inline auto is_Z() const { return encoded & Z; } + + inline auto is_Cc() const { return (encoded & SUBMASK) == Cc; } + inline auto is_Cf() const { return (encoded & SUBMASK) == Cf; } + inline auto is_Co() const { return (encoded & SUBMASK) == Co; } + inline auto is_Cs() const { return (encoded & SUBMASK) == Cs; } + inline auto is_Ll() const { return (encoded & SUBMASK) == Ll; } + inline auto is_Lm() const { return (encoded & SUBMASK) == Lm; } + inline auto is_Lo() const { return (encoded & SUBMASK) == Lo; } + inline auto is_Lt() const { return (encoded & 
SUBMASK) == Lt; } + inline auto is_Lu() const { return (encoded & SUBMASK) == Lu; } + inline auto is_Mc() const { return (encoded & SUBMASK) == Mc; } + inline auto is_Me() const { return (encoded & SUBMASK) == Me; } + inline auto is_Mn() const { return (encoded & SUBMASK) == Mn; } + inline auto is_Nd() const { return (encoded & SUBMASK) == Nd; } + inline auto is_Nl() const { return (encoded & SUBMASK) == Nl; } + inline auto is_No() const { return (encoded & SUBMASK) == No; } + inline auto is_Pc() const { return (encoded & SUBMASK) == Pc; } + inline auto is_Pd() const { return (encoded & SUBMASK) == Pd; } + inline auto is_Pe() const { return (encoded & SUBMASK) == Pe; } + inline auto is_Pf() const { return (encoded & SUBMASK) == Pf; } + inline auto is_Pi() const { return (encoded & SUBMASK) == Pi; } + inline auto is_Po() const { return (encoded & SUBMASK) == Po; } + inline auto is_Ps() const { return (encoded & SUBMASK) == Ps; } + inline auto is_Sc() const { return (encoded & SUBMASK) == Sc; } + inline auto is_Sk() const { return (encoded & SUBMASK) == Sk; } + inline auto is_Sm() const { return (encoded & SUBMASK) == Sm; } + inline auto is_So() const { return (encoded & SUBMASK) == So; } + inline auto is_Zl() const { return (encoded & SUBMASK) == Zl; } + inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; } + inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; } + + uint16_t encoded; }; @@ -53,8 +117,8 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8); std::vector unicode_cpts_normalize_nfd(const std::vector & cpts); -codepoint_flags unicode_cpt_flags(const uint32_t cp); -codepoint_flags unicode_cpt_flags(const std::string & utf8); +codepoint_categ unicode_cpt_category(const uint32_t cp); +codepoint_categ unicode_cpt_category(const std::string & utf8); std::string unicode_byte_to_utf8(uint8_t byte); uint8_t unicode_utf8_to_byte(const std::string & utf8); From 8f9f05bf6de7177d90107fde3ea80cbabdecc6ed Mon Sep 17 00:00:00 2001 From: 
jaime-m-p <> Date: Sat, 20 Jul 2024 23:12:08 +0200 Subject: [PATCH 04/29] Update unicode data --- src/unicode-data.cpp | 6793 ++++++++++++++++++++++++++++-------------- src/unicode-data.h | 2 +- 2 files changed, 4520 insertions(+), 2275 deletions(-) diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 02bdf782380fe..4a0c0547c7d03 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -7,2280 +7,4524 @@ #include #include -const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1 -{0x000000, 0x0080}, -{0x000020, 0x0008}, -{0x000021, 0x0020}, -{0x000024, 0x0040}, -{0x000025, 0x0020}, -{0x00002B, 0x0040}, -{0x00002C, 0x0020}, -{0x000030, 0x0002}, -{0x00003A, 0x0020}, -{0x00003C, 0x0040}, -{0x00003F, 0x0020}, -{0x000041, 0x0004}, -{0x00005B, 0x0020}, -{0x00005E, 0x0040}, -{0x00005F, 0x0020}, -{0x000060, 0x0040}, -{0x000061, 0x0004}, -{0x00007B, 0x0020}, -{0x00007C, 0x0040}, -{0x00007D, 0x0020}, -{0x00007E, 0x0040}, -{0x00007F, 0x0080}, -{0x0000A0, 0x0008}, -{0x0000A1, 0x0020}, -{0x0000A2, 0x0040}, -{0x0000A7, 0x0020}, -{0x0000A8, 0x0040}, -{0x0000AA, 0x0004}, -{0x0000AB, 0x0020}, -{0x0000AC, 0x0040}, -{0x0000AD, 0x0080}, -{0x0000AE, 0x0040}, -{0x0000B2, 0x0002}, -{0x0000B4, 0x0040}, -{0x0000B5, 0x0004}, -{0x0000B6, 0x0020}, -{0x0000B8, 0x0040}, -{0x0000B9, 0x0002}, -{0x0000BA, 0x0004}, -{0x0000BB, 0x0020}, -{0x0000BC, 0x0002}, -{0x0000BF, 0x0020}, -{0x0000C0, 0x0004}, -{0x0000D7, 0x0040}, -{0x0000D8, 0x0004}, -{0x0000F7, 0x0040}, -{0x0000F8, 0x0004}, -{0x0002C2, 0x0040}, -{0x0002C6, 0x0004}, -{0x0002D2, 0x0040}, -{0x0002E0, 0x0004}, -{0x0002E5, 0x0040}, -{0x0002EC, 0x0004}, -{0x0002ED, 0x0040}, -{0x0002EE, 0x0004}, -{0x0002EF, 0x0040}, -{0x000300, 0x0010}, -{0x000370, 0x0004}, -{0x000375, 0x0040}, -{0x000376, 0x0004}, -{0x000378, 0x0001}, -{0x00037A, 0x0004}, -{0x00037E, 0x0020}, -{0x00037F, 0x0004}, -{0x000380, 0x0001}, -{0x000384, 0x0040}, -{0x000386, 0x0004}, -{0x000387, 0x0020}, -{0x000388, 0x0004}, -{0x00038B, 0x0001}, 
-{0x00038C, 0x0004}, -{0x00038D, 0x0001}, -{0x00038E, 0x0004}, -{0x0003A2, 0x0001}, -{0x0003A3, 0x0004}, -{0x0003F6, 0x0040}, -{0x0003F7, 0x0004}, -{0x000482, 0x0040}, -{0x000483, 0x0010}, -{0x00048A, 0x0004}, -{0x000530, 0x0001}, -{0x000531, 0x0004}, -{0x000557, 0x0001}, -{0x000559, 0x0004}, -{0x00055A, 0x0020}, -{0x000560, 0x0004}, -{0x000589, 0x0020}, -{0x00058B, 0x0001}, -{0x00058D, 0x0040}, -{0x000590, 0x0001}, -{0x000591, 0x0010}, -{0x0005BE, 0x0020}, -{0x0005BF, 0x0010}, -{0x0005C0, 0x0020}, -{0x0005C1, 0x0010}, -{0x0005C3, 0x0020}, -{0x0005C4, 0x0010}, -{0x0005C6, 0x0020}, -{0x0005C7, 0x0010}, -{0x0005C8, 0x0001}, -{0x0005D0, 0x0004}, -{0x0005EB, 0x0001}, -{0x0005EF, 0x0004}, -{0x0005F3, 0x0020}, -{0x0005F5, 0x0001}, -{0x000600, 0x0080}, -{0x000606, 0x0040}, -{0x000609, 0x0020}, -{0x00060B, 0x0040}, -{0x00060C, 0x0020}, -{0x00060E, 0x0040}, -{0x000610, 0x0010}, -{0x00061B, 0x0020}, -{0x00061C, 0x0080}, -{0x00061D, 0x0020}, -{0x000620, 0x0004}, -{0x00064B, 0x0010}, -{0x000660, 0x0002}, -{0x00066A, 0x0020}, -{0x00066E, 0x0004}, -{0x000670, 0x0010}, -{0x000671, 0x0004}, -{0x0006D4, 0x0020}, -{0x0006D5, 0x0004}, -{0x0006D6, 0x0010}, -{0x0006DD, 0x0080}, -{0x0006DE, 0x0040}, -{0x0006DF, 0x0010}, -{0x0006E5, 0x0004}, -{0x0006E7, 0x0010}, -{0x0006E9, 0x0040}, -{0x0006EA, 0x0010}, -{0x0006EE, 0x0004}, -{0x0006F0, 0x0002}, -{0x0006FA, 0x0004}, -{0x0006FD, 0x0040}, -{0x0006FF, 0x0004}, -{0x000700, 0x0020}, -{0x00070E, 0x0001}, -{0x00070F, 0x0080}, -{0x000710, 0x0004}, -{0x000711, 0x0010}, -{0x000712, 0x0004}, -{0x000730, 0x0010}, -{0x00074B, 0x0001}, -{0x00074D, 0x0004}, -{0x0007A6, 0x0010}, -{0x0007B1, 0x0004}, -{0x0007B2, 0x0001}, -{0x0007C0, 0x0002}, -{0x0007CA, 0x0004}, -{0x0007EB, 0x0010}, -{0x0007F4, 0x0004}, -{0x0007F6, 0x0040}, -{0x0007F7, 0x0020}, -{0x0007FA, 0x0004}, -{0x0007FB, 0x0001}, -{0x0007FD, 0x0010}, -{0x0007FE, 0x0040}, -{0x000800, 0x0004}, -{0x000816, 0x0010}, -{0x00081A, 0x0004}, -{0x00081B, 0x0010}, -{0x000824, 0x0004}, -{0x000825, 0x0010}, 
-{0x000828, 0x0004}, -{0x000829, 0x0010}, -{0x00082E, 0x0001}, -{0x000830, 0x0020}, -{0x00083F, 0x0001}, -{0x000840, 0x0004}, -{0x000859, 0x0010}, -{0x00085C, 0x0001}, -{0x00085E, 0x0020}, -{0x00085F, 0x0001}, -{0x000860, 0x0004}, -{0x00086B, 0x0001}, -{0x000870, 0x0004}, -{0x000888, 0x0040}, -{0x000889, 0x0004}, -{0x00088F, 0x0001}, -{0x000890, 0x0080}, -{0x000892, 0x0001}, -{0x000898, 0x0010}, -{0x0008A0, 0x0004}, -{0x0008CA, 0x0010}, -{0x0008E2, 0x0080}, -{0x0008E3, 0x0010}, -{0x000904, 0x0004}, -{0x00093A, 0x0010}, -{0x00093D, 0x0004}, -{0x00093E, 0x0010}, -{0x000950, 0x0004}, -{0x000951, 0x0010}, -{0x000958, 0x0004}, -{0x000962, 0x0010}, -{0x000964, 0x0020}, -{0x000966, 0x0002}, -{0x000970, 0x0020}, -{0x000971, 0x0004}, -{0x000981, 0x0010}, -{0x000984, 0x0001}, -{0x000985, 0x0004}, -{0x00098D, 0x0001}, -{0x00098F, 0x0004}, -{0x000991, 0x0001}, -{0x000993, 0x0004}, -{0x0009A9, 0x0001}, -{0x0009AA, 0x0004}, -{0x0009B1, 0x0001}, -{0x0009B2, 0x0004}, -{0x0009B3, 0x0001}, -{0x0009B6, 0x0004}, -{0x0009BA, 0x0001}, -{0x0009BC, 0x0010}, -{0x0009BD, 0x0004}, -{0x0009BE, 0x0010}, -{0x0009C5, 0x0001}, -{0x0009C7, 0x0010}, -{0x0009C9, 0x0001}, -{0x0009CB, 0x0010}, -{0x0009CE, 0x0004}, -{0x0009CF, 0x0001}, -{0x0009D7, 0x0010}, -{0x0009D8, 0x0001}, -{0x0009DC, 0x0004}, -{0x0009DE, 0x0001}, -{0x0009DF, 0x0004}, -{0x0009E2, 0x0010}, -{0x0009E4, 0x0001}, -{0x0009E6, 0x0002}, -{0x0009F0, 0x0004}, -{0x0009F2, 0x0040}, -{0x0009F4, 0x0002}, -{0x0009FA, 0x0040}, -{0x0009FC, 0x0004}, -{0x0009FD, 0x0020}, -{0x0009FE, 0x0010}, -{0x0009FF, 0x0001}, -{0x000A01, 0x0010}, -{0x000A04, 0x0001}, -{0x000A05, 0x0004}, -{0x000A0B, 0x0001}, -{0x000A0F, 0x0004}, -{0x000A11, 0x0001}, -{0x000A13, 0x0004}, -{0x000A29, 0x0001}, -{0x000A2A, 0x0004}, -{0x000A31, 0x0001}, -{0x000A32, 0x0004}, -{0x000A34, 0x0001}, -{0x000A35, 0x0004}, -{0x000A37, 0x0001}, -{0x000A38, 0x0004}, -{0x000A3A, 0x0001}, -{0x000A3C, 0x0010}, -{0x000A3D, 0x0001}, -{0x000A3E, 0x0010}, -{0x000A43, 0x0001}, -{0x000A47, 0x0010}, 
-{0x000A49, 0x0001}, -{0x000A4B, 0x0010}, -{0x000A4E, 0x0001}, -{0x000A51, 0x0010}, -{0x000A52, 0x0001}, -{0x000A59, 0x0004}, -{0x000A5D, 0x0001}, -{0x000A5E, 0x0004}, -{0x000A5F, 0x0001}, -{0x000A66, 0x0002}, -{0x000A70, 0x0010}, -{0x000A72, 0x0004}, -{0x000A75, 0x0010}, -{0x000A76, 0x0020}, -{0x000A77, 0x0001}, -{0x000A81, 0x0010}, -{0x000A84, 0x0001}, -{0x000A85, 0x0004}, -{0x000A8E, 0x0001}, -{0x000A8F, 0x0004}, -{0x000A92, 0x0001}, -{0x000A93, 0x0004}, -{0x000AA9, 0x0001}, -{0x000AAA, 0x0004}, -{0x000AB1, 0x0001}, -{0x000AB2, 0x0004}, -{0x000AB4, 0x0001}, -{0x000AB5, 0x0004}, -{0x000ABA, 0x0001}, -{0x000ABC, 0x0010}, -{0x000ABD, 0x0004}, -{0x000ABE, 0x0010}, -{0x000AC6, 0x0001}, -{0x000AC7, 0x0010}, -{0x000ACA, 0x0001}, -{0x000ACB, 0x0010}, -{0x000ACE, 0x0001}, -{0x000AD0, 0x0004}, -{0x000AD1, 0x0001}, -{0x000AE0, 0x0004}, -{0x000AE2, 0x0010}, -{0x000AE4, 0x0001}, -{0x000AE6, 0x0002}, -{0x000AF0, 0x0020}, -{0x000AF1, 0x0040}, -{0x000AF2, 0x0001}, -{0x000AF9, 0x0004}, -{0x000AFA, 0x0010}, -{0x000B00, 0x0001}, -{0x000B01, 0x0010}, -{0x000B04, 0x0001}, -{0x000B05, 0x0004}, -{0x000B0D, 0x0001}, -{0x000B0F, 0x0004}, -{0x000B11, 0x0001}, -{0x000B13, 0x0004}, -{0x000B29, 0x0001}, -{0x000B2A, 0x0004}, -{0x000B31, 0x0001}, -{0x000B32, 0x0004}, -{0x000B34, 0x0001}, -{0x000B35, 0x0004}, -{0x000B3A, 0x0001}, -{0x000B3C, 0x0010}, -{0x000B3D, 0x0004}, -{0x000B3E, 0x0010}, -{0x000B45, 0x0001}, -{0x000B47, 0x0010}, -{0x000B49, 0x0001}, -{0x000B4B, 0x0010}, -{0x000B4E, 0x0001}, -{0x000B55, 0x0010}, -{0x000B58, 0x0001}, -{0x000B5C, 0x0004}, -{0x000B5E, 0x0001}, -{0x000B5F, 0x0004}, -{0x000B62, 0x0010}, -{0x000B64, 0x0001}, -{0x000B66, 0x0002}, -{0x000B70, 0x0040}, -{0x000B71, 0x0004}, -{0x000B72, 0x0002}, -{0x000B78, 0x0001}, -{0x000B82, 0x0010}, -{0x000B83, 0x0004}, -{0x000B84, 0x0001}, -{0x000B85, 0x0004}, -{0x000B8B, 0x0001}, -{0x000B8E, 0x0004}, -{0x000B91, 0x0001}, -{0x000B92, 0x0004}, -{0x000B96, 0x0001}, -{0x000B99, 0x0004}, -{0x000B9B, 0x0001}, -{0x000B9C, 0x0004}, 
-{0x000B9D, 0x0001}, -{0x000B9E, 0x0004}, -{0x000BA0, 0x0001}, -{0x000BA3, 0x0004}, -{0x000BA5, 0x0001}, -{0x000BA8, 0x0004}, -{0x000BAB, 0x0001}, -{0x000BAE, 0x0004}, -{0x000BBA, 0x0001}, -{0x000BBE, 0x0010}, -{0x000BC3, 0x0001}, -{0x000BC6, 0x0010}, -{0x000BC9, 0x0001}, -{0x000BCA, 0x0010}, -{0x000BCE, 0x0001}, -{0x000BD0, 0x0004}, -{0x000BD1, 0x0001}, -{0x000BD7, 0x0010}, -{0x000BD8, 0x0001}, -{0x000BE6, 0x0002}, -{0x000BF3, 0x0040}, -{0x000BFB, 0x0001}, -{0x000C00, 0x0010}, -{0x000C05, 0x0004}, -{0x000C0D, 0x0001}, -{0x000C0E, 0x0004}, -{0x000C11, 0x0001}, -{0x000C12, 0x0004}, -{0x000C29, 0x0001}, -{0x000C2A, 0x0004}, -{0x000C3A, 0x0001}, -{0x000C3C, 0x0010}, -{0x000C3D, 0x0004}, -{0x000C3E, 0x0010}, -{0x000C45, 0x0001}, -{0x000C46, 0x0010}, -{0x000C49, 0x0001}, -{0x000C4A, 0x0010}, -{0x000C4E, 0x0001}, -{0x000C55, 0x0010}, -{0x000C57, 0x0001}, -{0x000C58, 0x0004}, -{0x000C5B, 0x0001}, -{0x000C5D, 0x0004}, -{0x000C5E, 0x0001}, -{0x000C60, 0x0004}, -{0x000C62, 0x0010}, -{0x000C64, 0x0001}, -{0x000C66, 0x0002}, -{0x000C70, 0x0001}, -{0x000C77, 0x0020}, -{0x000C78, 0x0002}, -{0x000C7F, 0x0040}, -{0x000C80, 0x0004}, -{0x000C81, 0x0010}, -{0x000C84, 0x0020}, -{0x000C85, 0x0004}, -{0x000C8D, 0x0001}, -{0x000C8E, 0x0004}, -{0x000C91, 0x0001}, -{0x000C92, 0x0004}, -{0x000CA9, 0x0001}, -{0x000CAA, 0x0004}, -{0x000CB4, 0x0001}, -{0x000CB5, 0x0004}, -{0x000CBA, 0x0001}, -{0x000CBC, 0x0010}, -{0x000CBD, 0x0004}, -{0x000CBE, 0x0010}, -{0x000CC5, 0x0001}, -{0x000CC6, 0x0010}, -{0x000CC9, 0x0001}, -{0x000CCA, 0x0010}, -{0x000CCE, 0x0001}, -{0x000CD5, 0x0010}, -{0x000CD7, 0x0001}, -{0x000CDD, 0x0004}, -{0x000CDF, 0x0001}, -{0x000CE0, 0x0004}, -{0x000CE2, 0x0010}, -{0x000CE4, 0x0001}, -{0x000CE6, 0x0002}, -{0x000CF0, 0x0001}, -{0x000CF1, 0x0004}, -{0x000CF3, 0x0010}, -{0x000CF4, 0x0001}, -{0x000D00, 0x0010}, -{0x000D04, 0x0004}, -{0x000D0D, 0x0001}, -{0x000D0E, 0x0004}, -{0x000D11, 0x0001}, -{0x000D12, 0x0004}, -{0x000D3B, 0x0010}, -{0x000D3D, 0x0004}, -{0x000D3E, 0x0010}, 
-{0x000D45, 0x0001}, -{0x000D46, 0x0010}, -{0x000D49, 0x0001}, -{0x000D4A, 0x0010}, -{0x000D4E, 0x0004}, -{0x000D4F, 0x0040}, -{0x000D50, 0x0001}, -{0x000D54, 0x0004}, -{0x000D57, 0x0010}, -{0x000D58, 0x0002}, -{0x000D5F, 0x0004}, -{0x000D62, 0x0010}, -{0x000D64, 0x0001}, -{0x000D66, 0x0002}, -{0x000D79, 0x0040}, -{0x000D7A, 0x0004}, -{0x000D80, 0x0001}, -{0x000D81, 0x0010}, -{0x000D84, 0x0001}, -{0x000D85, 0x0004}, -{0x000D97, 0x0001}, -{0x000D9A, 0x0004}, -{0x000DB2, 0x0001}, -{0x000DB3, 0x0004}, -{0x000DBC, 0x0001}, -{0x000DBD, 0x0004}, -{0x000DBE, 0x0001}, -{0x000DC0, 0x0004}, -{0x000DC7, 0x0001}, -{0x000DCA, 0x0010}, -{0x000DCB, 0x0001}, -{0x000DCF, 0x0010}, -{0x000DD5, 0x0001}, -{0x000DD6, 0x0010}, -{0x000DD7, 0x0001}, -{0x000DD8, 0x0010}, -{0x000DE0, 0x0001}, -{0x000DE6, 0x0002}, -{0x000DF0, 0x0001}, -{0x000DF2, 0x0010}, -{0x000DF4, 0x0020}, -{0x000DF5, 0x0001}, -{0x000E01, 0x0004}, -{0x000E31, 0x0010}, -{0x000E32, 0x0004}, -{0x000E34, 0x0010}, -{0x000E3B, 0x0001}, -{0x000E3F, 0x0040}, -{0x000E40, 0x0004}, -{0x000E47, 0x0010}, -{0x000E4F, 0x0020}, -{0x000E50, 0x0002}, -{0x000E5A, 0x0020}, -{0x000E5C, 0x0001}, -{0x000E81, 0x0004}, -{0x000E83, 0x0001}, -{0x000E84, 0x0004}, -{0x000E85, 0x0001}, -{0x000E86, 0x0004}, -{0x000E8B, 0x0001}, -{0x000E8C, 0x0004}, -{0x000EA4, 0x0001}, -{0x000EA5, 0x0004}, -{0x000EA6, 0x0001}, -{0x000EA7, 0x0004}, -{0x000EB1, 0x0010}, -{0x000EB2, 0x0004}, -{0x000EB4, 0x0010}, -{0x000EBD, 0x0004}, -{0x000EBE, 0x0001}, -{0x000EC0, 0x0004}, -{0x000EC5, 0x0001}, -{0x000EC6, 0x0004}, -{0x000EC7, 0x0001}, -{0x000EC8, 0x0010}, -{0x000ECF, 0x0001}, -{0x000ED0, 0x0002}, -{0x000EDA, 0x0001}, -{0x000EDC, 0x0004}, -{0x000EE0, 0x0001}, -{0x000F00, 0x0004}, -{0x000F01, 0x0040}, -{0x000F04, 0x0020}, -{0x000F13, 0x0040}, -{0x000F14, 0x0020}, -{0x000F15, 0x0040}, -{0x000F18, 0x0010}, -{0x000F1A, 0x0040}, -{0x000F20, 0x0002}, -{0x000F34, 0x0040}, -{0x000F35, 0x0010}, -{0x000F36, 0x0040}, -{0x000F37, 0x0010}, -{0x000F38, 0x0040}, -{0x000F39, 0x0010}, 
-{0x000F3A, 0x0020}, -{0x000F3E, 0x0010}, -{0x000F40, 0x0004}, -{0x000F48, 0x0001}, -{0x000F49, 0x0004}, -{0x000F6D, 0x0001}, -{0x000F71, 0x0010}, -{0x000F85, 0x0020}, -{0x000F86, 0x0010}, -{0x000F88, 0x0004}, -{0x000F8D, 0x0010}, -{0x000F98, 0x0001}, -{0x000F99, 0x0010}, -{0x000FBD, 0x0001}, -{0x000FBE, 0x0040}, -{0x000FC6, 0x0010}, -{0x000FC7, 0x0040}, -{0x000FCD, 0x0001}, -{0x000FCE, 0x0040}, -{0x000FD0, 0x0020}, -{0x000FD5, 0x0040}, -{0x000FD9, 0x0020}, -{0x000FDB, 0x0001}, -{0x001000, 0x0004}, -{0x00102B, 0x0010}, -{0x00103F, 0x0004}, -{0x001040, 0x0002}, -{0x00104A, 0x0020}, -{0x001050, 0x0004}, -{0x001056, 0x0010}, -{0x00105A, 0x0004}, -{0x00105E, 0x0010}, -{0x001061, 0x0004}, -{0x001062, 0x0010}, -{0x001065, 0x0004}, -{0x001067, 0x0010}, -{0x00106E, 0x0004}, -{0x001071, 0x0010}, -{0x001075, 0x0004}, -{0x001082, 0x0010}, -{0x00108E, 0x0004}, -{0x00108F, 0x0010}, -{0x001090, 0x0002}, -{0x00109A, 0x0010}, -{0x00109E, 0x0040}, -{0x0010A0, 0x0004}, -{0x0010C6, 0x0001}, -{0x0010C7, 0x0004}, -{0x0010C8, 0x0001}, -{0x0010CD, 0x0004}, -{0x0010CE, 0x0001}, -{0x0010D0, 0x0004}, -{0x0010FB, 0x0020}, -{0x0010FC, 0x0004}, -{0x001249, 0x0001}, -{0x00124A, 0x0004}, -{0x00124E, 0x0001}, -{0x001250, 0x0004}, -{0x001257, 0x0001}, -{0x001258, 0x0004}, -{0x001259, 0x0001}, -{0x00125A, 0x0004}, -{0x00125E, 0x0001}, -{0x001260, 0x0004}, -{0x001289, 0x0001}, -{0x00128A, 0x0004}, -{0x00128E, 0x0001}, -{0x001290, 0x0004}, -{0x0012B1, 0x0001}, -{0x0012B2, 0x0004}, -{0x0012B6, 0x0001}, -{0x0012B8, 0x0004}, -{0x0012BF, 0x0001}, -{0x0012C0, 0x0004}, -{0x0012C1, 0x0001}, -{0x0012C2, 0x0004}, -{0x0012C6, 0x0001}, -{0x0012C8, 0x0004}, -{0x0012D7, 0x0001}, -{0x0012D8, 0x0004}, -{0x001311, 0x0001}, -{0x001312, 0x0004}, -{0x001316, 0x0001}, -{0x001318, 0x0004}, -{0x00135B, 0x0001}, -{0x00135D, 0x0010}, -{0x001360, 0x0020}, -{0x001369, 0x0002}, -{0x00137D, 0x0001}, -{0x001380, 0x0004}, -{0x001390, 0x0040}, -{0x00139A, 0x0001}, -{0x0013A0, 0x0004}, -{0x0013F6, 0x0001}, -{0x0013F8, 0x0004}, 
-{0x0013FE, 0x0001}, -{0x001400, 0x0020}, -{0x001401, 0x0004}, -{0x00166D, 0x0040}, -{0x00166E, 0x0020}, -{0x00166F, 0x0004}, -{0x001680, 0x0008}, -{0x001681, 0x0004}, -{0x00169B, 0x0020}, -{0x00169D, 0x0001}, -{0x0016A0, 0x0004}, -{0x0016EB, 0x0020}, -{0x0016EE, 0x0002}, -{0x0016F1, 0x0004}, -{0x0016F9, 0x0001}, -{0x001700, 0x0004}, -{0x001712, 0x0010}, -{0x001716, 0x0001}, -{0x00171F, 0x0004}, -{0x001732, 0x0010}, -{0x001735, 0x0020}, -{0x001737, 0x0001}, -{0x001740, 0x0004}, -{0x001752, 0x0010}, -{0x001754, 0x0001}, -{0x001760, 0x0004}, -{0x00176D, 0x0001}, -{0x00176E, 0x0004}, -{0x001771, 0x0001}, -{0x001772, 0x0010}, -{0x001774, 0x0001}, -{0x001780, 0x0004}, -{0x0017B4, 0x0010}, -{0x0017D4, 0x0020}, -{0x0017D7, 0x0004}, -{0x0017D8, 0x0020}, -{0x0017DB, 0x0040}, -{0x0017DC, 0x0004}, -{0x0017DD, 0x0010}, -{0x0017DE, 0x0001}, -{0x0017E0, 0x0002}, -{0x0017EA, 0x0001}, -{0x0017F0, 0x0002}, -{0x0017FA, 0x0001}, -{0x001800, 0x0020}, -{0x00180B, 0x0010}, -{0x00180E, 0x0080}, -{0x00180F, 0x0010}, -{0x001810, 0x0002}, -{0x00181A, 0x0001}, -{0x001820, 0x0004}, -{0x001879, 0x0001}, -{0x001880, 0x0004}, -{0x001885, 0x0010}, -{0x001887, 0x0004}, -{0x0018A9, 0x0010}, -{0x0018AA, 0x0004}, -{0x0018AB, 0x0001}, -{0x0018B0, 0x0004}, -{0x0018F6, 0x0001}, -{0x001900, 0x0004}, -{0x00191F, 0x0001}, -{0x001920, 0x0010}, -{0x00192C, 0x0001}, -{0x001930, 0x0010}, -{0x00193C, 0x0001}, -{0x001940, 0x0040}, -{0x001941, 0x0001}, -{0x001944, 0x0020}, -{0x001946, 0x0002}, -{0x001950, 0x0004}, -{0x00196E, 0x0001}, -{0x001970, 0x0004}, -{0x001975, 0x0001}, -{0x001980, 0x0004}, -{0x0019AC, 0x0001}, -{0x0019B0, 0x0004}, -{0x0019CA, 0x0001}, -{0x0019D0, 0x0002}, -{0x0019DB, 0x0001}, -{0x0019DE, 0x0040}, -{0x001A00, 0x0004}, -{0x001A17, 0x0010}, -{0x001A1C, 0x0001}, -{0x001A1E, 0x0020}, -{0x001A20, 0x0004}, -{0x001A55, 0x0010}, -{0x001A5F, 0x0001}, -{0x001A60, 0x0010}, -{0x001A7D, 0x0001}, -{0x001A7F, 0x0010}, -{0x001A80, 0x0002}, -{0x001A8A, 0x0001}, -{0x001A90, 0x0002}, -{0x001A9A, 0x0001}, 
-{0x001AA0, 0x0020}, -{0x001AA7, 0x0004}, -{0x001AA8, 0x0020}, -{0x001AAE, 0x0001}, -{0x001AB0, 0x0010}, -{0x001ACF, 0x0001}, -{0x001B00, 0x0010}, -{0x001B05, 0x0004}, -{0x001B34, 0x0010}, -{0x001B45, 0x0004}, -{0x001B4D, 0x0001}, -{0x001B50, 0x0002}, -{0x001B5A, 0x0020}, -{0x001B61, 0x0040}, -{0x001B6B, 0x0010}, -{0x001B74, 0x0040}, -{0x001B7D, 0x0020}, -{0x001B7F, 0x0001}, -{0x001B80, 0x0010}, -{0x001B83, 0x0004}, -{0x001BA1, 0x0010}, -{0x001BAE, 0x0004}, -{0x001BB0, 0x0002}, -{0x001BBA, 0x0004}, -{0x001BE6, 0x0010}, -{0x001BF4, 0x0001}, -{0x001BFC, 0x0020}, -{0x001C00, 0x0004}, -{0x001C24, 0x0010}, -{0x001C38, 0x0001}, -{0x001C3B, 0x0020}, -{0x001C40, 0x0002}, -{0x001C4A, 0x0001}, -{0x001C4D, 0x0004}, -{0x001C50, 0x0002}, -{0x001C5A, 0x0004}, -{0x001C7E, 0x0020}, -{0x001C80, 0x0004}, -{0x001C89, 0x0001}, -{0x001C90, 0x0004}, -{0x001CBB, 0x0001}, -{0x001CBD, 0x0004}, -{0x001CC0, 0x0020}, -{0x001CC8, 0x0001}, -{0x001CD0, 0x0010}, -{0x001CD3, 0x0020}, -{0x001CD4, 0x0010}, -{0x001CE9, 0x0004}, -{0x001CED, 0x0010}, -{0x001CEE, 0x0004}, -{0x001CF4, 0x0010}, -{0x001CF5, 0x0004}, -{0x001CF7, 0x0010}, -{0x001CFA, 0x0004}, -{0x001CFB, 0x0001}, -{0x001D00, 0x0004}, -{0x001DC0, 0x0010}, -{0x001E00, 0x0004}, -{0x001F16, 0x0001}, -{0x001F18, 0x0004}, -{0x001F1E, 0x0001}, -{0x001F20, 0x0004}, -{0x001F46, 0x0001}, -{0x001F48, 0x0004}, -{0x001F4E, 0x0001}, -{0x001F50, 0x0004}, -{0x001F58, 0x0001}, -{0x001F59, 0x0004}, -{0x001F5A, 0x0001}, -{0x001F5B, 0x0004}, -{0x001F5C, 0x0001}, -{0x001F5D, 0x0004}, -{0x001F5E, 0x0001}, -{0x001F5F, 0x0004}, -{0x001F7E, 0x0001}, -{0x001F80, 0x0004}, -{0x001FB5, 0x0001}, -{0x001FB6, 0x0004}, -{0x001FBD, 0x0040}, -{0x001FBE, 0x0004}, -{0x001FBF, 0x0040}, -{0x001FC2, 0x0004}, -{0x001FC5, 0x0001}, -{0x001FC6, 0x0004}, -{0x001FCD, 0x0040}, -{0x001FD0, 0x0004}, -{0x001FD4, 0x0001}, -{0x001FD6, 0x0004}, -{0x001FDC, 0x0001}, -{0x001FDD, 0x0040}, -{0x001FE0, 0x0004}, -{0x001FED, 0x0040}, -{0x001FF0, 0x0001}, -{0x001FF2, 0x0004}, -{0x001FF5, 0x0001}, 
-{0x001FF6, 0x0004}, -{0x001FFD, 0x0040}, -{0x001FFF, 0x0001}, -{0x002000, 0x0008}, -{0x00200B, 0x0080}, -{0x002010, 0x0020}, -{0x002028, 0x0008}, -{0x00202A, 0x0080}, -{0x00202F, 0x0008}, -{0x002030, 0x0020}, -{0x002044, 0x0040}, -{0x002045, 0x0020}, -{0x002052, 0x0040}, -{0x002053, 0x0020}, -{0x00205F, 0x0008}, -{0x002060, 0x0080}, -{0x002065, 0x0001}, -{0x002066, 0x0080}, -{0x002070, 0x0002}, -{0x002071, 0x0004}, -{0x002072, 0x0001}, -{0x002074, 0x0002}, -{0x00207A, 0x0040}, -{0x00207D, 0x0020}, -{0x00207F, 0x0004}, -{0x002080, 0x0002}, -{0x00208A, 0x0040}, -{0x00208D, 0x0020}, -{0x00208F, 0x0001}, -{0x002090, 0x0004}, -{0x00209D, 0x0001}, -{0x0020A0, 0x0040}, -{0x0020C1, 0x0001}, -{0x0020D0, 0x0010}, -{0x0020F1, 0x0001}, -{0x002100, 0x0040}, -{0x002102, 0x0004}, -{0x002103, 0x0040}, -{0x002107, 0x0004}, -{0x002108, 0x0040}, -{0x00210A, 0x0004}, -{0x002114, 0x0040}, -{0x002115, 0x0004}, -{0x002116, 0x0040}, -{0x002119, 0x0004}, -{0x00211E, 0x0040}, -{0x002124, 0x0004}, -{0x002125, 0x0040}, -{0x002126, 0x0004}, -{0x002127, 0x0040}, -{0x002128, 0x0004}, -{0x002129, 0x0040}, -{0x00212A, 0x0004}, -{0x00212E, 0x0040}, -{0x00212F, 0x0004}, -{0x00213A, 0x0040}, -{0x00213C, 0x0004}, -{0x002140, 0x0040}, -{0x002145, 0x0004}, -{0x00214A, 0x0040}, -{0x00214E, 0x0004}, -{0x00214F, 0x0040}, -{0x002150, 0x0002}, -{0x002183, 0x0004}, -{0x002185, 0x0002}, -{0x00218A, 0x0040}, -{0x00218C, 0x0001}, -{0x002190, 0x0040}, -{0x002308, 0x0020}, -{0x00230C, 0x0040}, -{0x002329, 0x0020}, -{0x00232B, 0x0040}, -{0x002427, 0x0001}, -{0x002440, 0x0040}, -{0x00244B, 0x0001}, -{0x002460, 0x0002}, -{0x00249C, 0x0040}, -{0x0024EA, 0x0002}, -{0x002500, 0x0040}, -{0x002768, 0x0020}, -{0x002776, 0x0002}, -{0x002794, 0x0040}, -{0x0027C5, 0x0020}, -{0x0027C7, 0x0040}, -{0x0027E6, 0x0020}, -{0x0027F0, 0x0040}, -{0x002983, 0x0020}, -{0x002999, 0x0040}, -{0x0029D8, 0x0020}, -{0x0029DC, 0x0040}, -{0x0029FC, 0x0020}, -{0x0029FE, 0x0040}, -{0x002B74, 0x0001}, -{0x002B76, 0x0040}, -{0x002B96, 0x0001}, 
-{0x002B97, 0x0040}, -{0x002C00, 0x0004}, -{0x002CE5, 0x0040}, -{0x002CEB, 0x0004}, -{0x002CEF, 0x0010}, -{0x002CF2, 0x0004}, -{0x002CF4, 0x0001}, -{0x002CF9, 0x0020}, -{0x002CFD, 0x0002}, -{0x002CFE, 0x0020}, -{0x002D00, 0x0004}, -{0x002D26, 0x0001}, -{0x002D27, 0x0004}, -{0x002D28, 0x0001}, -{0x002D2D, 0x0004}, -{0x002D2E, 0x0001}, -{0x002D30, 0x0004}, -{0x002D68, 0x0001}, -{0x002D6F, 0x0004}, -{0x002D70, 0x0020}, -{0x002D71, 0x0001}, -{0x002D7F, 0x0010}, -{0x002D80, 0x0004}, -{0x002D97, 0x0001}, -{0x002DA0, 0x0004}, -{0x002DA7, 0x0001}, -{0x002DA8, 0x0004}, -{0x002DAF, 0x0001}, -{0x002DB0, 0x0004}, -{0x002DB7, 0x0001}, -{0x002DB8, 0x0004}, -{0x002DBF, 0x0001}, -{0x002DC0, 0x0004}, -{0x002DC7, 0x0001}, -{0x002DC8, 0x0004}, -{0x002DCF, 0x0001}, -{0x002DD0, 0x0004}, -{0x002DD7, 0x0001}, -{0x002DD8, 0x0004}, -{0x002DDF, 0x0001}, -{0x002DE0, 0x0010}, -{0x002E00, 0x0020}, -{0x002E2F, 0x0004}, -{0x002E30, 0x0020}, -{0x002E50, 0x0040}, -{0x002E52, 0x0020}, -{0x002E5E, 0x0001}, -{0x002E80, 0x0040}, -{0x002E9A, 0x0001}, -{0x002E9B, 0x0040}, -{0x002EF4, 0x0001}, -{0x002F00, 0x0040}, -{0x002FD6, 0x0001}, -{0x002FF0, 0x0040}, -{0x003000, 0x0008}, -{0x003001, 0x0020}, -{0x003004, 0x0040}, -{0x003005, 0x0004}, -{0x003007, 0x0002}, -{0x003008, 0x0020}, -{0x003012, 0x0040}, -{0x003014, 0x0020}, -{0x003020, 0x0040}, -{0x003021, 0x0002}, -{0x00302A, 0x0010}, -{0x003030, 0x0020}, -{0x003031, 0x0004}, -{0x003036, 0x0040}, -{0x003038, 0x0002}, -{0x00303B, 0x0004}, -{0x00303D, 0x0020}, -{0x00303E, 0x0040}, -{0x003040, 0x0001}, -{0x003041, 0x0004}, -{0x003097, 0x0001}, -{0x003099, 0x0010}, -{0x00309B, 0x0040}, -{0x00309D, 0x0004}, -{0x0030A0, 0x0020}, -{0x0030A1, 0x0004}, -{0x0030FB, 0x0020}, -{0x0030FC, 0x0004}, -{0x003100, 0x0001}, -{0x003105, 0x0004}, -{0x003130, 0x0001}, -{0x003131, 0x0004}, -{0x00318F, 0x0001}, -{0x003190, 0x0040}, -{0x003192, 0x0002}, -{0x003196, 0x0040}, -{0x0031A0, 0x0004}, -{0x0031C0, 0x0040}, -{0x0031E4, 0x0001}, -{0x0031EF, 0x0040}, -{0x0031F0, 0x0004}, 
-{0x003200, 0x0040}, -{0x00321F, 0x0001}, -{0x003220, 0x0002}, -{0x00322A, 0x0040}, -{0x003248, 0x0002}, -{0x003250, 0x0040}, -{0x003251, 0x0002}, -{0x003260, 0x0040}, -{0x003280, 0x0002}, -{0x00328A, 0x0040}, -{0x0032B1, 0x0002}, -{0x0032C0, 0x0040}, -{0x003400, 0x0004}, -{0x004DC0, 0x0040}, -{0x004E00, 0x0004}, -{0x00A48D, 0x0001}, -{0x00A490, 0x0040}, -{0x00A4C7, 0x0001}, -{0x00A4D0, 0x0004}, -{0x00A4FE, 0x0020}, -{0x00A500, 0x0004}, -{0x00A60D, 0x0020}, -{0x00A610, 0x0004}, -{0x00A620, 0x0002}, -{0x00A62A, 0x0004}, -{0x00A62C, 0x0001}, -{0x00A640, 0x0004}, -{0x00A66F, 0x0010}, -{0x00A673, 0x0020}, -{0x00A674, 0x0010}, -{0x00A67E, 0x0020}, -{0x00A67F, 0x0004}, -{0x00A69E, 0x0010}, -{0x00A6A0, 0x0004}, -{0x00A6E6, 0x0002}, -{0x00A6F0, 0x0010}, -{0x00A6F2, 0x0020}, -{0x00A6F8, 0x0001}, -{0x00A700, 0x0040}, -{0x00A717, 0x0004}, -{0x00A720, 0x0040}, -{0x00A722, 0x0004}, -{0x00A789, 0x0040}, -{0x00A78B, 0x0004}, -{0x00A7CB, 0x0001}, -{0x00A7D0, 0x0004}, -{0x00A7D2, 0x0001}, -{0x00A7D3, 0x0004}, -{0x00A7D4, 0x0001}, -{0x00A7D5, 0x0004}, -{0x00A7DA, 0x0001}, -{0x00A7F2, 0x0004}, -{0x00A802, 0x0010}, -{0x00A803, 0x0004}, -{0x00A806, 0x0010}, -{0x00A807, 0x0004}, -{0x00A80B, 0x0010}, -{0x00A80C, 0x0004}, -{0x00A823, 0x0010}, -{0x00A828, 0x0040}, -{0x00A82C, 0x0010}, -{0x00A82D, 0x0001}, -{0x00A830, 0x0002}, -{0x00A836, 0x0040}, -{0x00A83A, 0x0001}, -{0x00A840, 0x0004}, -{0x00A874, 0x0020}, -{0x00A878, 0x0001}, -{0x00A880, 0x0010}, -{0x00A882, 0x0004}, -{0x00A8B4, 0x0010}, -{0x00A8C6, 0x0001}, -{0x00A8CE, 0x0020}, -{0x00A8D0, 0x0002}, -{0x00A8DA, 0x0001}, -{0x00A8E0, 0x0010}, -{0x00A8F2, 0x0004}, -{0x00A8F8, 0x0020}, -{0x00A8FB, 0x0004}, -{0x00A8FC, 0x0020}, -{0x00A8FD, 0x0004}, -{0x00A8FF, 0x0010}, -{0x00A900, 0x0002}, -{0x00A90A, 0x0004}, -{0x00A926, 0x0010}, -{0x00A92E, 0x0020}, -{0x00A930, 0x0004}, -{0x00A947, 0x0010}, -{0x00A954, 0x0001}, -{0x00A95F, 0x0020}, -{0x00A960, 0x0004}, -{0x00A97D, 0x0001}, -{0x00A980, 0x0010}, -{0x00A984, 0x0004}, -{0x00A9B3, 0x0010}, 
-{0x00A9C1, 0x0020}, -{0x00A9CE, 0x0001}, -{0x00A9CF, 0x0004}, -{0x00A9D0, 0x0002}, -{0x00A9DA, 0x0001}, -{0x00A9DE, 0x0020}, -{0x00A9E0, 0x0004}, -{0x00A9E5, 0x0010}, -{0x00A9E6, 0x0004}, -{0x00A9F0, 0x0002}, -{0x00A9FA, 0x0004}, -{0x00A9FF, 0x0001}, -{0x00AA00, 0x0004}, -{0x00AA29, 0x0010}, -{0x00AA37, 0x0001}, -{0x00AA40, 0x0004}, -{0x00AA43, 0x0010}, -{0x00AA44, 0x0004}, -{0x00AA4C, 0x0010}, -{0x00AA4E, 0x0001}, -{0x00AA50, 0x0002}, -{0x00AA5A, 0x0001}, -{0x00AA5C, 0x0020}, -{0x00AA60, 0x0004}, -{0x00AA77, 0x0040}, -{0x00AA7A, 0x0004}, -{0x00AA7B, 0x0010}, -{0x00AA7E, 0x0004}, -{0x00AAB0, 0x0010}, -{0x00AAB1, 0x0004}, -{0x00AAB2, 0x0010}, -{0x00AAB5, 0x0004}, -{0x00AAB7, 0x0010}, -{0x00AAB9, 0x0004}, -{0x00AABE, 0x0010}, -{0x00AAC0, 0x0004}, -{0x00AAC1, 0x0010}, -{0x00AAC2, 0x0004}, -{0x00AAC3, 0x0001}, -{0x00AADB, 0x0004}, -{0x00AADE, 0x0020}, -{0x00AAE0, 0x0004}, -{0x00AAEB, 0x0010}, -{0x00AAF0, 0x0020}, -{0x00AAF2, 0x0004}, -{0x00AAF5, 0x0010}, -{0x00AAF7, 0x0001}, -{0x00AB01, 0x0004}, -{0x00AB07, 0x0001}, -{0x00AB09, 0x0004}, -{0x00AB0F, 0x0001}, -{0x00AB11, 0x0004}, -{0x00AB17, 0x0001}, -{0x00AB20, 0x0004}, -{0x00AB27, 0x0001}, -{0x00AB28, 0x0004}, -{0x00AB2F, 0x0001}, -{0x00AB30, 0x0004}, -{0x00AB5B, 0x0040}, -{0x00AB5C, 0x0004}, -{0x00AB6A, 0x0040}, -{0x00AB6C, 0x0001}, -{0x00AB70, 0x0004}, -{0x00ABE3, 0x0010}, -{0x00ABEB, 0x0020}, -{0x00ABEC, 0x0010}, -{0x00ABEE, 0x0001}, -{0x00ABF0, 0x0002}, -{0x00ABFA, 0x0001}, -{0x00AC00, 0x0004}, -{0x00D7A4, 0x0001}, -{0x00D7B0, 0x0004}, -{0x00D7C7, 0x0001}, -{0x00D7CB, 0x0004}, -{0x00D7FC, 0x0001}, -{0x00D800, 0x0080}, -{0x00F900, 0x0004}, -{0x00FA6E, 0x0001}, -{0x00FA70, 0x0004}, -{0x00FADA, 0x0001}, -{0x00FB00, 0x0004}, -{0x00FB07, 0x0001}, -{0x00FB13, 0x0004}, -{0x00FB18, 0x0001}, -{0x00FB1D, 0x0004}, -{0x00FB1E, 0x0010}, -{0x00FB1F, 0x0004}, -{0x00FB29, 0x0040}, -{0x00FB2A, 0x0004}, -{0x00FB37, 0x0001}, -{0x00FB38, 0x0004}, -{0x00FB3D, 0x0001}, -{0x00FB3E, 0x0004}, -{0x00FB3F, 0x0001}, -{0x00FB40, 0x0004}, 
-{0x00FB42, 0x0001}, -{0x00FB43, 0x0004}, -{0x00FB45, 0x0001}, -{0x00FB46, 0x0004}, -{0x00FBB2, 0x0040}, -{0x00FBC3, 0x0001}, -{0x00FBD3, 0x0004}, -{0x00FD3E, 0x0020}, -{0x00FD40, 0x0040}, -{0x00FD50, 0x0004}, -{0x00FD90, 0x0001}, -{0x00FD92, 0x0004}, -{0x00FDC8, 0x0001}, -{0x00FDCF, 0x0040}, -{0x00FDD0, 0x0001}, -{0x00FDF0, 0x0004}, -{0x00FDFC, 0x0040}, -{0x00FE00, 0x0010}, -{0x00FE10, 0x0020}, -{0x00FE1A, 0x0001}, -{0x00FE20, 0x0010}, -{0x00FE30, 0x0020}, -{0x00FE53, 0x0001}, -{0x00FE54, 0x0020}, -{0x00FE62, 0x0040}, -{0x00FE63, 0x0020}, -{0x00FE64, 0x0040}, -{0x00FE67, 0x0001}, -{0x00FE68, 0x0020}, -{0x00FE69, 0x0040}, -{0x00FE6A, 0x0020}, -{0x00FE6C, 0x0001}, -{0x00FE70, 0x0004}, -{0x00FE75, 0x0001}, -{0x00FE76, 0x0004}, -{0x00FEFD, 0x0001}, -{0x00FEFF, 0x0080}, -{0x00FF00, 0x0001}, -{0x00FF01, 0x0020}, -{0x00FF04, 0x0040}, -{0x00FF05, 0x0020}, -{0x00FF0B, 0x0040}, -{0x00FF0C, 0x0020}, -{0x00FF10, 0x0002}, -{0x00FF1A, 0x0020}, -{0x00FF1C, 0x0040}, -{0x00FF1F, 0x0020}, -{0x00FF21, 0x0004}, -{0x00FF3B, 0x0020}, -{0x00FF3E, 0x0040}, -{0x00FF3F, 0x0020}, -{0x00FF40, 0x0040}, -{0x00FF41, 0x0004}, -{0x00FF5B, 0x0020}, -{0x00FF5C, 0x0040}, -{0x00FF5D, 0x0020}, -{0x00FF5E, 0x0040}, -{0x00FF5F, 0x0020}, -{0x00FF66, 0x0004}, -{0x00FFBF, 0x0001}, -{0x00FFC2, 0x0004}, -{0x00FFC8, 0x0001}, -{0x00FFCA, 0x0004}, -{0x00FFD0, 0x0001}, -{0x00FFD2, 0x0004}, -{0x00FFD8, 0x0001}, -{0x00FFDA, 0x0004}, -{0x00FFDD, 0x0001}, -{0x00FFE0, 0x0040}, -{0x00FFE7, 0x0001}, -{0x00FFE8, 0x0040}, -{0x00FFEF, 0x0001}, -{0x00FFF9, 0x0080}, -{0x00FFFC, 0x0040}, -{0x00FFFE, 0x0001}, -{0x010000, 0x0004}, -{0x01000C, 0x0001}, -{0x01000D, 0x0004}, -{0x010027, 0x0001}, -{0x010028, 0x0004}, -{0x01003B, 0x0001}, -{0x01003C, 0x0004}, -{0x01003E, 0x0001}, -{0x01003F, 0x0004}, -{0x01004E, 0x0001}, -{0x010050, 0x0004}, -{0x01005E, 0x0001}, -{0x010080, 0x0004}, -{0x0100FB, 0x0001}, -{0x010100, 0x0020}, -{0x010103, 0x0001}, -{0x010107, 0x0002}, -{0x010134, 0x0001}, -{0x010137, 0x0040}, -{0x010140, 0x0002}, 
-{0x010179, 0x0040}, -{0x01018A, 0x0002}, -{0x01018C, 0x0040}, -{0x01018F, 0x0001}, -{0x010190, 0x0040}, -{0x01019D, 0x0001}, -{0x0101A0, 0x0040}, -{0x0101A1, 0x0001}, -{0x0101D0, 0x0040}, -{0x0101FD, 0x0010}, -{0x0101FE, 0x0001}, -{0x010280, 0x0004}, -{0x01029D, 0x0001}, -{0x0102A0, 0x0004}, -{0x0102D1, 0x0001}, -{0x0102E0, 0x0010}, -{0x0102E1, 0x0002}, -{0x0102FC, 0x0001}, -{0x010300, 0x0004}, -{0x010320, 0x0002}, -{0x010324, 0x0001}, -{0x01032D, 0x0004}, -{0x010341, 0x0002}, -{0x010342, 0x0004}, -{0x01034A, 0x0002}, -{0x01034B, 0x0001}, -{0x010350, 0x0004}, -{0x010376, 0x0010}, -{0x01037B, 0x0001}, -{0x010380, 0x0004}, -{0x01039E, 0x0001}, -{0x01039F, 0x0020}, -{0x0103A0, 0x0004}, -{0x0103C4, 0x0001}, -{0x0103C8, 0x0004}, -{0x0103D0, 0x0020}, -{0x0103D1, 0x0002}, -{0x0103D6, 0x0001}, -{0x010400, 0x0004}, -{0x01049E, 0x0001}, -{0x0104A0, 0x0002}, -{0x0104AA, 0x0001}, -{0x0104B0, 0x0004}, -{0x0104D4, 0x0001}, -{0x0104D8, 0x0004}, -{0x0104FC, 0x0001}, -{0x010500, 0x0004}, -{0x010528, 0x0001}, -{0x010530, 0x0004}, -{0x010564, 0x0001}, -{0x01056F, 0x0020}, -{0x010570, 0x0004}, -{0x01057B, 0x0001}, -{0x01057C, 0x0004}, -{0x01058B, 0x0001}, -{0x01058C, 0x0004}, -{0x010593, 0x0001}, -{0x010594, 0x0004}, -{0x010596, 0x0001}, -{0x010597, 0x0004}, -{0x0105A2, 0x0001}, -{0x0105A3, 0x0004}, -{0x0105B2, 0x0001}, -{0x0105B3, 0x0004}, -{0x0105BA, 0x0001}, -{0x0105BB, 0x0004}, -{0x0105BD, 0x0001}, -{0x010600, 0x0004}, -{0x010737, 0x0001}, -{0x010740, 0x0004}, -{0x010756, 0x0001}, -{0x010760, 0x0004}, -{0x010768, 0x0001}, -{0x010780, 0x0004}, -{0x010786, 0x0001}, -{0x010787, 0x0004}, -{0x0107B1, 0x0001}, -{0x0107B2, 0x0004}, -{0x0107BB, 0x0001}, -{0x010800, 0x0004}, -{0x010806, 0x0001}, -{0x010808, 0x0004}, -{0x010809, 0x0001}, -{0x01080A, 0x0004}, -{0x010836, 0x0001}, -{0x010837, 0x0004}, -{0x010839, 0x0001}, -{0x01083C, 0x0004}, -{0x01083D, 0x0001}, -{0x01083F, 0x0004}, -{0x010856, 0x0001}, -{0x010857, 0x0020}, -{0x010858, 0x0002}, -{0x010860, 0x0004}, -{0x010877, 0x0040}, 
-{0x010879, 0x0002}, -{0x010880, 0x0004}, -{0x01089F, 0x0001}, -{0x0108A7, 0x0002}, -{0x0108B0, 0x0001}, -{0x0108E0, 0x0004}, -{0x0108F3, 0x0001}, -{0x0108F4, 0x0004}, -{0x0108F6, 0x0001}, -{0x0108FB, 0x0002}, -{0x010900, 0x0004}, -{0x010916, 0x0002}, -{0x01091C, 0x0001}, -{0x01091F, 0x0020}, -{0x010920, 0x0004}, -{0x01093A, 0x0001}, -{0x01093F, 0x0020}, -{0x010940, 0x0001}, -{0x010980, 0x0004}, -{0x0109B8, 0x0001}, -{0x0109BC, 0x0002}, -{0x0109BE, 0x0004}, -{0x0109C0, 0x0002}, -{0x0109D0, 0x0001}, -{0x0109D2, 0x0002}, -{0x010A00, 0x0004}, -{0x010A01, 0x0010}, -{0x010A04, 0x0001}, -{0x010A05, 0x0010}, -{0x010A07, 0x0001}, -{0x010A0C, 0x0010}, -{0x010A10, 0x0004}, -{0x010A14, 0x0001}, -{0x010A15, 0x0004}, -{0x010A18, 0x0001}, -{0x010A19, 0x0004}, -{0x010A36, 0x0001}, -{0x010A38, 0x0010}, -{0x010A3B, 0x0001}, -{0x010A3F, 0x0010}, -{0x010A40, 0x0002}, -{0x010A49, 0x0001}, -{0x010A50, 0x0020}, -{0x010A59, 0x0001}, -{0x010A60, 0x0004}, -{0x010A7D, 0x0002}, -{0x010A7F, 0x0020}, -{0x010A80, 0x0004}, -{0x010A9D, 0x0002}, -{0x010AA0, 0x0001}, -{0x010AC0, 0x0004}, -{0x010AC8, 0x0040}, -{0x010AC9, 0x0004}, -{0x010AE5, 0x0010}, -{0x010AE7, 0x0001}, -{0x010AEB, 0x0002}, -{0x010AF0, 0x0020}, -{0x010AF7, 0x0001}, -{0x010B00, 0x0004}, -{0x010B36, 0x0001}, -{0x010B39, 0x0020}, -{0x010B40, 0x0004}, -{0x010B56, 0x0001}, -{0x010B58, 0x0002}, -{0x010B60, 0x0004}, -{0x010B73, 0x0001}, -{0x010B78, 0x0002}, -{0x010B80, 0x0004}, -{0x010B92, 0x0001}, -{0x010B99, 0x0020}, -{0x010B9D, 0x0001}, -{0x010BA9, 0x0002}, -{0x010BB0, 0x0001}, -{0x010C00, 0x0004}, -{0x010C49, 0x0001}, -{0x010C80, 0x0004}, -{0x010CB3, 0x0001}, -{0x010CC0, 0x0004}, -{0x010CF3, 0x0001}, -{0x010CFA, 0x0002}, -{0x010D00, 0x0004}, -{0x010D24, 0x0010}, -{0x010D28, 0x0001}, -{0x010D30, 0x0002}, -{0x010D3A, 0x0001}, -{0x010E60, 0x0002}, -{0x010E7F, 0x0001}, -{0x010E80, 0x0004}, -{0x010EAA, 0x0001}, -{0x010EAB, 0x0010}, -{0x010EAD, 0x0020}, -{0x010EAE, 0x0001}, -{0x010EB0, 0x0004}, -{0x010EB2, 0x0001}, -{0x010EFD, 0x0010}, 
-{0x010F00, 0x0004}, -{0x010F1D, 0x0002}, -{0x010F27, 0x0004}, -{0x010F28, 0x0001}, -{0x010F30, 0x0004}, -{0x010F46, 0x0010}, -{0x010F51, 0x0002}, -{0x010F55, 0x0020}, -{0x010F5A, 0x0001}, -{0x010F70, 0x0004}, -{0x010F82, 0x0010}, -{0x010F86, 0x0020}, -{0x010F8A, 0x0001}, -{0x010FB0, 0x0004}, -{0x010FC5, 0x0002}, -{0x010FCC, 0x0001}, -{0x010FE0, 0x0004}, -{0x010FF7, 0x0001}, -{0x011000, 0x0010}, -{0x011003, 0x0004}, -{0x011038, 0x0010}, -{0x011047, 0x0020}, -{0x01104E, 0x0001}, -{0x011052, 0x0002}, -{0x011070, 0x0010}, -{0x011071, 0x0004}, -{0x011073, 0x0010}, -{0x011075, 0x0004}, -{0x011076, 0x0001}, -{0x01107F, 0x0010}, -{0x011083, 0x0004}, -{0x0110B0, 0x0010}, -{0x0110BB, 0x0020}, -{0x0110BD, 0x0080}, -{0x0110BE, 0x0020}, -{0x0110C2, 0x0010}, -{0x0110C3, 0x0001}, -{0x0110CD, 0x0080}, -{0x0110CE, 0x0001}, -{0x0110D0, 0x0004}, -{0x0110E9, 0x0001}, -{0x0110F0, 0x0002}, -{0x0110FA, 0x0001}, -{0x011100, 0x0010}, -{0x011103, 0x0004}, -{0x011127, 0x0010}, -{0x011135, 0x0001}, -{0x011136, 0x0002}, -{0x011140, 0x0020}, -{0x011144, 0x0004}, -{0x011145, 0x0010}, -{0x011147, 0x0004}, -{0x011148, 0x0001}, -{0x011150, 0x0004}, -{0x011173, 0x0010}, -{0x011174, 0x0020}, -{0x011176, 0x0004}, -{0x011177, 0x0001}, -{0x011180, 0x0010}, -{0x011183, 0x0004}, -{0x0111B3, 0x0010}, -{0x0111C1, 0x0004}, -{0x0111C5, 0x0020}, -{0x0111C9, 0x0010}, -{0x0111CD, 0x0020}, -{0x0111CE, 0x0010}, -{0x0111D0, 0x0002}, -{0x0111DA, 0x0004}, -{0x0111DB, 0x0020}, -{0x0111DC, 0x0004}, -{0x0111DD, 0x0020}, -{0x0111E0, 0x0001}, -{0x0111E1, 0x0002}, -{0x0111F5, 0x0001}, -{0x011200, 0x0004}, -{0x011212, 0x0001}, -{0x011213, 0x0004}, -{0x01122C, 0x0010}, -{0x011238, 0x0020}, -{0x01123E, 0x0010}, -{0x01123F, 0x0004}, -{0x011241, 0x0010}, -{0x011242, 0x0001}, -{0x011280, 0x0004}, -{0x011287, 0x0001}, -{0x011288, 0x0004}, -{0x011289, 0x0001}, -{0x01128A, 0x0004}, -{0x01128E, 0x0001}, -{0x01128F, 0x0004}, -{0x01129E, 0x0001}, -{0x01129F, 0x0004}, -{0x0112A9, 0x0020}, -{0x0112AA, 0x0001}, -{0x0112B0, 0x0004}, 
-{0x0112DF, 0x0010}, -{0x0112EB, 0x0001}, -{0x0112F0, 0x0002}, -{0x0112FA, 0x0001}, -{0x011300, 0x0010}, -{0x011304, 0x0001}, -{0x011305, 0x0004}, -{0x01130D, 0x0001}, -{0x01130F, 0x0004}, -{0x011311, 0x0001}, -{0x011313, 0x0004}, -{0x011329, 0x0001}, -{0x01132A, 0x0004}, -{0x011331, 0x0001}, -{0x011332, 0x0004}, -{0x011334, 0x0001}, -{0x011335, 0x0004}, -{0x01133A, 0x0001}, -{0x01133B, 0x0010}, -{0x01133D, 0x0004}, -{0x01133E, 0x0010}, -{0x011345, 0x0001}, -{0x011347, 0x0010}, -{0x011349, 0x0001}, -{0x01134B, 0x0010}, -{0x01134E, 0x0001}, -{0x011350, 0x0004}, -{0x011351, 0x0001}, -{0x011357, 0x0010}, -{0x011358, 0x0001}, -{0x01135D, 0x0004}, -{0x011362, 0x0010}, -{0x011364, 0x0001}, -{0x011366, 0x0010}, -{0x01136D, 0x0001}, -{0x011370, 0x0010}, -{0x011375, 0x0001}, -{0x011400, 0x0004}, -{0x011435, 0x0010}, -{0x011447, 0x0004}, -{0x01144B, 0x0020}, -{0x011450, 0x0002}, -{0x01145A, 0x0020}, -{0x01145C, 0x0001}, -{0x01145D, 0x0020}, -{0x01145E, 0x0010}, -{0x01145F, 0x0004}, -{0x011462, 0x0001}, -{0x011480, 0x0004}, -{0x0114B0, 0x0010}, -{0x0114C4, 0x0004}, -{0x0114C6, 0x0020}, -{0x0114C7, 0x0004}, -{0x0114C8, 0x0001}, -{0x0114D0, 0x0002}, -{0x0114DA, 0x0001}, -{0x011580, 0x0004}, -{0x0115AF, 0x0010}, -{0x0115B6, 0x0001}, -{0x0115B8, 0x0010}, -{0x0115C1, 0x0020}, -{0x0115D8, 0x0004}, -{0x0115DC, 0x0010}, -{0x0115DE, 0x0001}, -{0x011600, 0x0004}, -{0x011630, 0x0010}, -{0x011641, 0x0020}, -{0x011644, 0x0004}, -{0x011645, 0x0001}, -{0x011650, 0x0002}, -{0x01165A, 0x0001}, -{0x011660, 0x0020}, -{0x01166D, 0x0001}, -{0x011680, 0x0004}, -{0x0116AB, 0x0010}, -{0x0116B8, 0x0004}, -{0x0116B9, 0x0020}, -{0x0116BA, 0x0001}, -{0x0116C0, 0x0002}, -{0x0116CA, 0x0001}, -{0x011700, 0x0004}, -{0x01171B, 0x0001}, -{0x01171D, 0x0010}, -{0x01172C, 0x0001}, -{0x011730, 0x0002}, -{0x01173C, 0x0020}, -{0x01173F, 0x0040}, -{0x011740, 0x0004}, -{0x011747, 0x0001}, -{0x011800, 0x0004}, -{0x01182C, 0x0010}, -{0x01183B, 0x0020}, -{0x01183C, 0x0001}, -{0x0118A0, 0x0004}, -{0x0118E0, 0x0002}, 
-{0x0118F3, 0x0001}, -{0x0118FF, 0x0004}, -{0x011907, 0x0001}, -{0x011909, 0x0004}, -{0x01190A, 0x0001}, -{0x01190C, 0x0004}, -{0x011914, 0x0001}, -{0x011915, 0x0004}, -{0x011917, 0x0001}, -{0x011918, 0x0004}, -{0x011930, 0x0010}, -{0x011936, 0x0001}, -{0x011937, 0x0010}, -{0x011939, 0x0001}, -{0x01193B, 0x0010}, -{0x01193F, 0x0004}, -{0x011940, 0x0010}, -{0x011941, 0x0004}, -{0x011942, 0x0010}, -{0x011944, 0x0020}, -{0x011947, 0x0001}, -{0x011950, 0x0002}, -{0x01195A, 0x0001}, -{0x0119A0, 0x0004}, -{0x0119A8, 0x0001}, -{0x0119AA, 0x0004}, -{0x0119D1, 0x0010}, -{0x0119D8, 0x0001}, -{0x0119DA, 0x0010}, -{0x0119E1, 0x0004}, -{0x0119E2, 0x0020}, -{0x0119E3, 0x0004}, -{0x0119E4, 0x0010}, -{0x0119E5, 0x0001}, -{0x011A00, 0x0004}, -{0x011A01, 0x0010}, -{0x011A0B, 0x0004}, -{0x011A33, 0x0010}, -{0x011A3A, 0x0004}, -{0x011A3B, 0x0010}, -{0x011A3F, 0x0020}, -{0x011A47, 0x0010}, -{0x011A48, 0x0001}, -{0x011A50, 0x0004}, -{0x011A51, 0x0010}, -{0x011A5C, 0x0004}, -{0x011A8A, 0x0010}, -{0x011A9A, 0x0020}, -{0x011A9D, 0x0004}, -{0x011A9E, 0x0020}, -{0x011AA3, 0x0001}, -{0x011AB0, 0x0004}, -{0x011AF9, 0x0001}, -{0x011B00, 0x0020}, -{0x011B0A, 0x0001}, -{0x011C00, 0x0004}, -{0x011C09, 0x0001}, -{0x011C0A, 0x0004}, -{0x011C2F, 0x0010}, -{0x011C37, 0x0001}, -{0x011C38, 0x0010}, -{0x011C40, 0x0004}, -{0x011C41, 0x0020}, -{0x011C46, 0x0001}, -{0x011C50, 0x0002}, -{0x011C6D, 0x0001}, -{0x011C70, 0x0020}, -{0x011C72, 0x0004}, -{0x011C90, 0x0001}, -{0x011C92, 0x0010}, -{0x011CA8, 0x0001}, -{0x011CA9, 0x0010}, -{0x011CB7, 0x0001}, -{0x011D00, 0x0004}, -{0x011D07, 0x0001}, -{0x011D08, 0x0004}, -{0x011D0A, 0x0001}, -{0x011D0B, 0x0004}, -{0x011D31, 0x0010}, -{0x011D37, 0x0001}, -{0x011D3A, 0x0010}, -{0x011D3B, 0x0001}, -{0x011D3C, 0x0010}, -{0x011D3E, 0x0001}, -{0x011D3F, 0x0010}, -{0x011D46, 0x0004}, -{0x011D47, 0x0010}, -{0x011D48, 0x0001}, -{0x011D50, 0x0002}, -{0x011D5A, 0x0001}, -{0x011D60, 0x0004}, -{0x011D66, 0x0001}, -{0x011D67, 0x0004}, -{0x011D69, 0x0001}, -{0x011D6A, 0x0004}, 
-{0x011D8A, 0x0010}, -{0x011D8F, 0x0001}, -{0x011D90, 0x0010}, -{0x011D92, 0x0001}, -{0x011D93, 0x0010}, -{0x011D98, 0x0004}, -{0x011D99, 0x0001}, -{0x011DA0, 0x0002}, -{0x011DAA, 0x0001}, -{0x011EE0, 0x0004}, -{0x011EF3, 0x0010}, -{0x011EF7, 0x0020}, -{0x011EF9, 0x0001}, -{0x011F00, 0x0010}, -{0x011F02, 0x0004}, -{0x011F03, 0x0010}, -{0x011F04, 0x0004}, -{0x011F11, 0x0001}, -{0x011F12, 0x0004}, -{0x011F34, 0x0010}, -{0x011F3B, 0x0001}, -{0x011F3E, 0x0010}, -{0x011F43, 0x0020}, -{0x011F50, 0x0002}, -{0x011F5A, 0x0001}, -{0x011FB0, 0x0004}, -{0x011FB1, 0x0001}, -{0x011FC0, 0x0002}, -{0x011FD5, 0x0040}, -{0x011FF2, 0x0001}, -{0x011FFF, 0x0020}, -{0x012000, 0x0004}, -{0x01239A, 0x0001}, -{0x012400, 0x0002}, -{0x01246F, 0x0001}, -{0x012470, 0x0020}, -{0x012475, 0x0001}, -{0x012480, 0x0004}, -{0x012544, 0x0001}, -{0x012F90, 0x0004}, -{0x012FF1, 0x0020}, -{0x012FF3, 0x0001}, -{0x013000, 0x0004}, -{0x013430, 0x0080}, -{0x013440, 0x0010}, -{0x013441, 0x0004}, -{0x013447, 0x0010}, -{0x013456, 0x0001}, -{0x014400, 0x0004}, -{0x014647, 0x0001}, -{0x016800, 0x0004}, -{0x016A39, 0x0001}, -{0x016A40, 0x0004}, -{0x016A5F, 0x0001}, -{0x016A60, 0x0002}, -{0x016A6A, 0x0001}, -{0x016A6E, 0x0020}, -{0x016A70, 0x0004}, -{0x016ABF, 0x0001}, -{0x016AC0, 0x0002}, -{0x016ACA, 0x0001}, -{0x016AD0, 0x0004}, -{0x016AEE, 0x0001}, -{0x016AF0, 0x0010}, -{0x016AF5, 0x0020}, -{0x016AF6, 0x0001}, -{0x016B00, 0x0004}, -{0x016B30, 0x0010}, -{0x016B37, 0x0020}, -{0x016B3C, 0x0040}, -{0x016B40, 0x0004}, -{0x016B44, 0x0020}, -{0x016B45, 0x0040}, -{0x016B46, 0x0001}, -{0x016B50, 0x0002}, -{0x016B5A, 0x0001}, -{0x016B5B, 0x0002}, -{0x016B62, 0x0001}, -{0x016B63, 0x0004}, -{0x016B78, 0x0001}, -{0x016B7D, 0x0004}, -{0x016B90, 0x0001}, -{0x016E40, 0x0004}, -{0x016E80, 0x0002}, -{0x016E97, 0x0020}, -{0x016E9B, 0x0001}, -{0x016F00, 0x0004}, -{0x016F4B, 0x0001}, -{0x016F4F, 0x0010}, -{0x016F50, 0x0004}, -{0x016F51, 0x0010}, -{0x016F88, 0x0001}, -{0x016F8F, 0x0010}, -{0x016F93, 0x0004}, -{0x016FA0, 0x0001}, 
-{0x016FE0, 0x0004}, -{0x016FE2, 0x0020}, -{0x016FE3, 0x0004}, -{0x016FE4, 0x0010}, -{0x016FE5, 0x0001}, -{0x016FF0, 0x0010}, -{0x016FF2, 0x0001}, -{0x017000, 0x0004}, -{0x0187F8, 0x0001}, -{0x018800, 0x0004}, -{0x018CD6, 0x0001}, -{0x018D00, 0x0004}, -{0x018D09, 0x0001}, -{0x01AFF0, 0x0004}, -{0x01AFF4, 0x0001}, -{0x01AFF5, 0x0004}, -{0x01AFFC, 0x0001}, -{0x01AFFD, 0x0004}, -{0x01AFFF, 0x0001}, -{0x01B000, 0x0004}, -{0x01B123, 0x0001}, -{0x01B132, 0x0004}, -{0x01B133, 0x0001}, -{0x01B150, 0x0004}, -{0x01B153, 0x0001}, -{0x01B155, 0x0004}, -{0x01B156, 0x0001}, -{0x01B164, 0x0004}, -{0x01B168, 0x0001}, -{0x01B170, 0x0004}, -{0x01B2FC, 0x0001}, -{0x01BC00, 0x0004}, -{0x01BC6B, 0x0001}, -{0x01BC70, 0x0004}, -{0x01BC7D, 0x0001}, -{0x01BC80, 0x0004}, -{0x01BC89, 0x0001}, -{0x01BC90, 0x0004}, -{0x01BC9A, 0x0001}, -{0x01BC9C, 0x0040}, -{0x01BC9D, 0x0010}, -{0x01BC9F, 0x0020}, -{0x01BCA0, 0x0080}, -{0x01BCA4, 0x0001}, -{0x01CF00, 0x0010}, -{0x01CF2E, 0x0001}, -{0x01CF30, 0x0010}, -{0x01CF47, 0x0001}, -{0x01CF50, 0x0040}, -{0x01CFC4, 0x0001}, -{0x01D000, 0x0040}, -{0x01D0F6, 0x0001}, -{0x01D100, 0x0040}, -{0x01D127, 0x0001}, -{0x01D129, 0x0040}, -{0x01D165, 0x0010}, -{0x01D16A, 0x0040}, -{0x01D16D, 0x0010}, -{0x01D173, 0x0080}, -{0x01D17B, 0x0010}, -{0x01D183, 0x0040}, -{0x01D185, 0x0010}, -{0x01D18C, 0x0040}, -{0x01D1AA, 0x0010}, -{0x01D1AE, 0x0040}, -{0x01D1EB, 0x0001}, -{0x01D200, 0x0040}, -{0x01D242, 0x0010}, -{0x01D245, 0x0040}, -{0x01D246, 0x0001}, -{0x01D2C0, 0x0002}, -{0x01D2D4, 0x0001}, -{0x01D2E0, 0x0002}, -{0x01D2F4, 0x0001}, -{0x01D300, 0x0040}, -{0x01D357, 0x0001}, -{0x01D360, 0x0002}, -{0x01D379, 0x0001}, -{0x01D400, 0x0004}, -{0x01D455, 0x0001}, -{0x01D456, 0x0004}, -{0x01D49D, 0x0001}, -{0x01D49E, 0x0004}, -{0x01D4A0, 0x0001}, -{0x01D4A2, 0x0004}, -{0x01D4A3, 0x0001}, -{0x01D4A5, 0x0004}, -{0x01D4A7, 0x0001}, -{0x01D4A9, 0x0004}, -{0x01D4AD, 0x0001}, -{0x01D4AE, 0x0004}, -{0x01D4BA, 0x0001}, -{0x01D4BB, 0x0004}, -{0x01D4BC, 0x0001}, -{0x01D4BD, 0x0004}, 
-{0x01D4C4, 0x0001}, -{0x01D4C5, 0x0004}, -{0x01D506, 0x0001}, -{0x01D507, 0x0004}, -{0x01D50B, 0x0001}, -{0x01D50D, 0x0004}, -{0x01D515, 0x0001}, -{0x01D516, 0x0004}, -{0x01D51D, 0x0001}, -{0x01D51E, 0x0004}, -{0x01D53A, 0x0001}, -{0x01D53B, 0x0004}, -{0x01D53F, 0x0001}, -{0x01D540, 0x0004}, -{0x01D545, 0x0001}, -{0x01D546, 0x0004}, -{0x01D547, 0x0001}, -{0x01D54A, 0x0004}, -{0x01D551, 0x0001}, -{0x01D552, 0x0004}, -{0x01D6A6, 0x0001}, -{0x01D6A8, 0x0004}, -{0x01D6C1, 0x0040}, -{0x01D6C2, 0x0004}, -{0x01D6DB, 0x0040}, -{0x01D6DC, 0x0004}, -{0x01D6FB, 0x0040}, -{0x01D6FC, 0x0004}, -{0x01D715, 0x0040}, -{0x01D716, 0x0004}, -{0x01D735, 0x0040}, -{0x01D736, 0x0004}, -{0x01D74F, 0x0040}, -{0x01D750, 0x0004}, -{0x01D76F, 0x0040}, -{0x01D770, 0x0004}, -{0x01D789, 0x0040}, -{0x01D78A, 0x0004}, -{0x01D7A9, 0x0040}, -{0x01D7AA, 0x0004}, -{0x01D7C3, 0x0040}, -{0x01D7C4, 0x0004}, -{0x01D7CC, 0x0001}, -{0x01D7CE, 0x0002}, -{0x01D800, 0x0040}, -{0x01DA00, 0x0010}, -{0x01DA37, 0x0040}, -{0x01DA3B, 0x0010}, -{0x01DA6D, 0x0040}, -{0x01DA75, 0x0010}, -{0x01DA76, 0x0040}, -{0x01DA84, 0x0010}, -{0x01DA85, 0x0040}, -{0x01DA87, 0x0020}, -{0x01DA8C, 0x0001}, -{0x01DA9B, 0x0010}, -{0x01DAA0, 0x0001}, -{0x01DAA1, 0x0010}, -{0x01DAB0, 0x0001}, -{0x01DF00, 0x0004}, -{0x01DF1F, 0x0001}, -{0x01DF25, 0x0004}, -{0x01DF2B, 0x0001}, -{0x01E000, 0x0010}, -{0x01E007, 0x0001}, -{0x01E008, 0x0010}, -{0x01E019, 0x0001}, -{0x01E01B, 0x0010}, -{0x01E022, 0x0001}, -{0x01E023, 0x0010}, -{0x01E025, 0x0001}, -{0x01E026, 0x0010}, -{0x01E02B, 0x0001}, -{0x01E030, 0x0004}, -{0x01E06E, 0x0001}, -{0x01E08F, 0x0010}, -{0x01E090, 0x0001}, -{0x01E100, 0x0004}, -{0x01E12D, 0x0001}, -{0x01E130, 0x0010}, -{0x01E137, 0x0004}, -{0x01E13E, 0x0001}, -{0x01E140, 0x0002}, -{0x01E14A, 0x0001}, -{0x01E14E, 0x0004}, -{0x01E14F, 0x0040}, -{0x01E150, 0x0001}, -{0x01E290, 0x0004}, -{0x01E2AE, 0x0010}, -{0x01E2AF, 0x0001}, -{0x01E2C0, 0x0004}, -{0x01E2EC, 0x0010}, -{0x01E2F0, 0x0002}, -{0x01E2FA, 0x0001}, -{0x01E2FF, 0x0040}, 
-{0x01E300, 0x0001}, -{0x01E4D0, 0x0004}, -{0x01E4EC, 0x0010}, -{0x01E4F0, 0x0002}, -{0x01E4FA, 0x0001}, -{0x01E7E0, 0x0004}, -{0x01E7E7, 0x0001}, -{0x01E7E8, 0x0004}, -{0x01E7EC, 0x0001}, -{0x01E7ED, 0x0004}, -{0x01E7EF, 0x0001}, -{0x01E7F0, 0x0004}, -{0x01E7FF, 0x0001}, -{0x01E800, 0x0004}, -{0x01E8C5, 0x0001}, -{0x01E8C7, 0x0002}, -{0x01E8D0, 0x0010}, -{0x01E8D7, 0x0001}, -{0x01E900, 0x0004}, -{0x01E944, 0x0010}, -{0x01E94B, 0x0004}, -{0x01E94C, 0x0001}, -{0x01E950, 0x0002}, -{0x01E95A, 0x0001}, -{0x01E95E, 0x0020}, -{0x01E960, 0x0001}, -{0x01EC71, 0x0002}, -{0x01ECAC, 0x0040}, -{0x01ECAD, 0x0002}, -{0x01ECB0, 0x0040}, -{0x01ECB1, 0x0002}, -{0x01ECB5, 0x0001}, -{0x01ED01, 0x0002}, -{0x01ED2E, 0x0040}, -{0x01ED2F, 0x0002}, -{0x01ED3E, 0x0001}, -{0x01EE00, 0x0004}, -{0x01EE04, 0x0001}, -{0x01EE05, 0x0004}, -{0x01EE20, 0x0001}, -{0x01EE21, 0x0004}, -{0x01EE23, 0x0001}, -{0x01EE24, 0x0004}, -{0x01EE25, 0x0001}, -{0x01EE27, 0x0004}, -{0x01EE28, 0x0001}, -{0x01EE29, 0x0004}, -{0x01EE33, 0x0001}, -{0x01EE34, 0x0004}, -{0x01EE38, 0x0001}, -{0x01EE39, 0x0004}, -{0x01EE3A, 0x0001}, -{0x01EE3B, 0x0004}, -{0x01EE3C, 0x0001}, -{0x01EE42, 0x0004}, -{0x01EE43, 0x0001}, -{0x01EE47, 0x0004}, -{0x01EE48, 0x0001}, -{0x01EE49, 0x0004}, -{0x01EE4A, 0x0001}, -{0x01EE4B, 0x0004}, -{0x01EE4C, 0x0001}, -{0x01EE4D, 0x0004}, -{0x01EE50, 0x0001}, -{0x01EE51, 0x0004}, -{0x01EE53, 0x0001}, -{0x01EE54, 0x0004}, -{0x01EE55, 0x0001}, -{0x01EE57, 0x0004}, -{0x01EE58, 0x0001}, -{0x01EE59, 0x0004}, -{0x01EE5A, 0x0001}, -{0x01EE5B, 0x0004}, -{0x01EE5C, 0x0001}, -{0x01EE5D, 0x0004}, -{0x01EE5E, 0x0001}, -{0x01EE5F, 0x0004}, -{0x01EE60, 0x0001}, -{0x01EE61, 0x0004}, -{0x01EE63, 0x0001}, -{0x01EE64, 0x0004}, -{0x01EE65, 0x0001}, -{0x01EE67, 0x0004}, -{0x01EE6B, 0x0001}, -{0x01EE6C, 0x0004}, -{0x01EE73, 0x0001}, -{0x01EE74, 0x0004}, -{0x01EE78, 0x0001}, -{0x01EE79, 0x0004}, -{0x01EE7D, 0x0001}, -{0x01EE7E, 0x0004}, -{0x01EE7F, 0x0001}, -{0x01EE80, 0x0004}, -{0x01EE8A, 0x0001}, -{0x01EE8B, 0x0004}, 
-{0x01EE9C, 0x0001}, -{0x01EEA1, 0x0004}, -{0x01EEA4, 0x0001}, -{0x01EEA5, 0x0004}, -{0x01EEAA, 0x0001}, -{0x01EEAB, 0x0004}, -{0x01EEBC, 0x0001}, -{0x01EEF0, 0x0040}, -{0x01EEF2, 0x0001}, -{0x01F000, 0x0040}, -{0x01F02C, 0x0001}, -{0x01F030, 0x0040}, -{0x01F094, 0x0001}, -{0x01F0A0, 0x0040}, -{0x01F0AF, 0x0001}, -{0x01F0B1, 0x0040}, -{0x01F0C0, 0x0001}, -{0x01F0C1, 0x0040}, -{0x01F0D0, 0x0001}, -{0x01F0D1, 0x0040}, -{0x01F0F6, 0x0001}, -{0x01F100, 0x0002}, -{0x01F10D, 0x0040}, -{0x01F1AE, 0x0001}, -{0x01F1E6, 0x0040}, -{0x01F203, 0x0001}, -{0x01F210, 0x0040}, -{0x01F23C, 0x0001}, -{0x01F240, 0x0040}, -{0x01F249, 0x0001}, -{0x01F250, 0x0040}, -{0x01F252, 0x0001}, -{0x01F260, 0x0040}, -{0x01F266, 0x0001}, -{0x01F300, 0x0040}, -{0x01F6D8, 0x0001}, -{0x01F6DC, 0x0040}, -{0x01F6ED, 0x0001}, -{0x01F6F0, 0x0040}, -{0x01F6FD, 0x0001}, -{0x01F700, 0x0040}, -{0x01F777, 0x0001}, -{0x01F77B, 0x0040}, -{0x01F7DA, 0x0001}, -{0x01F7E0, 0x0040}, -{0x01F7EC, 0x0001}, -{0x01F7F0, 0x0040}, -{0x01F7F1, 0x0001}, -{0x01F800, 0x0040}, -{0x01F80C, 0x0001}, -{0x01F810, 0x0040}, -{0x01F848, 0x0001}, -{0x01F850, 0x0040}, -{0x01F85A, 0x0001}, -{0x01F860, 0x0040}, -{0x01F888, 0x0001}, -{0x01F890, 0x0040}, -{0x01F8AE, 0x0001}, -{0x01F8B0, 0x0040}, -{0x01F8B2, 0x0001}, -{0x01F900, 0x0040}, -{0x01FA54, 0x0001}, -{0x01FA60, 0x0040}, -{0x01FA6E, 0x0001}, -{0x01FA70, 0x0040}, -{0x01FA7D, 0x0001}, -{0x01FA80, 0x0040}, -{0x01FA89, 0x0001}, -{0x01FA90, 0x0040}, -{0x01FABE, 0x0001}, -{0x01FABF, 0x0040}, -{0x01FAC6, 0x0001}, -{0x01FACE, 0x0040}, -{0x01FADC, 0x0001}, -{0x01FAE0, 0x0040}, -{0x01FAE9, 0x0001}, -{0x01FAF0, 0x0040}, -{0x01FAF9, 0x0001}, -{0x01FB00, 0x0040}, -{0x01FB93, 0x0001}, -{0x01FB94, 0x0040}, -{0x01FBCB, 0x0001}, -{0x01FBF0, 0x0002}, -{0x01FBFA, 0x0001}, -{0x020000, 0x0004}, -{0x02A6E0, 0x0001}, -{0x02A700, 0x0004}, -{0x02B73A, 0x0001}, -{0x02B740, 0x0004}, -{0x02B81E, 0x0001}, -{0x02B820, 0x0004}, -{0x02CEA2, 0x0001}, -{0x02CEB0, 0x0004}, -{0x02EBE1, 0x0001}, -{0x02EBF0, 0x0004}, 
-{0x02EE5E, 0x0001}, -{0x02F800, 0x0004}, -{0x02FA1E, 0x0001}, -{0x030000, 0x0004}, -{0x03134B, 0x0001}, -{0x031350, 0x0004}, -{0x0323B0, 0x0001}, -{0x0E0001, 0x0080}, -{0x0E0002, 0x0001}, -{0x0E0020, 0x0080}, -{0x0E0080, 0x0001}, -{0x0E0100, 0x0010}, -{0x0E01F0, 0x0001}, -{0x0F0000, 0x0080}, -{0x0FFFFE, 0x0001}, -{0x100000, 0x0080}, -{0x10FFFE, 0x0001}, -{0x110000, 0x0000}, +const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length +0x03E1, +0x001D, +0x0055, +0x0017, +0x0055, +0x0016, +0x0012, +0x0015, +0x0019, +0x0015, +0x0011, +0x0035, +0x012D, +0x0035, +0x0059, +0x0035, +0x0329, +0x0016, +0x0015, +0x0012, +0x0018, +0x0010, +0x0018, +0x0325, +0x0016, +0x0019, +0x0012, +0x0019, +0x0401, +0x001D, +0x0015, +0x0077, +0x001A, +0x0015, +0x0018, +0x001A, +0x0007, +0x0014, +0x0019, +0x0002, +0x001A, +0x0018, +0x001A, +0x0019, +0x002F, +0x0018, +0x0005, +0x0035, +0x0018, +0x000F, +0x0007, +0x0013, +0x004F, +0x0015, +0x02C9, +0x0019, +0x00C9, +0x02E5, +0x0019, +0x00E5, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, 
+0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0045, +0x0029, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0049, +0x0025, +0x0069, +0x0005, +0x0029, +0x0005, +0x0049, +0x0045, +0x0029, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0029, +0x0005, +0x0049, +0x0005, +0x0009, +0x0005, +0x0029, +0x0025, +0x0007, +0x0009, +0x0045, +0x0067, +0x0009, +0x0008, +0x0005, +0x0009, +0x0008, +0x0005, +0x0009, +0x0008, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0008, +0x0005, +0x0009, +0x0005, +0x0049, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x00C5, +0x0029, +0x0005, +0x0029, +0x0025, +0x0009, +0x0005, +0x0069, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0885, +0x0007, +0x0345, +0x0226, +0x0078, +0x0166, +0x01B8, +0x0086, +0x00D8, +0x0006, +0x0018, +0x0006, +0x0218, +0x0DEC, +0x0009, +0x0005, +0x0009, +0x0005, +0x0006, +0x0018, +0x0009, +0x0005, +0x0020, +0x0006, +0x0045, +0x0015, +0x0009, +0x0060, +0x0038, +0x0009, +0x0015, +0x0049, +0x0000, +0x0009, +0x0000, +0x0029, +0x0005, +0x0209, +0x0000, +0x0109, +0x0445, +0x0009, +0x0025, 
+0x0049, +0x0045, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0085, +0x0009, +0x0005, +0x0019, +0x0009, +0x0005, +0x0029, +0x0025, +0x0649, +0x05E5, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x001A, +0x008C, +0x002B, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, 
+0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0000, +0x04A9, +0x0020, +0x0006, +0x00B5, +0x0505, +0x0015, +0x0011, +0x0020, +0x003A, +0x0017, +0x0000, +0x058C, +0x0011, +0x000C, +0x0015, +0x002C, +0x0015, +0x002C, +0x0015, +0x000C, +0x00E0, +0x0347, +0x0060, +0x0067, +0x0035, +0x0140, +0x00A2, +0x0059, +0x0035, +0x0017, +0x0035, +0x003A, +0x014C, +0x0015, +0x0002, +0x0055, +0x03E7, +0x0006, +0x0127, +0x028C, +0x012D, +0x0075, +0x0027, +0x000C, +0x0C47, +0x0015, +0x0007, +0x00CC, +0x0002, +0x001A, +0x00AC, +0x0026, +0x002C, +0x001A, +0x006C, +0x0027, +0x012D, +0x0047, +0x003A, +0x0007, +0x01B5, +0x0000, +0x0002, +0x0007, +0x000C, +0x03A7, +0x034C, +0x0020, +0x0B07, +0x014C, +0x0007, +0x01A0, +0x012D, +0x0407, +0x010C, +0x0026, +0x001A, +0x0055, +0x0006, +0x0020, +0x000C, +0x0037, +0x02A7, +0x006C, +0x0006, +0x010C, +0x0006, +0x004C, +0x0006, +0x008C, +0x0020, +0x01D5, +0x0000, +0x0307, +0x004C, +0x0020, +0x0015, +0x0000, +0x0147, +0x0080, +0x02E7, +0x0018, +0x00A7, +0x0000, +0x0022, +0x00A0, +0x00EC, +0x0507, +0x0006, +0x02EC, +0x0002, +0x03EC, +0x000A, +0x06A7, +0x000C, +0x000A, +0x000C, +0x0007, +0x004A, +0x00EC, +0x006A, +0x000C, +0x002A, +0x0007, +0x00CC, +0x0127, +0x002C, +0x0035, +0x012D, +0x0015, +0x0006, +0x01C7, +0x000C, +0x002A, +0x0000, +0x00E7, +0x0020, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0007, +0x0040, +0x0067, +0x0020, +0x000C, +0x0007, +0x004A, +0x006C, +0x0020, +0x002A, +0x0020, +0x002A, +0x000C, +0x0007, +0x00E0, +0x000A, +0x0060, +0x0027, +0x0000, +0x0047, +0x002C, +0x0020, +0x012D, +0x0027, +0x0037, +0x00AF, +0x001A, +0x0017, +0x0007, +0x0015, +0x000C, +0x0020, +0x002C, +0x000A, +0x0000, +0x00A7, +0x0060, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0027, +0x0000, +0x0027, +0x0020, +0x000C, +0x0000, +0x004A, +0x002C, +0x0060, +0x002C, +0x0020, +0x004C, +0x0040, +0x000C, +0x00C0, +0x0067, +0x0000, +0x0007, +0x00C0, 
+0x012D, +0x002C, +0x0047, +0x000C, +0x0015, +0x0120, +0x002C, +0x000A, +0x0000, +0x0107, +0x0000, +0x0047, +0x0000, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0087, +0x0020, +0x000C, +0x0007, +0x004A, +0x008C, +0x0000, +0x002C, +0x000A, +0x0000, +0x002A, +0x000C, +0x0020, +0x0007, +0x01C0, +0x0027, +0x002C, +0x0020, +0x012D, +0x0015, +0x0017, +0x00C0, +0x0007, +0x00AC, +0x0000, +0x000C, +0x002A, +0x0000, +0x00E7, +0x0020, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0087, +0x0020, +0x000C, +0x0007, +0x000A, +0x000C, +0x000A, +0x006C, +0x0020, +0x002A, +0x0020, +0x002A, +0x000C, +0x00C0, +0x002C, +0x000A, +0x0060, +0x0027, +0x0000, +0x0047, +0x002C, +0x0020, +0x012D, +0x001A, +0x0007, +0x00AF, +0x0120, +0x000C, +0x0007, +0x0000, +0x00A7, +0x0040, +0x0047, +0x0000, +0x0067, +0x0040, +0x0027, +0x0000, +0x0007, +0x0000, +0x0027, +0x0040, +0x0027, +0x0040, +0x0047, +0x0040, +0x0167, +0x0060, +0x002A, +0x000C, +0x002A, +0x0040, +0x004A, +0x0000, +0x004A, +0x000C, +0x0020, +0x0007, +0x00A0, +0x000A, +0x01A0, +0x012D, +0x004F, +0x00BA, +0x0017, +0x001A, +0x0080, +0x000C, +0x004A, +0x000C, +0x00E7, +0x0000, +0x0047, +0x0000, +0x02C7, +0x0000, +0x01E7, +0x0020, +0x000C, +0x0007, +0x004C, +0x006A, +0x0000, +0x004C, +0x0000, +0x006C, +0x00C0, +0x002C, +0x0000, +0x0047, +0x0020, +0x0007, +0x0020, +0x0027, +0x002C, +0x0020, +0x012D, +0x00C0, +0x0015, +0x00CF, +0x001A, +0x0007, +0x000C, +0x002A, +0x0015, +0x00E7, +0x0000, +0x0047, +0x0000, +0x02C7, +0x0000, +0x0127, +0x0000, +0x0087, +0x0020, +0x000C, +0x0007, +0x000A, +0x000C, +0x008A, +0x0000, +0x000C, +0x002A, +0x0000, +0x002A, +0x002C, +0x00C0, +0x002A, +0x00A0, +0x0027, +0x0000, +0x0027, +0x002C, +0x0020, +0x012D, +0x0000, +0x0027, +0x000A, +0x0160, +0x002C, +0x002A, +0x0107, +0x0000, +0x0047, +0x0000, +0x0507, +0x002C, +0x0007, +0x004A, +0x006C, +0x0000, +0x004A, +0x0000, +0x004A, +0x000C, +0x0007, +0x001A, +0x0060, +0x0047, +0x000A, +0x00CF, +0x0047, +0x002C, +0x0020, +0x012D, 
+0x010F, +0x001A, +0x00A7, +0x0000, +0x000C, +0x002A, +0x0000, +0x0227, +0x0040, +0x02E7, +0x0000, +0x0107, +0x0000, +0x0007, +0x0020, +0x00C7, +0x0040, +0x000C, +0x0060, +0x004A, +0x004C, +0x0000, +0x000C, +0x0000, +0x00EA, +0x00A0, +0x012D, +0x0020, +0x002A, +0x0015, +0x0160, +0x05E7, +0x000C, +0x0027, +0x00CC, +0x0060, +0x0017, +0x00A7, +0x0006, +0x00EC, +0x0015, +0x012D, +0x0035, +0x0480, +0x0027, +0x0000, +0x0007, +0x0000, +0x0087, +0x0000, +0x02E7, +0x0000, +0x0007, +0x0000, +0x0127, +0x000C, +0x0027, +0x010C, +0x0007, +0x0020, +0x0087, +0x0000, +0x0006, +0x0000, +0x00CC, +0x0000, +0x012D, +0x0020, +0x0067, +0x03E0, +0x0007, +0x005A, +0x01D5, +0x001A, +0x0015, +0x005A, +0x002C, +0x00BA, +0x012D, +0x012F, +0x001A, +0x000C, +0x001A, +0x000C, +0x001A, +0x000C, +0x0016, +0x0012, +0x0016, +0x0012, +0x002A, +0x00E7, +0x0000, +0x0467, +0x0060, +0x01AC, +0x000A, +0x008C, +0x0015, +0x002C, +0x0087, +0x014C, +0x0000, +0x046C, +0x0000, +0x00FA, +0x000C, +0x00BA, +0x0000, +0x003A, +0x0095, +0x007A, +0x0035, +0x0480, +0x0547, +0x002A, +0x006C, +0x000A, +0x00AC, +0x000A, +0x002C, +0x002A, +0x002C, +0x0007, +0x012D, +0x00B5, +0x00A7, +0x002A, +0x002C, +0x0067, +0x004C, +0x0007, +0x004A, +0x0027, +0x00CA, +0x0047, +0x006C, +0x0187, +0x000C, +0x002A, +0x002C, +0x00AA, +0x000C, +0x0007, +0x000A, +0x012D, +0x004A, +0x000C, +0x003A, +0x04A9, +0x0000, +0x0009, +0x0080, +0x0009, +0x0020, +0x0545, +0x0015, +0x0006, +0x0045, +0x2907, +0x0000, +0x0067, +0x0020, +0x00C7, +0x0000, +0x0007, +0x0000, +0x0067, +0x0020, +0x0507, +0x0000, +0x0067, +0x0020, +0x0407, +0x0000, +0x0067, +0x0020, +0x00C7, +0x0000, +0x0007, +0x0000, +0x0067, +0x0020, +0x01C7, +0x0000, +0x0707, +0x0000, +0x0067, +0x0020, +0x0847, +0x0020, +0x004C, +0x0115, +0x026F, +0x0040, +0x01E7, +0x013A, +0x00A0, +0x0AA9, +0x0020, +0x00A5, +0x0020, +0x0011, +0x4D67, +0x001A, +0x0015, +0x0207, +0x001D, +0x0327, +0x0016, +0x0012, +0x0040, +0x0947, +0x0055, +0x004E, +0x00E7, +0x00C0, +0x0227, +0x004C, +0x000A, +0x0100, +0x0247, 
+0x002C, +0x000A, +0x0035, +0x0100, +0x0227, +0x002C, +0x0160, +0x0187, +0x0000, +0x0047, +0x0000, +0x002C, +0x0160, +0x0667, +0x002C, +0x000A, +0x00CC, +0x00EA, +0x000C, +0x002A, +0x014C, +0x0055, +0x0006, +0x0055, +0x0017, +0x0007, +0x000C, +0x0020, +0x012D, +0x00A0, +0x012F, +0x00A0, +0x00B5, +0x0011, +0x0075, +0x004C, +0x0002, +0x000C, +0x012D, +0x00A0, +0x0447, +0x0006, +0x0687, +0x00C0, +0x0087, +0x002C, +0x0427, +0x000C, +0x0007, +0x0080, +0x08A7, +0x0120, +0x03C7, +0x0000, +0x004C, +0x006A, +0x002C, +0x004A, +0x0060, +0x002A, +0x000C, +0x00AA, +0x004C, +0x0060, +0x001A, +0x0040, +0x0035, +0x012D, +0x03A7, +0x0020, +0x0087, +0x0140, +0x0567, +0x0060, +0x0327, +0x00A0, +0x012D, +0x000F, +0x0040, +0x043A, +0x02C7, +0x002C, +0x002A, +0x000C, +0x0020, +0x0035, +0x0687, +0x000A, +0x000C, +0x000A, +0x00CC, +0x0000, +0x000C, +0x000A, +0x000C, +0x002A, +0x00EC, +0x00AA, +0x012C, +0x0020, +0x000C, +0x012D, +0x00A0, +0x012D, +0x00A0, +0x00D5, +0x0006, +0x00B5, +0x0020, +0x01AC, +0x000B, +0x01EC, +0x0600, +0x006C, +0x000A, +0x05C7, +0x000C, +0x000A, +0x008C, +0x000A, +0x000C, +0x008A, +0x000C, +0x002A, +0x00E7, +0x0040, +0x012D, +0x00D5, +0x013A, +0x010C, +0x011A, +0x0035, +0x0000, +0x002C, +0x000A, +0x03A7, +0x000A, +0x006C, +0x002A, +0x002C, +0x000A, +0x004C, +0x0027, +0x012D, +0x0567, +0x000C, +0x000A, +0x002C, +0x004A, +0x000C, +0x000A, +0x004C, +0x002A, +0x00E0, +0x0075, +0x0467, +0x00EA, +0x00EC, +0x002A, +0x002C, +0x0040, +0x0095, +0x012D, +0x0040, +0x0047, +0x012D, +0x03A7, +0x00A6, +0x0035, +0x0105, +0x00C0, +0x0549, +0x0020, +0x0049, +0x00F5, +0x00E0, +0x004C, +0x0015, +0x018C, +0x000A, +0x00CC, +0x0067, +0x000C, +0x00A7, +0x000C, +0x0027, +0x000A, +0x002C, +0x0007, +0x0080, +0x0565, +0x07C6, +0x0185, +0x0006, +0x0425, +0x0486, +0x07EC, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0105, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0105, +0x00E9, +0x00A5, +0x0020, +0x00A9, +0x0020, +0x00E5, +0x00E9, +0x00E5, +0x00E9, +0x00A5, +0x0020, +0x00A9, +0x0020, +0x00E5, +0x0000, +0x0009, +0x0000, +0x0009, +0x0000, +0x0009, +0x0000, +0x0009, +0x00E5, +0x00E9, +0x01A5, +0x0020, +0x00E5, +0x00E8, +0x00E5, +0x00E8, +0x00E5, +0x00E8, +0x0085, +0x0000, +0x0025, +0x0069, +0x0008, +0x0018, +0x0005, +0x0058, +0x0045, +0x0000, +0x0025, +0x0069, +0x0008, +0x0058, +0x0065, +0x0020, +0x0025, +0x0069, +0x0000, +0x0058, +0x00E5, +0x0089, +0x0058, +0x0020, +0x0045, +0x0000, +0x0025, +0x0069, +0x0008, +0x0038, +0x0000, +0x015D, +0x0082, +0x00B1, +0x0035, +0x0014, +0x0013, +0x0016, +0x0034, +0x0013, +0x0016, +0x0014, +0x00F5, +0x001B, +0x001C, +0x0082, +0x001D, +0x0115, +0x0014, +0x0013, +0x0075, +0x0030, +0x0055, +0x0019, +0x0016, +0x0012, +0x0155, +0x0019, +0x0015, +0x0010, +0x0135, +0x001D, +0x0082, +0x0000, +0x0122, +0x000F, +0x0006, +0x0020, +0x00AF, +0x0059, +0x0016, +0x0012, +0x0006, +0x012F, +0x0059, +0x0016, +0x0012, +0x0000, +0x0186, +0x0040, +0x0417, +0x01C0, +0x018C, +0x006B, +0x000C, +0x004B, +0x016C, +0x01C0, +0x003A, +0x0009, +0x007A, +0x0009, +0x003A, +0x0005, +0x0049, +0x0025, +0x0049, +0x0005, +0x001A, +0x0009, +0x003A, +0x0019, +0x0089, +0x00BA, +0x0009, +0x001A, +0x0009, +0x001A, +0x0009, +0x001A, +0x0069, +0x001A, +0x0005, +0x0069, +0x0005, +0x0067, +0x0005, +0x003A, +0x0025, +0x0029, +0x0099, +0x0009, +0x0065, +0x001A, +0x0019, +0x003A, +0x0005, +0x001A, +0x01EF, +0x044E, +0x0009, +0x0005, +0x006E, +0x000F, +0x003A, +0x0060, +0x0099, +0x009A, +0x0039, +0x007A, +0x0019, +0x003A, +0x0019, +0x003A, +0x0019, +0x00DA, +0x0019, +0x03DA, +0x0039, +0x003A, +0x0019, +0x001A, +0x0019, +0x03DA, +0x2179, +0x00FA, +0x0016, +0x0012, +0x0016, +0x0012, +0x027A, +0x0039, +0x00DA, +0x0016, +0x0012, +0x0A1A, +0x0019, +0x03BA, +0x0319, +0x04FA, +0x00B9, +0x089A, +0x0300, +0x015A, +0x0280, +0x076F, +0x09BA, +0x02AF, +0x16DA, +0x0019, +0x011A, +0x0019, +0x06BA, +0x00F9, +0x0DDA, +0x0019, +0x1EFA, +0x0016, +0x0012, 
+0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x03AF, +0x057A, +0x0099, +0x0016, +0x0012, +0x03D9, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x01F9, +0x1FFA, +0x1059, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x07D9, +0x0016, +0x0012, +0x0016, +0x0012, +0x03F9, +0x0016, +0x0012, +0x2039, +0x05FA, +0x0299, +0x003A, +0x00B9, +0x04DA, +0x0020, +0x03FA, +0x0000, +0x0D1A, +0x05E9, +0x05E5, +0x0009, +0x0005, +0x0049, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0069, +0x0005, +0x0009, +0x0025, +0x0009, +0x00A5, +0x0026, +0x0049, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x00BA, +0x0009, +0x0005, +0x0009, +0x0005, +0x004C, +0x0009, +0x0005, +0x0080, +0x0075, +0x000F, +0x0035, +0x04A5, +0x0000, +0x0005, +0x0080, +0x0005, +0x0020, +0x06E7, +0x00C0, +0x0006, +0x0015, +0x01A0, +0x000C, +0x02C7, +0x0100, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, 
+0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x03EC, +0x0035, +0x0014, +0x0013, +0x0014, +0x0013, +0x0055, +0x0014, +0x0013, +0x0015, +0x0014, +0x0013, +0x0115, +0x0011, +0x0035, +0x0011, +0x0015, +0x0014, +0x0013, +0x0035, +0x0014, +0x0013, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0095, +0x0006, +0x0135, +0x0031, +0x0075, +0x0011, +0x0015, +0x0016, +0x0195, +0x003A, +0x0055, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0011, +0x0420, +0x033A, +0x0000, +0x0B1A, +0x0160, +0x1ABA, +0x0320, +0x01FA, +0x001D, +0x0055, +0x001A, +0x0006, +0x0007, +0x000E, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x003A, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0011, +0x0016, +0x0032, +0x001A, +0x010E, +0x006C, +0x002A, +0x0011, +0x0086, +0x003A, +0x004E, +0x0006, +0x0007, +0x0015, +0x003A, +0x0000, +0x0AA7, +0x0020, +0x002C, +0x0038, +0x0026, +0x0007, +0x0011, +0x0B27, +0x0015, +0x0046, +0x0007, +0x0080, +0x0547, +0x0000, +0x0BA7, +0x0000, +0x003A, +0x006F, +0x013A, +0x03E7, +0x047A, +0x0140, +0x001A, +0x01E7, +0x03DA, +0x0000, +0x012F, +0x03BA, +0x00EF, +0x001A, +0x01CF, +0x03FA, +0x012F, +0x04DA, +0x01CF, +0x27FA, +0xFFE7, +0xFFE7, +0xFFE7, +0x37E7, +0x07FA, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0x4287, +0x0006, +0x8EC7, +0x0040, +0x06DA, +0x0100, +0x04E7, +0x00A6, +0x0035, +0x2167, +0x0006, +0x0055, +0x01E7, +0x012D, +0x0027, +0x0260, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, 
+0x0007, +0x000C, +0x004B, +0x0015, +0x012C, +0x0015, +0x0006, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0026, +0x002C, +0x08A7, +0x012E, +0x002C, +0x00B5, +0x00E0, +0x02D8, +0x0106, +0x0038, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0045, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0006, +0x00E5, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0006, +0x0038, +0x0009, +0x0005, +0x0009, +0x0005, +0x0007, +0x0009, +0x0005, +0x0009, +0x0045, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0089, +0x0005, +0x0089, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0069, +0x0005, +0x0009, +0x0005, +0x0080, +0x0009, +0x0005, +0x0000, +0x0005, +0x0000, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x02E0, +0x0046, +0x0009, +0x0005, +0x0007, +0x0026, +0x0005, +0x00C7, +0x000C, +0x0047, +0x000C, +0x0067, +0x000C, +0x02C7, +0x002A, +0x002C, +0x000A, +0x007A, +0x000C, +0x0040, +0x00AF, 
+0x003A, +0x0017, +0x001A, +0x00A0, +0x0667, +0x0075, +0x00E0, +0x002A, +0x0627, +0x01EA, +0x002C, +0x00E0, +0x0035, +0x012D, +0x00A0, +0x022C, +0x00A7, +0x0055, +0x0007, +0x0015, +0x0027, +0x000C, +0x012D, +0x0367, +0x00EC, +0x0035, +0x02C7, +0x014C, +0x002A, +0x0140, +0x0015, +0x0387, +0x0040, +0x004C, +0x000A, +0x05C7, +0x000C, +0x002A, +0x006C, +0x002A, +0x002C, +0x004A, +0x0195, +0x0000, +0x0006, +0x012D, +0x0060, +0x0035, +0x0087, +0x000C, +0x0006, +0x0107, +0x012D, +0x0087, +0x0000, +0x0507, +0x00AC, +0x002A, +0x002C, +0x002A, +0x002C, +0x0100, +0x0047, +0x000C, +0x00E7, +0x000C, +0x000A, +0x0020, +0x012D, +0x0020, +0x0075, +0x01E7, +0x0006, +0x00A7, +0x005A, +0x0007, +0x000A, +0x000C, +0x000A, +0x0627, +0x000C, +0x0007, +0x004C, +0x0027, +0x002C, +0x0087, +0x002C, +0x0007, +0x000C, +0x0007, +0x02E0, +0x0027, +0x0006, +0x0035, +0x0147, +0x000A, +0x002C, +0x002A, +0x0035, +0x0007, +0x0026, +0x000A, +0x000C, +0x0120, +0x00A7, +0x0020, +0x00A7, +0x0020, +0x00A7, +0x0100, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x0545, +0x0018, +0x0066, +0x0105, +0x0006, +0x0038, +0x0060, +0x09E5, +0x0447, +0x002A, +0x000C, +0x002A, +0x000C, +0x002A, +0x0015, +0x000A, +0x000C, +0x0020, +0x012D, +0x00A0, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0x7467, +0x0160, +0x02C7, +0x0060, +0x0607, +0x0060, +0xFFE4, +0xFFE3, +0xFFE3, +0xFFE3, +0x1FE3, +0x2DA7, +0x0020, +0x0D27, +0x04A0, +0x00C5, +0x0160, +0x0085, +0x0080, +0x0007, +0x000C, +0x0127, +0x0019, +0x0187, +0x0000, +0x0087, +0x0000, +0x0007, +0x0000, +0x0027, +0x0000, +0x0027, +0x0000, +0x0D67, +0x0218, +0x01E0, +0x2D47, +0x0012, +0x0016, +0x01FA, +0x07E7, +0x0020, +0x06A7, +0x00C0, +0x001A, +0x03E0, +0x0167, +0x0017, +0x005A, +0x01EC, +0x00D5, +0x0016, +0x0012, +0x0015, +0x00A0, +0x01EC, +0x0015, +0x0031, +0x0030, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0035, +0x0016, +0x0012, +0x0075, +0x0050, +0x0055, +0x0000, +0x0075, 
+0x0011, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0055, +0x0019, +0x0011, +0x0059, +0x0000, +0x0015, +0x0017, +0x0035, +0x0060, +0x0087, +0x0000, +0x10C7, +0x0020, +0x0002, +0x0000, +0x0055, +0x0017, +0x0055, +0x0016, +0x0012, +0x0015, +0x0019, +0x0015, +0x0011, +0x0035, +0x012D, +0x0035, +0x0059, +0x0035, +0x0329, +0x0016, +0x0015, +0x0012, +0x0018, +0x0010, +0x0018, +0x0325, +0x0016, +0x0019, +0x0012, +0x0019, +0x0016, +0x0012, +0x0015, +0x0016, +0x0012, +0x0035, +0x0127, +0x0006, +0x0587, +0x0026, +0x03C7, +0x0040, +0x00A7, +0x0020, +0x00A7, +0x0020, +0x00A7, +0x0020, +0x0047, +0x0040, +0x0037, +0x0019, +0x0018, +0x001A, +0x0037, +0x0000, +0x001A, +0x0079, +0x003A, +0x0120, +0x0042, +0x003A, +0x0020, +0x0167, +0x0000, +0x0327, +0x0000, +0x0247, +0x0000, +0x0027, +0x0000, +0x01C7, +0x0020, +0x01A7, +0x0420, +0x0F47, +0x0080, +0x0055, +0x0060, +0x058F, +0x0040, +0x011A, +0x068E, +0x006F, +0x021A, +0x002F, +0x005A, +0x0000, +0x019A, +0x0040, +0x001A, +0x05C0, +0x059A, +0x000C, +0x1020, +0x0387, +0x0040, +0x0607, +0x01C0, +0x000C, +0x034F, +0x0060, +0x03E7, +0x006F, +0x0100, +0x0267, +0x000E, +0x00E7, +0x000E, +0x0080, +0x04A7, +0x008C, +0x0080, +0x03A7, +0x0000, +0x0015, +0x0467, +0x0060, +0x00E7, +0x0015, +0x008E, +0x0520, +0x04E9, +0x04E5, +0x09A7, +0x0020, +0x012D, +0x00A0, +0x0469, +0x0060, +0x0465, +0x0060, +0x04E7, +0x00E0, +0x0667, +0x0140, +0x0015, +0x0149, +0x0000, +0x01C9, +0x0000, +0x00C9, +0x0000, +0x0029, +0x0000, +0x0145, +0x0000, +0x01C5, +0x0000, +0x00C5, +0x0000, +0x0025, +0x0840, +0x26C7, +0x0100, +0x02A7, +0x0120, +0x00E7, +0x02E0, +0x00A6, +0x0000, +0x0526, +0x0000, +0x0106, +0x0880, +0x00A7, +0x0020, +0x0007, +0x0000, +0x0567, +0x0000, +0x0027, +0x0040, +0x0007, +0x0020, +0x02C7, +0x0000, +0x0015, +0x00EF, +0x02C7, +0x003A, +0x00CF, +0x03C7, +0x00E0, +0x010F, +0x05E0, +0x0247, +0x0000, +0x0027, +0x0080, +0x008F, +0x02A7, +0x00AF, +0x0040, +0x0015, +0x0327, +0x0080, +0x0015, +0x07E0, +0x06E7, +0x0060, +0x002F, +0x0027, +0x01EF, 
+0x0020, +0x05AF, +0x0007, +0x004C, +0x0000, +0x002C, +0x0080, +0x006C, +0x0067, +0x0000, +0x0047, +0x0000, +0x0387, +0x0020, +0x004C, +0x0060, +0x000C, +0x010F, +0x00C0, +0x0115, +0x00C0, +0x0387, +0x002F, +0x0015, +0x0387, +0x004F, +0x03E0, +0x00E7, +0x001A, +0x0367, +0x002C, +0x0060, +0x008F, +0x00D5, +0x0100, +0x06A7, +0x0040, +0x00D5, +0x02A7, +0x0020, +0x00EF, +0x0247, +0x0080, +0x00EF, +0x0227, +0x00C0, +0x0075, +0x0160, +0x00CF, +0x09E0, +0x0907, +0x06C0, +0x0649, +0x0180, +0x0645, +0x00C0, +0x00AF, +0x0467, +0x006C, +0x00E0, +0x012D, +0x24A0, +0x03CF, +0x0000, +0x0527, +0x0000, +0x002C, +0x0011, +0x0020, +0x0027, +0x0940, +0x004C, +0x0387, +0x012F, +0x0007, +0x00E0, +0x02A7, +0x014C, +0x006F, +0x0095, +0x02A0, +0x0227, +0x006C, +0x0075, +0x04A0, +0x0287, +0x00CF, +0x0260, +0x02C7, +0x0100, +0x000A, +0x000C, +0x000A, +0x0687, +0x01CC, +0x00D5, +0x0060, +0x026F, +0x012D, +0x000C, +0x0027, +0x002C, +0x0007, +0x0100, +0x004C, +0x000A, +0x0587, +0x004A, +0x006C, +0x002A, +0x002C, +0x0035, +0x0002, +0x0075, +0x000C, +0x0120, +0x0002, +0x0020, +0x0307, +0x00C0, +0x012D, +0x00A0, +0x004C, +0x0467, +0x008C, +0x000A, +0x00EC, +0x0000, +0x012D, +0x0075, +0x0007, +0x002A, +0x0007, +0x00E0, +0x0447, +0x000C, +0x0035, +0x0007, +0x0100, +0x002C, +0x000A, +0x05E7, +0x004A, +0x010C, +0x002A, +0x0067, +0x0075, +0x006C, +0x0015, +0x000A, +0x000C, +0x012D, +0x0007, +0x0015, +0x0007, +0x0055, +0x0000, +0x026F, +0x0140, +0x0227, +0x0000, +0x0307, +0x004A, +0x004C, +0x002A, +0x000C, +0x000A, +0x002C, +0x00B5, +0x000C, +0x0027, +0x000C, +0x07A0, +0x00C7, +0x0000, +0x0007, +0x0000, +0x0067, +0x0000, +0x01C7, +0x0000, +0x0127, +0x0015, +0x00A0, +0x05C7, +0x000C, +0x004A, +0x00EC, +0x0080, +0x012D, +0x00A0, +0x002C, +0x002A, +0x0000, +0x00E7, +0x0020, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0087, +0x0000, +0x002C, +0x0007, +0x002A, +0x000C, +0x006A, +0x0020, +0x002A, +0x0020, +0x004A, +0x0020, +0x0007, +0x00A0, +0x000A, +0x0080, +0x0087, +0x002A, 
+0x0020, +0x00CC, +0x0040, +0x008C, +0x1140, +0x0687, +0x004A, +0x00EC, +0x002A, +0x004C, +0x000A, +0x000C, +0x0067, +0x0095, +0x012D, +0x0035, +0x0000, +0x0015, +0x000C, +0x0047, +0x03A0, +0x05E7, +0x004A, +0x00AC, +0x000A, +0x000C, +0x006A, +0x002C, +0x000A, +0x002C, +0x0027, +0x0015, +0x0007, +0x00E0, +0x012D, +0x14A0, +0x05C7, +0x004A, +0x006C, +0x0020, +0x006A, +0x002C, +0x000A, +0x002C, +0x02D5, +0x0067, +0x002C, +0x0420, +0x05E7, +0x004A, +0x00EC, +0x002A, +0x000C, +0x000A, +0x002C, +0x0055, +0x0007, +0x0140, +0x012D, +0x00A0, +0x0195, +0x0240, +0x0547, +0x000C, +0x000A, +0x000C, +0x002A, +0x00AC, +0x000A, +0x000C, +0x0007, +0x0015, +0x00A0, +0x012D, +0x06A0, +0x0347, +0x0020, +0x004C, +0x002A, +0x006C, +0x000A, +0x008C, +0x0060, +0x012D, +0x002F, +0x0055, +0x001A, +0x00C7, +0x1700, +0x0567, +0x004A, +0x010C, +0x000A, +0x002C, +0x0015, +0x0C60, +0x03E9, +0x03E5, +0x012D, +0x010F, +0x0160, +0x00E7, +0x0020, +0x0007, +0x0020, +0x00E7, +0x0000, +0x0027, +0x0000, +0x02E7, +0x00AA, +0x0000, +0x002A, +0x0020, +0x002C, +0x000A, +0x000C, +0x0007, +0x000A, +0x0007, +0x000A, +0x000C, +0x0055, +0x0100, +0x012D, +0x08A0, +0x00E7, +0x0020, +0x04C7, +0x004A, +0x006C, +0x0020, +0x002C, +0x006A, +0x000C, +0x0007, +0x0015, +0x0007, +0x000A, +0x0340, +0x0007, +0x012C, +0x04E7, +0x00AC, +0x000A, +0x0007, +0x006C, +0x00F5, +0x000C, +0x00E0, +0x0007, +0x00AC, +0x002A, +0x004C, +0x05A7, +0x018C, +0x000A, +0x002C, +0x0055, +0x0007, +0x0095, +0x0180, +0x0907, +0x00C0, +0x0135, +0x1EA0, +0x0107, +0x0000, +0x0487, +0x000A, +0x00CC, +0x0000, +0x00AC, +0x000A, +0x000C, +0x0007, +0x0095, +0x0120, +0x012D, +0x024F, +0x0040, +0x0035, +0x03A7, +0x0020, +0x02AC, +0x0000, +0x000A, +0x00CC, +0x000A, +0x002C, +0x000A, +0x002C, +0x0900, +0x00C7, +0x0000, +0x0027, +0x0000, +0x04A7, +0x00AC, +0x0040, +0x000C, +0x0000, +0x002C, +0x0000, +0x00CC, +0x0007, +0x000C, +0x00E0, +0x012D, +0x00A0, +0x00A7, +0x0000, +0x0027, +0x0000, +0x03E7, +0x008A, +0x0000, +0x002C, +0x0000, +0x002A, +0x000C, +0x000A, 
+0x000C, +0x0007, +0x00C0, +0x012D, +0x26A0, +0x0247, +0x002C, +0x002A, +0x0035, +0x00C0, +0x002C, +0x0007, +0x000A, +0x0187, +0x0000, +0x0427, +0x002A, +0x008C, +0x0040, +0x002A, +0x000C, +0x000A, +0x000C, +0x0195, +0x012D, +0x0AA0, +0x0007, +0x01C0, +0x028F, +0x00FA, +0x0077, +0x021A, +0x0180, +0x0015, +0x7327, +0x0CA0, +0x0DCE, +0x0000, +0x0095, +0x0140, +0x1867, +0xFFE0, +0x4960, +0x0C07, +0x0035, +0x0180, +0x85E7, +0x01E2, +0x000C, +0x00A7, +0x01CC, +0xFFE0, +0xF520, +0x48C7, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0x3700, +0x4707, +0x00C0, +0x03C7, +0x0000, +0x012D, +0x0060, +0x0035, +0x09C7, +0x0000, +0x012D, +0x00A0, +0x03A7, +0x0020, +0x008C, +0x0015, +0x0120, +0x05E7, +0x00CC, +0x0095, +0x007A, +0x0066, +0x0015, +0x001A, +0x0120, +0x012D, +0x0000, +0x00CF, +0x0000, +0x0287, +0x0080, +0x0247, +0x55E0, +0x03E9, +0x03E5, +0x02CF, +0x0075, +0x0C80, +0x0947, +0x0060, +0x000C, +0x0007, +0x06CA, +0x00C0, +0x006C, +0x0186, +0x07E0, +0x0026, +0x0015, +0x0006, +0x000C, +0x0140, +0x002A, +0x01A0, +0xFFE7, +0xFFE7, +0xFEE7, +0x00E0, +0x9AA7, +0x0520, +0x0107, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0x5CC0, +0x0066, +0x0000, +0x00C6, +0x0000, +0x0026, +0x0000, +0x2447, +0x01C0, +0x0007, +0x0380, +0x0047, +0x0020, +0x0007, +0x01A0, +0x0067, +0x00E0, +0x3167, +0xFFE0, +0x2060, +0x0D47, +0x0080, +0x0187, +0x0040, +0x0107, +0x00C0, +0x0127, +0x0020, +0x001A, +0x002C, +0x0015, +0x0062, +0xFFE0, +0xFFE0, +0x4B60, +0x05AC, +0x0020, +0x02CC, +0x0100, +0x0E7A, +0x0760, +0x1EBA, +0x0120, +0x04DA, +0x0020, +0x077A, +0x002A, +0x004C, +0x005A, +0x00AA, +0x00E2, +0x00EC, +0x003A, +0x00CC, +0x03BA, +0x006C, +0x079A, +0x0280, +0x083A, +0x004C, +0x001A, +0x0F20, +0x026F, +0x0160, +0x026F, +0x0160, +0x0ADA, +0x0100, +0x030F, +0x10C0, +0x0329, +0x0325, +0x0329, +0x00C5, +0x0000, +0x0225, +0x0329, +0x0325, +0x0009, +0x0000, +0x0029, +0x0020, +0x0009, +0x0020, +0x0029, +0x0020, +0x0069, +0x0000, +0x00E9, +0x0065, +0x0000, +0x0005, +0x0000, +0x00C5, +0x0000, +0x0145, +0x0329, +0x0325, +0x0029, 
+0x0000, +0x0069, +0x0020, +0x00E9, +0x0000, +0x00C9, +0x0000, +0x0325, +0x0029, +0x0000, +0x0069, +0x0000, +0x0089, +0x0000, +0x0009, +0x0040, +0x00C9, +0x0000, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0365, +0x0020, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0009, +0x0005, +0x0020, +0x062D, +0x3FFA, +0x06CC, +0x007A, +0x062C, +0x00FA, +0x000C, +0x01BA, +0x000C, +0x003A, +0x0095, +0x01C0, +0x008C, +0x0000, +0x01CC, +0x89E0, +0x0125, +0x0007, +0x0265, +0x00A0, +0x00A5, +0x1A80, +0x00CC, +0x0000, +0x020C, +0x0020, +0x00CC, +0x0000, +0x002C, +0x0000, +0x008C, +0x0080, +0x07A6, +0x0400, +0x000C, +0x0DE0, +0x0587, +0x0040, +0x00CC, +0x00C6, +0x0020, +0x012D, +0x0060, +0x0007, +0x001A, +0x27E0, +0x03A7, +0x000C, +0x0200, +0x0567, +0x006C, +0x012D, +0x0080, +0x0017, +0x39E0, +0x0347, +0x0006, +0x006C, +0x012D, +0x5CA0, +0x00C7, +0x0000, +0x0067, +0x0000, +0x0027, +0x0000, +0x01C7, +0x0000, +0x1887, +0x0020, +0x010F, +0x00CC, +0x0500, +0x0429, +0x0425, +0x00CC, +0x0006, +0x0060, +0x012D, +0x0060, +0x0035, +0x6200, +0x074F, +0x001A, +0x004F, +0x0017, +0x006F, +0x0960, +0x058F, +0x001A, +0x01CF, +0x1820, +0x0067, +0x0000, +0x0347, +0x0000, +0x0027, +0x0000, +0x0007, +0x0020, +0x0007, +0x0000, +0x0127, +0x0000, +0x0067, +0x0000, +0x0007, +0x0000, +0x0007, +0x00A0, +0x0007, +0x0060, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0047, +0x0000, +0x0027, +0x0000, +0x0007, +0x0020, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0027, +0x0000, +0x0007, +0x0020, +0x0067, +0x0000, +0x00C7, +0x0000, +0x0067, +0x0000, +0x0067, +0x0000, +0x0007, +0x0000, +0x0127, +0x0000, +0x0207, +0x0080, +0x0047, +0x0000, +0x0087, +0x0000, +0x0207, +0x0660, +0x0039, +0x21A0, +0x057A, +0x0060, 
+0x0C7A, +0x0160, +0x01DA, +0x0020, +0x01DA, +0x0000, +0x01DA, +0x0000, +0x049A, +0x0120, +0x018F, +0x141A, +0x06E0, +0x039A, +0x0180, +0x057A, +0x0060, +0x011A, +0x00C0, +0x003A, +0x01A0, +0x00BA, +0x1320, +0x1F5A, +0x0098, +0x5AFA, +0x0060, +0x021A, +0x0040, +0x019A, +0x0040, +0x0EDA, +0x0060, +0x0BDA, +0x00A0, +0x017A, +0x0060, +0x001A, +0x01C0, +0x017A, +0x0060, +0x06FA, +0x00E0, +0x013A, +0x00A0, +0x04FA, +0x00E0, +0x03BA, +0x0020, +0x003A, +0x09A0, +0x2A7A, +0x0160, +0x01BA, +0x0020, +0x019A, +0x0040, +0x011A, +0x00C0, +0x05BA, +0x0000, +0x00DA, +0x00E0, +0x01BA, +0x0060, +0x011A, +0x00C0, +0x011A, +0x00C0, +0x125A, +0x0000, +0x06DA, +0x0480, +0x012D, +0x80A0, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xDBE7, +0x03E0, +0xFFE7, +0xFFE7, +0x0727, +0x00A0, +0x1BA7, +0x0020, +0xFFE7, +0xFFE7, +0xD027, +0x01A0, +0xFFE7, +0xFFE7, +0xFFE7, +0xA607, +0x01C0, +0x4DA7, +0xFFE0, +0x3420, +0x43A7, +0xBC20, +0xFFE7, +0xFFE7, +0x6947, +0x0080, +0xFFE7, +0xFFE7, +0x0BE7, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, 
+0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, 
+0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0x8A00, +0x0002, +0x03A0, +0x0BE2, +0x0FE0, +0x1DEC, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xC1E0, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFA3, +0x0020, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFA3, +0x0020, }; const std::unordered_set unicode_set_whitespace = { @@ -7030,3 +9274,4 @@ const std::vector unicode_ranges_nfd = { // start, last, nfd {0x02FA1C, 0x02FA1C, 0x009F3B}, {0x02FA1D, 0x02FA1D, 0x02A600}, }; + diff --git a/src/unicode-data.h b/src/unicode-data.h index e27fe1770710a..cd6a6451a278f 100644 --- a/src/unicode-data.h +++ b/src/unicode-data.h @@ -13,7 +13,7 @@ struct range_nfd { static const uint32_t MAX_CODEPOINTS = 0x110000; -extern const std::vector> unicode_ranges_flags; +extern const std::vector unicode_rle_codepoints_categs; extern const std::unordered_set unicode_set_whitespace; extern const std::unordered_map unicode_map_lowercase; extern const std::unordered_map unicode_map_uppercase; From 2636cb61703d5984dc94d3d757d0c506ec783846 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 20 Jul 
2024 23:19:42 +0200 Subject: [PATCH 05/29] Decode unicode data categories --- scripts/gen-unicode-data.py | 3 +- src/unicode.cpp | 77 ++++++++++++++++++------------------- src/unicode.h | 26 +++++++++++++ 3 files changed, 65 insertions(+), 41 deletions(-) diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 55ac0af12c29f..542a9edbac582 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -49,6 +49,7 @@ def unicode_data_iter(): yield (cpt, cpt_lower, cpt_upper, categ, bidir) +# see codepoint_categ::from_index() in unicode.h UNICODE_CATEGORY_TO_INDEX = { "Cn": 0, # \p{Cn} Undefined "Cc": 1, # \p{Cc} Control @@ -123,7 +124,7 @@ def unicode_data_iter(): table_nfd.sort() -# run length encoding +# run length encoding, see unicode_cpt_category() in unicode.cpp assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32) codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length for cpt, categ in enumerate(codepoint_categs[1:], 1): diff --git a/src/unicode.cpp b/src/unicode.cpp index e05fb9d1775dd..a78c59f740348 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -113,38 +113,6 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { // return result; //} -static std::vector unicode_cpt_flags_array() { - std::vector cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); - - assert (unicode_ranges_flags.front().first == 0); - assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS); - for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) { - const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags - const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags - for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) { - cpt_flags[cpt] = range_ini.second; - } - } - - for (auto cpt : unicode_set_whitespace) { - cpt_flags[cpt].is_whitespace = true; - } - - for (auto p : unicode_map_lowercase) { - cpt_flags[p.second].is_lowercase = true; - } - - for 
(auto p : unicode_map_uppercase) { - cpt_flags[p.second].is_uppercase = true; - } - - for (auto &range : unicode_ranges_nfd) { // start, last, nfd - cpt_flags[range.nfd].is_nfd = true; - } - - return cpt_flags; -} - static std::unordered_map unicode_byte_to_utf8_map() { std::unordered_map map; for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~' @@ -606,19 +574,48 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { return result; } -codepoint_flags unicode_cpt_flags(const uint32_t cp) { - static const codepoint_flags undef(codepoint_flags::UNDEFINED); - static const auto cpt_flags = unicode_cpt_flags_array(); - return cp < cpt_flags.size() ? cpt_flags[cp] : undef; +codepoint_categ unicode_cpt_category(const uint32_t cp) { + static const std::vector cpt_categs = [] { + std::vector cpt_categs(MAX_CODEPOINTS, codepoint_categ::UNDEF); + uint32_t cpt = 0; + for (uint16_t rle : unicode_rle_codepoints_categs) { + const uint32_t index = rle & 31; + const uint32_t count = rle >> 5; + const auto categ = codepoint_categ::from_index(index); + //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); + for (uint32_t i = 0; i <= count; ++i) { + cpt_categs[cpt++] = categ; + } + } + assert (cpt == MAX_CODEPOINTS); + + for (auto cpt : unicode_set_whitespace) { + cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + } + + for (auto p : unicode_map_lowercase) { + cpt_categs[cpt].set_flag(codepoint_categ::LOWERCASE); + } + + for (auto p : unicode_map_uppercase) { + cpt_categs[cpt].set_flag(codepoint_categ::UPPERCASE); + } + + //for (auto &range : unicode_ranges_nfd) { // start, last, nfd + // cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD); + //} + + return cpt_categs; + }(); + return cp < cpt_categs.size() ? 
cpt_categs[cp] : codepoint_categ{}; } -codepoint_flags unicode_cpt_flags(const std::string & utf8) { - static const codepoint_flags undef(codepoint_flags::UNDEFINED); +codepoint_categ unicode_cpt_category(const std::string & utf8) { if (utf8.empty()) { - return undef; // undefined + return codepoint_categ{}; // undefined } size_t offset = 0; - return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset)); + return unicode_cpt_category(unicode_cpt_from_utf8(utf8, offset)); } std::string unicode_byte_to_utf8(uint8_t byte) { diff --git a/src/unicode.h b/src/unicode.h index f9f4fcc8cc7a0..e8928f261445d 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include struct codepoint_categ { enum _category : uint16_t { @@ -59,6 +61,18 @@ struct codepoint_categ { inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {} + static codepoint_categ from_index(int index) { + static const std::array table = { + UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF + }; + return (size_t)index < table.size() ? table[index] : table[0]; + } + + inline void set_flag(_flags flags, bool value = true) { + flags = (_flags) (flags & ~SUBMASK); // ignore category bits + encoded = value ? 
(encoded | flags) : (encoded & ~flags); + } + inline uint8_t get_category() const { return encoded & MASK; } inline uint8_t get_subcategory() const { return encoded & SUBMASK; } @@ -107,6 +121,18 @@ struct codepoint_categ { inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; } inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; } + const char * c_str() const { + static const std::map map = { + {UNDEF, "UNDEF"}, {C, "C"}, {L, "L"}, {M, "M"}, {N, "N"}, {P, "P"}, {S, "S"}, {Z, "Z"}, + {Cc, "Cc"}, {Cf, "Cf"}, {Co, "Co"}, {Cs, "Cs"}, {Ll, "Ll"}, {Lm, "Lm"}, {Lo, "Lo"}, {Lt, "Lt"}, + {Lu, "Lu"}, {Mc, "Mc"}, {Me, "Me"}, {Mn, "Mn"}, {Nd, "Nd"}, {Nl, "Nl"}, {No, "No"}, {Pc, "Pc"}, + {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"}, + {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"}, + }; + const auto it = map.find(encoded & SUBMASK); + return it == map.end() ? "INVALID" : it->second; + } + uint16_t encoded; }; From 23cf064e3bcd1ca6502484c47b23cb48f4cb4321 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 20 Jul 2024 23:28:05 +0200 Subject: [PATCH 06/29] Replace 'codepoint_flags' with 'codepoint_categ' --- src/llama.cpp | 12 +++---- src/unicode.cpp | 84 +++++++++++++++++++++++++------------------------ src/unicode.h | 8 +++++ 3 files changed, 57 insertions(+), 47 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7d68ed8111873..e8dcc9ff348da 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15836,22 +15836,22 @@ struct llm_tokenizer_wpm { std::vector words(1, ""); for (const uint32_t cpt : cpts_nfd) { - const auto flags = unicode_cpt_flags(cpt); + const auto categ = unicode_cpt_category(cpt); - if (flags.is_whitespace) { + if (categ.is_whitespace()) { if (words.back().size()) { // finish previous word if any words.emplace_back(); } continue; } - assert (!flags.is_separator); - if (cpt == 0 || cpt == 0xFFFD || flags.is_control) { + assert (!categ.is_S()); + if (cpt == 0 || cpt == 
0xFFFD || categ.is_C()) { continue; } const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt)); - if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { + if (categ.is_P() || (cpt < 0x7F && categ.is_S()) || is_chinese_char(cpt)) { if (words.back().size()) { // finish previous word if any words.emplace_back(); } @@ -15869,7 +15869,7 @@ struct llm_tokenizer_wpm { return words; } - static bool is_chinese_char(uint32_t cpt) { + static bool is_chinese_char(uint32_t cpt) { //TODO: move to unicode-data.cpp? unicode_cpt_category(cpt).is_chinese()? return (cpt >= 0x04E00 && cpt <= 0x09FFF) || (cpt >= 0x03400 && cpt <= 0x04DBF) || diff --git a/src/unicode.cpp b/src/unicode.cpp index a78c59f740348..4c33359743dcc 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -203,8 +203,9 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - auto _get_flags = [&] (const size_t pos) -> codepoint_flags { - return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; + static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; + auto _get_categ = [&] (const size_t pos) -> codepoint_categ { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL; }; size_t _prev_end = offset_ini; @@ -226,7 +227,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { const uint32_t cpt = _get_cpt(pos); - const auto flags = _get_flags(pos); + const auto categ = _get_categ(pos); // regex: 's|'t|'re|'ve|'m|'ll|'d if (cpt == '\'' && pos+1 < offset_end) { @@ -246,37 +247,37 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t } } - auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); + auto categ2 = (cpt == ' ' ? 
_get_categ(pos+1) : categ); // regex: ?\p{L}+ - if (flags2.is_letter) { + if (categ2.is_L()) { pos += (cpt == ' '); - while (flags2.is_letter) { - flags2 = _get_flags(++pos); + while (categ2.is_L()) { + categ2 = _get_categ(++pos); } _add_token(pos); continue; } // regex: ?\p{N}+ - if (flags2.is_number) { + if (categ2.is_N()) { pos += (cpt == ' '); - while (flags2.is_number) { - flags2 = _get_flags(++pos); + while (categ2.is_N()) { + categ2 = _get_categ(++pos); } _add_token(pos); continue; } // regex: ?[^\s\p{L}\p{N}]+ - if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { + if (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { pos += (cpt == ' '); - while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { - flags2 = _get_flags(++pos); + while (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { + categ2 = _get_categ(++pos); } _add_token(pos); continue; } size_t num_whitespaces = 0; - while (_get_flags(pos+num_whitespaces).is_whitespace) { + while (_get_categ(pos+num_whitespaces).is_whitespace()) { num_whitespaces++; } @@ -321,8 +322,9 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - auto _get_flags = [&] (const size_t pos) -> codepoint_flags { - return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; + static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; + auto _get_categ = [&] (const size_t pos) -> codepoint_categ { + return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_category(cpts[pos]) : SENTINEL; }; size_t _prev_end = offset_ini; @@ -344,7 +346,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { const uint32_t cpt = _get_cpt(pos); - const auto flags = _get_flags(pos); + const auto categ = _get_categ(pos); // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive if (cpt == '\'' && pos+1 < offset_end) { @@ -365,10 +367,10 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: [^\r\n\p{L}\p{N}]?\p{L}+ - if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) { - if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters + if (!(cpt == '\r' || cpt == '\n' || categ.is_N())) { + if (categ.is_L() || _get_categ(pos+1).is_L()) { // one or more letters pos++; - while (_get_flags(pos).is_letter) { + while (_get_categ(pos).is_L()) { pos++; } _add_token(pos); @@ -377,9 +379,9 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: \p{N}{1,3} - if (flags.is_number) { + if (categ.is_N()) { size_t ini = pos; - while (_get_flags(pos).is_number) { + while (_get_categ(pos).is_N()) { if (++pos - ini >= 3 ) { _add_token(pos); ini = pos; @@ -390,11 +392,11 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: ?[^\s\p{L}\p{N}]+[\r\n]* - auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); - if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) { + auto categ2 = (cpt == ' ' ? 
_get_categ(pos+1) : categ); + if (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { pos += (cpt == ' '); - while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { - flags2 = _get_flags(++pos); + while (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { + categ2 = _get_categ(++pos); } uint32_t cpt2 = _get_cpt(pos); while (cpt2 == '\r' || cpt2 == '\n') { @@ -406,7 +408,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & size_t num_whitespaces = 0; size_t last_end_r_or_n = 0; - while (_get_flags(pos+num_whitespaces).is_whitespace) { + while (_get_categ(pos+num_whitespaces).is_whitespace()) { uint32_t cpt2 = _get_cpt(pos+num_whitespaces); if (cpt2 == '\r' || cpt2 == '\n') { last_end_r_or_n = pos + num_whitespaces + 1; @@ -636,21 +638,21 @@ uint32_t unicode_tolower(uint32_t cp) { std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { // unicode categories static const std::map k_ucat_enum = { - { "\\p{N}", codepoint_flags::NUMBER }, - { "\\p{L}", codepoint_flags::LETTER }, - { "\\p{P}", codepoint_flags::PUNCTUATION }, + { "\\p{N}", codepoint_categ::N }, + { "\\p{L}", codepoint_categ::L }, + { "\\p{P}", codepoint_categ::P }, }; static const std::map k_ucat_cpt = { - { codepoint_flags::NUMBER, 0xD1 }, - { codepoint_flags::LETTER, 0xD2 }, - { codepoint_flags::PUNCTUATION, 0xD3 }, + { codepoint_categ::N, 0xD1 }, + { codepoint_categ::L, 0xD2 }, + { codepoint_categ::P, 0xD3 }, }; static const std::map k_ucat_map = { - { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9 - { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z - { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + { codepoint_categ::N, "\x30-\x39" }, // 0-9 + { codepoint_categ::L, "\x41-\x5A\x61-\x7A" }, // A-Za-z + { codepoint_categ::P, 
"\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} }; // compute collapsed codepoints only if needed by at least one regex @@ -681,14 +683,14 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const auto flags = unicode_cpt_flags(cpts[i]); + const auto categ = unicode_cpt_category(cpts[i]); - if (flags.is_whitespace) { + if (categ.is_whitespace()) { //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does. //text_collapsed[i] = (char) 0x85; // as whitespace fallback text_collapsed[i] = (char) 0x0B; // as whitespace fallback - } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); + } else if (k_ucat_cpt.find(categ.get_category()) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(categ.get_category()); } else { text_collapsed[i] = (char) 0xD0; // fallback } @@ -777,7 +779,7 @@ std::vector unicode_regex_split(const std::string & text, const std // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback std::wstring wtext(cpts.begin(), cpts.end()); for (size_t i = 0; i < wtext.size(); ++i) { - if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) { + if (wtext[i] > 0x7F && unicode_cpt_category(wtext[i]).is_whitespace()) { wtext[i] = 0x0B; } } diff --git a/src/unicode.h b/src/unicode.h index e8928f261445d..339ef2893a4e9 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -121,6 +121,14 @@ struct codepoint_categ { inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; } inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; } + inline bool operator == (const codepoint_categ other) const { + return encoded == other.encoded; + } + + inline bool operator != (const codepoint_categ other) const { + return encoded != other.encoded; + } + const char * c_str() const { static const std::map map = { {UNDEF, "UNDEF"}, {C, "C"}, {L, "L"}, {M, "M"}, 
{N, "N"}, {P, "P"}, {S, "S"}, {Z, "Z"}, From ecebfc0c718c81b7f0d6d6552e81c71fe9bf2053 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Fri, 26 Jul 2024 00:16:24 +0200 Subject: [PATCH 07/29] Update unicode data: sorted whitespaces --- scripts/gen-unicode-data.py | 2 +- src/unicode-data.cpp | 3 +-- src/unicode-data.h | 3 +-- src/unicode.cpp | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 542a9edbac582..d774fcabe9481 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -170,7 +170,7 @@ def out(line=""): out("0x%04X," % rle) out("};\n") -out("const std::unordered_set unicode_set_whitespace = {") +out("const std::vector unicode_vec_whitespace = {") for codepoint in table_whitespace: out("0x%06X," % codepoint) out("};\n") diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 4a0c0547c7d03..2591723ce3172 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -5,7 +5,6 @@ #include #include #include -#include const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length 0x03E1, @@ -4527,7 +4526,7 @@ const std::vector unicode_rle_codepoints_categs = { // run length enc 0x0020, }; -const std::unordered_set unicode_set_whitespace = { +const std::vector unicode_vec_whitespace = { 0x000009, 0x00000A, 0x00000B, diff --git a/src/unicode-data.h b/src/unicode-data.h index cd6a6451a278f..682f79c373749 100644 --- a/src/unicode-data.h +++ b/src/unicode-data.h @@ -3,7 +3,6 @@ #include #include #include -#include struct range_nfd { uint32_t first; @@ -14,7 +13,7 @@ struct range_nfd { static const uint32_t MAX_CODEPOINTS = 0x110000; extern const std::vector unicode_rle_codepoints_categs; -extern const std::unordered_set unicode_set_whitespace; +extern const std::vector unicode_vec_whitespace; extern const std::unordered_map unicode_map_lowercase; extern const std::unordered_map unicode_map_uppercase; extern const 
std::vector unicode_ranges_nfd; diff --git a/src/unicode.cpp b/src/unicode.cpp index 4c33359743dcc..dd413c8092a54 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -591,7 +591,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { } assert (cpt == MAX_CODEPOINTS); - for (auto cpt : unicode_set_whitespace) { + for (auto cpt : unicode_vec_whitespace) { cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); } From 8c8e1afaaece651ef50d926d8fcf997dc98c8263 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Fri, 26 Jul 2024 00:18:16 +0200 Subject: [PATCH 08/29] Fix codepoint_categ return types --- src/unicode.h | 86 +++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/src/unicode.h b/src/unicode.h index 339ef2893a4e9..4ea8f19472ebc 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -73,53 +73,53 @@ struct codepoint_categ { encoded = value ? (encoded | flags) : (encoded & ~flags); } - inline uint8_t get_category() const { return encoded & MASK; } - inline uint8_t get_subcategory() const { return encoded & SUBMASK; } + inline uint16_t get_category() const { return encoded & MASK; } + inline uint16_t get_subcategory() const { return encoded & SUBMASK; } inline bool is_undefined() const { return !encoded; } inline bool is_defined() const { return encoded; } - inline auto is_whitespace() const { return encoded & WHITESPACE; } - inline auto is_lowercase() const { return encoded & LOWERCASE; } - inline auto is_uppercase() const { return encoded & UPPERCASE; } - - inline auto is_C() const { return encoded & C; } - inline auto is_L() const { return encoded & L; } - inline auto is_M() const { return encoded & M; } - inline auto is_N() const { return encoded & N; } - inline auto is_P() const { return encoded & P; } - inline auto is_S() const { return encoded & S; } - inline auto is_Z() const { return encoded & Z; } - - inline auto is_Cc() const { return (encoded & SUBMASK) == Cc; } - inline auto is_Cf() const { 
return (encoded & SUBMASK) == Cf; } - inline auto is_Co() const { return (encoded & SUBMASK) == Co; } - inline auto is_Cs() const { return (encoded & SUBMASK) == Cs; } - inline auto is_Ll() const { return (encoded & SUBMASK) == Ll; } - inline auto is_Lm() const { return (encoded & SUBMASK) == Lm; } - inline auto is_Lo() const { return (encoded & SUBMASK) == Lo; } - inline auto is_Lt() const { return (encoded & SUBMASK) == Lt; } - inline auto is_Lu() const { return (encoded & SUBMASK) == Lu; } - inline auto is_Mc() const { return (encoded & SUBMASK) == Mc; } - inline auto is_Me() const { return (encoded & SUBMASK) == Me; } - inline auto is_Mn() const { return (encoded & SUBMASK) == Mn; } - inline auto is_Nd() const { return (encoded & SUBMASK) == Nd; } - inline auto is_Nl() const { return (encoded & SUBMASK) == Nl; } - inline auto is_No() const { return (encoded & SUBMASK) == No; } - inline auto is_Pc() const { return (encoded & SUBMASK) == Pc; } - inline auto is_Pd() const { return (encoded & SUBMASK) == Pd; } - inline auto is_Pe() const { return (encoded & SUBMASK) == Pe; } - inline auto is_Pf() const { return (encoded & SUBMASK) == Pf; } - inline auto is_Pi() const { return (encoded & SUBMASK) == Pi; } - inline auto is_Po() const { return (encoded & SUBMASK) == Po; } - inline auto is_Ps() const { return (encoded & SUBMASK) == Ps; } - inline auto is_Sc() const { return (encoded & SUBMASK) == Sc; } - inline auto is_Sk() const { return (encoded & SUBMASK) == Sk; } - inline auto is_Sm() const { return (encoded & SUBMASK) == Sm; } - inline auto is_So() const { return (encoded & SUBMASK) == So; } - inline auto is_Zl() const { return (encoded & SUBMASK) == Zl; } - inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; } - inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; } + inline uint16_t is_whitespace() const { return encoded & WHITESPACE; } + inline uint16_t is_lowercase() const { return encoded & LOWERCASE; } + inline uint16_t is_uppercase() 
const { return encoded & UPPERCASE; } + + inline uint16_t is_C() const { return encoded & C; } + inline uint16_t is_L() const { return encoded & L; } + inline uint16_t is_M() const { return encoded & M; } + inline uint16_t is_N() const { return encoded & N; } + inline uint16_t is_P() const { return encoded & P; } + inline uint16_t is_S() const { return encoded & S; } + inline uint16_t is_Z() const { return encoded & Z; } + + inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; } + inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; } + inline bool is_Co() const { return (encoded & SUBMASK) == Co; } + inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; } + inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; } + inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; } + inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; } + inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; } + inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; } + inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; } + inline bool is_Me() const { return (encoded & SUBMASK) == Me; } + inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; } + inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; } + inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; } + inline bool is_No() const { return (encoded & SUBMASK) == No; } + inline bool is_Pc() const { return (encoded & SUBMASK) == Pc; } + inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; } + inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; } + inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; } + inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; } + inline bool is_Po() const { return (encoded & SUBMASK) == Po; } + inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; } + inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; } + inline bool is_Sk() const { return (encoded & 
SUBMASK) == Sk; } + inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; } + inline bool is_So() const { return (encoded & SUBMASK) == So; } + inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; } + inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; } + inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; } inline bool operator == (const codepoint_categ other) const { return encoded == other.encoded; From 8f7d56ec5b0c4fc5c42265f6071f6fb6977ec856 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Fri, 26 Jul 2024 00:26:42 +0200 Subject: [PATCH 09/29] Add unicode_data helper functions --- src/unicode.h | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/unicode.h b/src/unicode.h index 4ea8f19472ebc..0b8243ccd4ac9 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include #include @@ -61,13 +63,6 @@ struct codepoint_categ { inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {} - static codepoint_categ from_index(int index) { - static const std::array table = { - UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF - }; - return (size_t)index < table.size() ? table[index] : table[0]; - } - inline void set_flag(_flags flags, bool value = true) { flags = (_flags) (flags & ~SUBMASK); // ignore category bits encoded = value ? (encoded | flags) : (encoded & ~flags); @@ -141,6 +136,34 @@ struct codepoint_categ { return it == map.end() ? "INVALID" : it->second; } + static codepoint_categ from_index(int index) { + static const std::array table = { + UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF + }; + return (size_t)index < table.size() ? 
table[index] : table[0]; + } + + static codepoint_categ from_chars(const char categ, const char subcateg = '\0') { + auto _subindex = [] (const char subcateg, const char subcategs[]) -> uint16_t { + if (!subcateg) { + return 0; + } + const char * p = strchr(subcategs, subcateg); + return p ? (p - subcategs + 1) : 0; + }; + switch(categ) { + case 'C': if(subcateg == 'n') return 0; // undefined + return C | (_subindex(subcateg, "cfos" ) << 7); + case 'L': return L | (_subindex(subcateg, "lmotu" ) << 7); + case 'M': return M | (_subindex(subcateg, "cen" ) << 7); + case 'N': return N | (_subindex(subcateg, "dlo" ) << 7); + case 'P': return P | (_subindex(subcateg, "cdefios") << 7); + case 'S': return S | (_subindex(subcateg, "ckmo" ) << 7); + case 'Z': return Z | (_subindex(subcateg, "lps" ) << 7); + default: assert (false); return 0; + } + }; + uint16_t encoded; }; From 1cd7ac090b08765133b5de1bacf0b58b66738d0f Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Fri, 26 Jul 2024 00:43:43 +0200 Subject: [PATCH 10/29] Reimplement 'collapsed' unicode categories: - Add all unicode categories. - Fix \s with non-ASCII problem. 
--- src/unicode.cpp | 395 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 274 insertions(+), 121 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index dd413c8092a54..68cadf0c49075 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -636,67 +636,39 @@ uint32_t unicode_tolower(uint32_t cp) { } std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { - // unicode categories - static const std::map k_ucat_enum = { - { "\\p{N}", codepoint_categ::N }, - { "\\p{L}", codepoint_categ::L }, - { "\\p{P}", codepoint_categ::P }, - }; + //TODO: update and add more comments + // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte + // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 - static const std::map k_ucat_cpt = { - { codepoint_categ::N, 0xD1 }, - { codepoint_categ::L, 0xD2 }, - { codepoint_categ::P, 0xD3 }, + // 0xDB80 to 0xDBFF: Private Use High Surrogate (128 range values) + static const uint32_t COLLAPSE_CPT_RANGE_FIRST = 0xDB80; + static const uint32_t COLLAPSE_CPT_RANGE_LAST = 0xDBFF; + auto category_to_collapsed_cpt = [] (const codepoint_categ categ) { + const uint16_t subindex = categ.get_subcategory() >> 7; // subcategory stored in 3 bits + switch(categ.get_category()) { // category fits in other 3 bits + case codepoint_categ::UNDEF: return COLLAPSE_CPT_RANGE_FIRST + ((0 << 3) | subindex); + case codepoint_categ::C: return COLLAPSE_CPT_RANGE_FIRST + ((1 << 3) | subindex); + case codepoint_categ::L: return COLLAPSE_CPT_RANGE_FIRST + ((2 << 3) | subindex); + case codepoint_categ::M: return COLLAPSE_CPT_RANGE_FIRST + ((3 << 3) | subindex); + case codepoint_categ::N: return COLLAPSE_CPT_RANGE_FIRST + ((4 << 3) | subindex); + case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); + case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex); + case codepoint_categ::Z: return 
COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); + default: assert (false); return COLLAPSE_CPT_RANGE_FIRST; + } }; - - static const std::map k_ucat_map = { - { codepoint_categ::N, "\x30-\x39" }, // 0-9 - { codepoint_categ::L, "\x41-\x5A\x61-\x7A" }, // A-Za-z - { codepoint_categ::P, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + auto category_to_collapsed_range = [&] (const codepoint_categ categ) { + // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes + // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes + // \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no + assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); + const uint32_t collapsed = category_to_collapsed_cpt(categ); + const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ? + return std::pair(collapsed, collapsed + range); }; - // compute collapsed codepoints only if needed by at least one regex - bool need_collapse = false; - for (auto & regex_expr : regex_exprs) { - // search for unicode categories - for (const auto & ucat : k_ucat_enum) { - if (std::string::npos != regex_expr.find(ucat.first)) { - need_collapse = true; - break; - } - } - } - const auto cpts = unicode_cpts_from_utf8(text); - // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte - // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 - std::string text_collapsed; - if (need_collapse) { - // collapse all unicode categories - text_collapsed.resize(cpts.size()); - - for (size_t i = 0; i < cpts.size(); ++i) { - // keep single-byte codepoints as is - if (cpts[i] < 128) { - text_collapsed[i] = cpts[i]; - continue; - } - - const auto categ = unicode_cpt_category(cpts[i]); - - if (categ.is_whitespace()) { - //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does. 
- //text_collapsed[i] = (char) 0x85; // as whitespace fallback - text_collapsed[i] = (char) 0x0B; // as whitespace fallback - } else if (k_ucat_cpt.find(categ.get_category()) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(categ.get_category()); - } else { - text_collapsed[i] = (char) 0xD0; // fallback - } - } - } - std::vector bpe_offsets = { cpts.size() }; for (auto & regex_expr : regex_exprs) { @@ -708,91 +680,272 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - // fallback to general-purpose std::regex / std::wregex - try { - // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category - // with the corresponding collapsed representation - bool use_collapsed = false; - for (auto & ucat : k_ucat_enum) { - if (std::string::npos != regex_expr.find(ucat.first)) { - use_collapsed = true; - break; - } + std::vector> regex_expr_ranges; // start codepoint, last codepoint + std::vector> regex_expr_categs; // offset, codepoint category + std::map map_categ_wregex; // categ --> regex utf32 string + std::wstring wregex_collapsed; + std::wstring wtext_collapsed; + bool inside_square = false; + bool is_cpt_range = false; + + // common ranges: \w \d + regex_expr_ranges.emplace_back('a', 'z'); + regex_expr_ranges.emplace_back('A', 'Z'); + regex_expr_ranges.emplace_back('0', '9'); + regex_expr_ranges.emplace_back('_', '_'); + // common ranges: \s + for (uint32_t cpt : unicode_vec_whitespace) { + const auto categ_prev = unicode_cpt_category(regex_expr_ranges.back().second); + const auto categ_last = unicode_cpt_category(cpt); + if (categ_prev == categ_last && regex_expr_ranges.back().second + 1 == cpt) { + regex_expr_ranges.back().second = cpt; + } else { + regex_expr_ranges.emplace_back(cpt, cpt); } + } - if (use_collapsed) { - // sanity-check that the original regex does not contain any non-ASCII characters - const auto cpts_regex = unicode_cpts_from_utf8(regex_expr); - for (size_t i = 
0; i < cpts_regex.size(); ++i) { - if (cpts_regex[i] >= 128) { - throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported"); + // std::wregex \s does not match non-ASCII whitespaces + static const codepoint_categ categ_whitespace(codepoint_categ::MASK + 1); // UNDEF category, subcategory 1 + std::wstring & wregex_whitespaces = map_categ_wregex[categ_whitespace.get_subcategory()]; + wregex_whitespaces += L"\\s"; + for (uint32_t cpt : unicode_vec_whitespace) { + if (cpt >= 0x80) { // non-ASCII whitespaces + if (wregex_whitespaces.back() + 1 == cpt) { + if (*(wregex_whitespaces.end() - 2) == '-') { + wregex_whitespaces.back() = cpt; + } else { + wregex_whitespaces += '-'; + wregex_whitespaces += cpt; } + } else { + wregex_whitespaces += cpt; } + } + } - // generate a collapsed representation of the regex - std::string regex_expr_collapsed; + const auto cpts_regex = unicode_cpts_from_utf8(regex_expr); - // track if we are inside [], because nested [] are not allowed - bool inside = false; - for (size_t i = 0; i < regex_expr.size(); ++i) { - if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) { - regex_expr_collapsed += '['; - inside = true; - continue; - } + for (size_t i = 0; i < cpts_regex.size(); ++i) { + uint32_t cpt = cpts_regex[i]; - if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') { - regex_expr_collapsed += ']'; - inside = false; + if (inside_square) { + switch(cpt) { + case '^': + if (cpts_regex[i - 1] != '[') { + break; + } continue; - } - - if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() && - regex_expr[i + 1] == 'p' && - regex_expr[i + 2] == '{' && - regex_expr[i + 4] == '}') { - const std::string pat = regex_expr.substr(i, 5); - if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { - if (!inside) { - regex_expr_collapsed += '['; - } - regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat)); - regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat)); - if (!inside) { 
- regex_expr_collapsed += ']'; + case ']': + inside_square = false; + continue; + case '-': + is_cpt_range = true; + continue; + } + } else { + switch(cpt) { + case '^': + if (i > 0) { + break; + } + continue; + case '$': + if (i + 1 < cpts_regex.size()) { + break; + } + continue; + case '[': + inside_square = true; + continue; + case '{': + while (cpt && cpt != '}') { + cpt = cpts_regex[++i]; + } + continue; + case '}': + case ']': + assert (false); + case '(': + if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (? prev_range = {0, -1}; + std::sort(regex_expr_ranges.begin(), regex_expr_ranges.end()); + for (auto range : regex_expr_ranges) { + range.first = std::max(range.first, prev_range.second + 1); // prevent overlapping //TODO: as error? + if (range.first > range.second) { // skip overlapping and repetitions + continue; + } + codepoint_categ categ = unicode_cpt_category(range.first); + assert (categ == unicode_cpt_category(range.second)); + auto it0 = map_categ_wregex.find(categ.get_category()); + auto it1 = map_categ_wregex.find(categ.get_subcategory()); + for (const auto & it : {it0, it1}) { + if (it != map_categ_wregex.end()) { + it->second += (wchar_t) range.first; + if (range.first < range.second) { + it->second += (wchar_t) '-'; + it->second += (wchar_t) range.second; + } + } + } + prev_range = range; + regex_expr_ranges[regex_expr_ranges_uniques++] = range; + } + regex_expr_ranges.resize(regex_expr_ranges_uniques); + + // replace categories with respective collapsed codepoint and ranges + uint32_t i = 0; + wregex_collapsed.reserve(regex_expr.size()); + for (auto offset_categ : regex_expr_categs) { + while (i < offset_categ.first) { // copy original regex until reaching the category + wregex_collapsed += (wchar_t) cpts_regex[i]; + i++; + } + assert (cpts_regex[i] == '\\'); + const uint32_t cpt_next = cpts_regex[i + 1]; + const bool is_negated = cpt_next < 'a'; // is uppercase + if (cpt_next == 'p' || cpt_next == 'P') { + assert (cpts_regex[i + 2] 
== '{' && cpts_regex[i + 3]); + i += cpts_regex[i + 4] == '}' ? 5 : 6; + assert (cpts_regex[i - 1] == '}'); } else { - // no unicode category used, we can use std::wregex directly - const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); - - // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback - std::wstring wtext(cpts.begin(), cpts.end()); - for (size_t i = 0; i < wtext.size(); ++i) { - if (wtext[i] > 0x7F && unicode_cpt_category(wtext[i]).is_whitespace()) { - wtext[i] = 0x0B; + assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d + cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D + i += 2; + } + const codepoint_categ categ = offset_categ.second; + auto it = map_categ_wregex.find(categ.get_subcategory()); + assert (it != map_categ_wregex.end()); + if (it != map_categ_wregex.end()) { + if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE + assert (is_negated == false); + wregex_collapsed += it->second; + } else if(it->second.size() == 1 && !is_negated) { + wregex_collapsed += it->second; + } else { + wregex_collapsed += '['; + if (is_negated) { + wregex_collapsed += '^'; } + wregex_collapsed += it->second; + wregex_collapsed += ']'; } + } + } + while (i < (uint32_t)cpts_regex.size()) { + wregex_collapsed += cpts_regex[i]; + i++; + } - //printf("text: %s\n", text.c_str()); - //printf("regex_expr: %s\n", regex_expr.c_str()); - bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); + // collapse text codepoints not included in 'regex_expr_ranges' + wtext_collapsed.reserve(cpts.size()); + for (uint32_t cpt : cpts) { + const codepoint_categ categ = unicode_cpt_category(cpt); + auto it = std::lower_bound(regex_expr_ranges.begin(), regex_expr_ranges.end(), cpt, + [] (const std::pair range, const uint32_t cpt) { + return range.second < cpt; + } + ); + if (it == regex_expr_ranges.end() || cpt < it->first || it->second < cpt) { + cpt = 
category_to_collapsed_cpt(categ); // not found, collapse to category codepoint } - } catch (std::regex_error & e) { - fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str()); - fprintf(stderr, "Regex error: %s\n", e.what()); - throw std::runtime_error("Failed to process regex"); + wtext_collapsed += (wchar_t) cpt; } + + bpe_offsets = unicode_regex_split_stl(wtext_collapsed, wregex_collapsed, bpe_offsets); } std::vector bpe_words; From aeac3421329ef2332fc08b2805a629a42f1bde6b Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sun, 4 Aug 2024 23:22:56 +0200 Subject: [PATCH 11/29] Add more comments --- src/unicode.cpp | 59 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 68cadf0c49075..f5d1496488a12 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -636,13 +636,47 @@ uint32_t unicode_tolower(uint32_t cp) { } std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { - //TODO: update and add more comments - // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte - // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 + // std::regex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ... + // std::regex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. + // Generate a "collapsed" representation of the regex, where all unicode categories are replaced by codepoints ranges. + // Generate a "collapsed" representation of the text, where all codepoints are forced to fall into generated category ranges. + // Text codepoints not found in generated category ranges are replaced by a "collapsed" codepoint. 
+ // This implementation generalizes the original implementation adding support for unicode subcategories: + // https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 + + // Definitions: + // - Unicode category: high unicode categories, \p{C}, \p{L}, \p{M}, \p{N}, \p{P}, \p{S}, \p{Z}. + // - Unicode subcategory: including all unicode categories, \p{Cc}, \p{Cf}, \p{Co}, \p{Cs}, ..., \p{Zs}. + // - Collapsed codepoint: unused codepoint representing a unicode subcategory. + // - Collapsed range: sequence of "collapsed" codepoints, representing one unicode category. + // - Collapsed regex: original regex including "collapsed" codepoints and ranges. + + // (1) Build the "collapsed" regex: + // (1.1) Generate a replacement list of codepoint ranges: + // (1.1.1) For each unicode category. + // (1.1.2) For each unicode subcategory. + // (1.1.3) Expand \s adding unicode whitespaces. + // (1.2) Each list includes its respective "collapsed" codepoint/range. + // (1.3) [Optimization] Only build lists of categories present in the regex. + // (1.4) Build the "collapsed" regex replacing categories and subcategories by these "collapsed" lists. + // (2) Build a list of codepoint ranges. + // (2.1) If a codepoint is not found in this list, then it is "collapsible". + // (2.2) [Optimization] Only build lists of ranges present in the regex. + // (3) For each input text: + // (3.1) Search codepoints in the regex codepoint ranges. + // (3.2) If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy. + // (3.3) If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it. + + //TODO: Refactor optimizations + // Steps (1) and (2) only depend on the regex expression text. + // Step (3) needs 'regex_expr_ranges' for text "collapsing" and 'wregex_collapsed'. + // Optimization: store and reuse 'wregex_collapsed' and 'regex_expr_ranges'.
// 0xDB80 to 0xDBFF: Private Use High Surrogate (128 range values) static const uint32_t COLLAPSE_CPT_RANGE_FIRST = 0xDB80; static const uint32_t COLLAPSE_CPT_RANGE_LAST = 0xDBFF; + + // return the collapsed codepoint of an unicode category or subcategory auto category_to_collapsed_cpt = [] (const codepoint_categ categ) { const uint16_t subindex = categ.get_subcategory() >> 7; // subcategory stored in 3 bits switch(categ.get_category()) { // category fits in other 3 bits @@ -657,6 +691,8 @@ std::vector unicode_regex_split(const std::string & text, const std default: assert (false); return COLLAPSE_CPT_RANGE_FIRST; } }; + + // return the collapsed range of an unicode category (range including all subcategories) auto category_to_collapsed_range = [&] (const codepoint_categ categ) { // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes @@ -688,11 +724,14 @@ std::vector unicode_regex_split(const std::string & text, const std bool inside_square = false; bool is_cpt_range = false; + // (2) Build a list of codepoint ranges // common ranges: \w \d regex_expr_ranges.emplace_back('a', 'z'); regex_expr_ranges.emplace_back('A', 'Z'); regex_expr_ranges.emplace_back('0', '9'); regex_expr_ranges.emplace_back('_', '_'); + + // (2) Build a list of codepoint ranges // common ranges: \s for (uint32_t cpt : unicode_vec_whitespace) { const auto categ_prev = unicode_cpt_category(regex_expr_ranges.back().second); @@ -704,6 +743,7 @@ std::vector unicode_regex_split(const std::string & text, const std } } + // (1.1.3) Expand \s adding unicode whitespaces. 
// std::wregex \s does not match non-ASCII whitespaces static const codepoint_categ categ_whitespace(codepoint_categ::MASK + 1); // UNDEF category, subcategory 1 std::wstring & wregex_whitespaces = map_categ_wregex[categ_whitespace.get_subcategory()]; @@ -728,6 +768,7 @@ std::vector unicode_regex_split(const std::string & text, const std for (size_t i = 0; i < cpts_regex.size(); ++i) { uint32_t cpt = cpts_regex[i]; + // skip regex metacharacters if (inside_square) { switch(cpt) { case '^': @@ -788,6 +829,7 @@ std::vector unicode_regex_split(const std::string & text, const std } } + // parse unicode categories and subcategories if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') { assert (cpts_regex[i + 3] && cpts_regex[i + 4]); codepoint_categ categ = {}; @@ -797,6 +839,7 @@ std::vector unicode_regex_split(const std::string & text, const std categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); assert (cpts_regex[i + 5] == '}'); } + // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' regex_expr_categs.emplace_back(i, categ); i += cpts_regex[i + 4] == '}' ? 4 : 5; @@ -805,6 +848,7 @@ std::vector unicode_regex_split(const std::string & text, const std if (cpt == '\\') { if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') { // \s \S + // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. 
regex_expr_categs.emplace_back(i, categ_whitespace); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square); @@ -813,6 +857,7 @@ std::vector unicode_regex_split(const std::string & text, const std } } + // parse more metcharacters and espaped characters if (cpt == '\\') { switch (cpts_regex[i + 1]) { case 's': ++i; continue; // \s whitespaces @@ -835,8 +880,10 @@ std::vector unicode_regex_split(const std::string & text, const std } } + // ensure there is not a collission with any "collapsed" codepoints assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); + // (2) Build a list of codepoint ranges if (is_cpt_range) { is_cpt_range = false; regex_expr_ranges.back().second = cpt; @@ -850,6 +897,7 @@ std::vector unicode_regex_split(const std::string & text, const std const uint16_t subcateg = offset_categ.second.get_subcategory(); auto it = map_categ_wregex.find(subcateg); if (it == map_categ_wregex.end()) { + // (1.2) Each list includes its respective "collaped" codepoint/range. const auto collapsed_range = category_to_collapsed_range(offset_categ.second); map_categ_wregex[subcateg] = (wchar_t) collapsed_range.first; if (collapsed_range.first < collapsed_range.second) { @@ -868,6 +916,7 @@ std::vector unicode_regex_split(const std::string & text, const std if (range.first > range.second) { // skip overlapping and repetitions continue; } + // (1.1) Generate a replacement list of codepoint ranges codepoint_categ categ = unicode_cpt_category(range.first); assert (categ == unicode_cpt_category(range.second)); auto it0 = map_categ_wregex.find(categ.get_category()); @@ -906,6 +955,7 @@ std::vector unicode_regex_split(const std::string & text, const std cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D i += 2; } + // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists. 
const codepoint_categ categ = offset_categ.second; auto it = map_categ_wregex.find(categ.get_subcategory()); assert (it != map_categ_wregex.end()); @@ -934,14 +984,17 @@ std::vector unicode_regex_split(const std::string & text, const std wtext_collapsed.reserve(cpts.size()); for (uint32_t cpt : cpts) { const codepoint_categ categ = unicode_cpt_category(cpt); + // (3.1) Search codepoints in the regex codepoint ranges. auto it = std::lower_bound(regex_expr_ranges.begin(), regex_expr_ranges.end(), cpt, [] (const std::pair range, const uint32_t cpt) { return range.second < cpt; } ); if (it == regex_expr_ranges.end() || cpt < it->first || it->second < cpt) { + // (3.3) If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it. cpt = category_to_collapsed_cpt(categ); // not found, collapse to category codepoint } + // (3.2) If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy. wtext_collapsed += (wchar_t) cpt; } From 85c59df9ce6c560fbd01b42be91b366f6e79a9e3 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 20:52:25 +0200 Subject: [PATCH 12/29] minor: remove trailing whitespaces and extra semicolons --- src/unicode.cpp | 4 ++-- src/unicode.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 5a2c9bb8a4e51..7cd479450b8af 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -864,7 +864,7 @@ std::vector unicode_regex_split(const std::string & text, const std } // parse more metcharacters and espaped characters - if (cpt == '\\') { + if (cpt == '\\') { switch (cpts_regex[i + 1]) { case 's': ++i; continue; // \s whitespaces case 'w': ++i; continue; // \w words @@ -933,7 +933,7 @@ std::vector unicode_regex_split(const std::string & text, const std if (range.first < range.second) { it->second += (wchar_t) '-'; it->second += (wchar_t) range.second; - } + } } } prev_range = range; diff --git a/src/unicode.h b/src/unicode.h index 
536e80ef16693..75cdb3f4a596f 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -162,7 +162,7 @@ struct codepoint_categ { case 'Z': return Z | (_subindex(subcateg, "lps" ) << 7); default: assert (false); return 0; } - }; + } uint16_t encoded; }; From 735105edf9290566d50c194f76408f424b4dad6f Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 20:54:30 +0200 Subject: [PATCH 13/29] Use GGML_ASSERT and GGML_ABORT --- src/unicode.cpp | 54 ++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 7cd479450b8af..a5a377b39f567 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -2,10 +2,10 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#include "ggml.h" #include "unicode.h" #include "unicode-data.h" -#include #include #include #include @@ -201,7 +201,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t for (auto offset : offsets) { const size_t offset_ini = start; const size_t offset_end = start + offset; - assert(offset_end <= cpts.size()); + GGML_ASSERT(offset_end <= cpts.size()); start = offset_end; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; @@ -216,7 +216,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t size_t _prev_end = offset_ini; auto _add_token = [&] (const size_t end) -> size_t { - assert(_prev_end <= end && end <= offset_end); + GGML_ASSERT(_prev_end <= end && end <= offset_end); size_t len = end - _prev_end; if (len > 0) { bpe_offsets.push_back(len); @@ -320,7 +320,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & for (auto offset : offsets) { const size_t offset_ini = start; const size_t offset_end = start + offset; - assert(offset_end <= cpts.size()); + GGML_ASSERT(offset_end <= cpts.size()); start = offset_end; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; @@ -335,7 +335,7 @@ static std::vector unicode_regex_split_custom_llama3(const 
std::string & size_t _prev_end = offset_ini; auto _add_token = [&] (const size_t end) -> size_t { - assert(_prev_end <= end && end <= offset_end); + GGML_ASSERT(_prev_end <= end && end <= offset_end); size_t len = end - _prev_end; if (len > 0) { bpe_offsets.push_back(len); @@ -595,7 +595,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { cpt_categs[cpt++] = categ; } } - assert (cpt == MAX_CODEPOINTS); + GGML_ASSERT(cpt == MAX_CODEPOINTS); for (auto cpt : unicode_vec_whitespace) { cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); @@ -694,7 +694,7 @@ std::vector unicode_regex_split(const std::string & text, const std case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex); case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); - default: assert (false); return COLLAPSE_CPT_RANGE_FIRST; + default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST; } }; @@ -703,7 +703,7 @@ std::vector unicode_regex_split(const std::string & text, const std // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes // \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no - assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); + GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); const uint32_t collapsed = category_to_collapsed_cpt(categ); const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ? return std::pair(collapsed, collapsed + range); @@ -811,17 +811,17 @@ std::vector unicode_regex_split(const std::string & text, const std continue; case '}': case ']': - assert (false); + GGML_ABORT("invalid regex"); case '(': if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (? 
unicode_regex_split(const std::string & text, const std // parse unicode categories and subcategories if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') { - assert (cpts_regex[i + 3] && cpts_regex[i + 4]); + GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]); codepoint_categ categ = {}; if (cpts_regex[i + 4] == '}') { categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]); } else { categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); - assert (cpts_regex[i + 5] == '}'); + GGML_ASSERT(cpts_regex[i + 5] == '}'); } // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' @@ -875,19 +875,19 @@ std::vector unicode_regex_split(const std::string & text, const std case 't': ++i; cpt = '\t'; break; case 'r': ++i; cpt = '\r'; break; case 'n': ++i; cpt = '\n'; break; - case 'x': assert (false); break; //TODO: hex values - case 'u': assert (false); break; //TODO: unicode values - case 'U': assert (false); break; //TODO: unicode values + case 'x': GGML_ABORT("TODO"); break; //TODO: hex values + case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values + case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values default: // escaped character - assert (!is_cpt_range); + GGML_ASSERT(!is_cpt_range); cpt = cpts_regex[++i]; - assert (cpt < 0x80); + GGML_ASSERT(cpt < 0x80); break; } } // ensure there is not a collission with any "collapsed" codepoints - assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); + GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); // (2) Build a list of codepoint ranges if (is_cpt_range) { @@ -924,7 +924,7 @@ std::vector unicode_regex_split(const std::string & text, const std } // (1.1) Generate a replacement list of codepoint ranges codepoint_categ categ = 
unicode_cpt_category(range.first); - assert (categ == unicode_cpt_category(range.second)); + GGML_ASSERT(categ == unicode_cpt_category(range.second)); auto it0 = map_categ_wregex.find(categ.get_category()); auto it1 = map_categ_wregex.find(categ.get_subcategory()); for (const auto & it : {it0, it1}) { @@ -949,25 +949,25 @@ std::vector unicode_regex_split(const std::string & text, const std wregex_collapsed += (wchar_t) cpts_regex[i]; i++; } - assert (cpts_regex[i] == '\\'); + GGML_ASSERT(cpts_regex[i] == '\\'); const uint32_t cpt_next = cpts_regex[i + 1]; const bool is_negated = cpt_next < 'a'; // is uppercase if (cpt_next == 'p' || cpt_next == 'P') { - assert (cpts_regex[i + 2] == '{' && cpts_regex[i + 3]); + GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]); i += cpts_regex[i + 4] == '}' ? 5 : 6; - assert (cpts_regex[i - 1] == '}'); + GGML_ASSERT(cpts_regex[i - 1] == '}'); } else { - assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d - cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D + GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d + cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D i += 2; } // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists. 
const codepoint_categ categ = offset_categ.second; auto it = map_categ_wregex.find(categ.get_subcategory()); - assert (it != map_categ_wregex.end()); + GGML_ASSERT(it != map_categ_wregex.end()); if (it != map_categ_wregex.end()) { if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE - assert (is_negated == false); + GGML_ASSERT(is_negated == false); wregex_collapsed += it->second; } else if(it->second.size() == 1 && !is_negated) { wregex_collapsed += it->second; From fd6d9b9e6a918a1f08f4718cb61fb299b92dd390 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 20:58:15 +0200 Subject: [PATCH 14/29] Update bruteforce test: fix pyright complaints --- tests/test-tokenizer-random.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index c17a1cfbd85a7..f3447d482b989 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -124,8 +124,7 @@ def get_vocab(self, detokenize=False) -> list[str]: text = self.detokenize([id], remove_special=False, unparse_special=True) else: text = self.lib.llama_token_get_text(self.model, id) - text = self.ffi.string(text) - text = str(text, encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + text = str(cast(bytes, self.ffi.string(text)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' vocab.append(text) return vocab @@ -162,12 +161,13 @@ def __init__(self, dir_tokenizer: str): self.eos_token = self.model.eos_token def get_vocab(self, detokenize=False) -> list[str]: + vocab: list[str] = [] max_token_id = max(self.model.get_vocab().values()) if detokenize: ids = list(range(max_token_id + 1)) vocab = self.model.batch_decode(ids, skip_special_tokens=False) else: - vocab = [None] * (max_token_id + 1) + vocab = [""] * (max_token_id + 1) for text, id in self.model.get_vocab().items(): vocab[id] = text return vocab @@ -455,14 +455,6 @@ def 
generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100 def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]): - def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str): - for i, (a, b) in enumerate(zip(ids1, ids2)): - if a != b: - return i - if len(ids1) == len(ids2): - return -1 - return min(len(ids1), len(ids2)) - def check_detokenizer(text: str, text1: str, text2: str) -> bool: if text1 == text2: # equal to TokenizerGroundtruth? return True From 3b36703c8afd32966ab80dd861b1985a929df79f Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 21:10:45 +0200 Subject: [PATCH 15/29] Update bruteforce test: - Faster failing text range selection. - Show unique failing texts differences. - Add more recent models. --- tests/test-tokenizer-random.py | 82 ++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index f3447d482b989..4a5773fa535a9 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -472,10 +472,11 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_decode1 = 0 t_decode2 = 0 t_start = time.perf_counter() + total_tests = 0 + failing_texts = set() encode_errors = 0 decode_errors = 0 - total_tests = 0 - MAX_ERRORS = 10 + MAX_ERRORS = 5 logger.info("%s: %s" % (generator.__qualname__, "ini")) for text in generator: @@ -494,13 +495,11 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_encode2 += t2 - t1 t_decode1 += t3 - t2 t_decode2 += t4 - t3 + total_tests += 1 # compare encode_ok = ids1 == ids2 decode_ok = check_detokenizer(text, text1, text2) - encode_errors += not encode_ok - decode_errors += not decode_ok - total_tests += 1 - if (encode_errors < MAX_ERRORS and not encode_ok) or (decode_errors < MAX_ERRORS and not decode_ok): + if not (encode_ok and decode_ok): def _compare(text: 
str): ids1 = tokenizer1.encode(text) ids2 = tokenizer2.encode(text) @@ -510,33 +509,42 @@ def _compare(text: str): decode_ok = check_detokenizer(text, text1, text2) ok = encode_ok and decode_ok return ok, ids1, ids2, text1, text2 + # binary search upper and lower failing range a, b = 0, len(text) - for step in [64, 32, 16, 8, 4, 2, 1]: - while a < b: - t = max(a, b - step) - if _compare(text[a : t])[0]: - break - b = t - for step in [64, 32, 16, 8, 4, 2, 1]: - while a < b: - t = min(a + step, b) - if _compare(text[t : b])[0]: - break - a = t + step = b + while step > 1: + step = step // 2 + if not _compare(text[a : b - step])[0]: + b = b - step + step = b + while step > 1: + step = step // 2 + if not _compare(text[a + step : b])[0]: + a = a + step ok, ids1, ids2, text1, text2 = _compare(text[a : b]) assert a <= b and not ok - logger.error(" Text:" + repr(text[a : b])) - logger.error(" " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text[a : b])) - logger.error(" Expected: " + str(ids1)) - logger.error(" Result: " + str(ids2)) - logger.error(" Expected: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text1)) - logger.error(" Result: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text2)) - logger.error(f" {encode_errors=}") - logger.error(f" {decode_errors=}") - if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: - logger.error(f" EXIT: {encode_errors=} {decode_errors=}") - # raise Exception() - break + # show unique failing texts differences + failing_text = text[a : b] + if failing_text not in failing_texts: + failing_texts.add(failing_text) + if encode_errors < MAX_ERRORS and not encode_ok: + encode_errors += 1 + logger.error(f" {encode_errors=}") + logger.error(" Text:" + repr(failing_text)) + logger.error(" " + " ".join(repr(x) + ":" + hex(ord(x)) for x in failing_text)) + logger.error(" Expected: " + str(ids1)) + logger.error(" Result: " + str(ids2)) + if decode_errors < MAX_ERRORS and not decode_ok: + decode_errors += 1 + logger.error(f" 
{decode_errors=}") + logger.error(" Text:" + repr(failing_text)) + logger.error(" " + " ".join(repr(x) + ":" + hex(ord(x)) for x in failing_text)) + logger.error(" Expected: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text1)) + logger.error(" Result: " + " ".join(repr(x) + ":" + hex(ord(x)) for x in text2)) + if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: + logger.error(f" EXIT: {encode_errors=} {decode_errors=}") + # raise Exception() + break t_total = time.perf_counter() - t_start logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") @@ -635,21 +643,19 @@ def main(argv: list[str] | None = None): "phi-3", # SPM "gemma", # SPM "gemma-2", # SPM - "baichuan", # SPM + # "baichuan", # SPM "bert-bge", # WPM "jina-v2-en", # WPM + # "t5", # UGM "llama-bpe", # BPE "phi-2", # BPE "deepseek-llm", # BPE "deepseek-coder", # BPE "falcon", # BPE - "mpt", # BPE "starcoder", # BPE "gpt-2", # BPE "stablelm2", # BPE "refact", # BPE - "qwen2", # BPE - "olmo", # BPE "jina-v2-es", # BPE "jina-v2-de", # BPE "smaug-bpe", # BPE @@ -657,6 +663,14 @@ def main(argv: list[str] | None = None): "jina-v2-code", # BPE "viking", # BPE "jais", # BPE + "codeshell", # BPE + "tekken", # BPE + "smollm", # BPE + "mpt", # BPE NFC + "command-r", # BPE NFC + "qwen2", # BPE NFC + "olmo", # BPE NFC + "gpt-neox", # BPE NFC ] logger.info("=" * 50) From d558c736fd691adea0a1d36b99d4a802240d3679 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 21:24:13 +0200 Subject: [PATCH 16/29] Binary constants are a C++14 feature --- src/unicode.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index a5a377b39f567..19f55145e117b 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -703,9 +703,9 @@ std::vector unicode_regex_split(const std::string & text, const std // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? 
yes // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes // \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no - GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); + GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0x7) == 0); const uint32_t collapsed = category_to_collapsed_cpt(categ); - const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ? + const uint32_t range = (collapsed & 0x7) ? 0 : 0x7; // has subcategory ? return std::pair(collapsed, collapsed + range); }; From 674f0faa74dc483ed3a1bac1f1f3ec8d0eb7e792 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 21:43:32 +0200 Subject: [PATCH 17/29] Fix copy/paste wrong variable --- src/unicode.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 19f55145e117b..ae36d2b43c828 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -602,11 +602,11 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { } for (auto p : unicode_map_lowercase) { - cpt_categs[cpt].set_flag(codepoint_categ::LOWERCASE); + cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE); } for (auto p : unicode_map_uppercase) { - cpt_categs[cpt].set_flag(codepoint_categ::UPPERCASE); + cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE); } //for (auto &range : unicode_ranges_nfd) { // start, last, nfd From 2ca313830e56d293719040d09733bc574f59923e Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Mon, 5 Aug 2024 23:55:17 +0200 Subject: [PATCH 18/29] Fix compiler complaints --- src/unicode.cpp | 18 ++++++++++-------- src/unicode.h | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index ae36d2b43c828..725476600f2ff 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -694,7 +694,7 @@ std::vector unicode_regex_split(const std::string & text, const std case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 
3) | subindex); case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); - default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST; + default: GGML_ABORT("invalid category"); } }; @@ -709,6 +709,8 @@ std::vector unicode_regex_split(const std::string & text, const std return std::pair(collapsed, collapsed + range); }; + GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t)); + const auto cpts = unicode_cpts_from_utf8(text); std::vector bpe_offsets = { cpts.size() }; @@ -756,7 +758,7 @@ std::vector unicode_regex_split(const std::string & text, const std wregex_whitespaces += L"\\s"; for (uint32_t cpt : unicode_vec_whitespace) { if (cpt >= 0x80) { // non-ASCII whitespaces - if (wregex_whitespaces.back() + 1 == cpt) { + if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) { if (*(wregex_whitespaces.end() - 2) == '-') { wregex_whitespaces.back() = cpt; } else { @@ -764,7 +766,7 @@ std::vector unicode_regex_split(const std::string & text, const std wregex_whitespaces += cpt; } } else { - wregex_whitespaces += cpt; + wregex_whitespaces += (wchar_t) cpt; } } } @@ -847,7 +849,7 @@ std::vector unicode_regex_split(const std::string & text, const std } // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' - regex_expr_categs.emplace_back(i, categ); + regex_expr_categs.emplace_back((uint32_t)i, categ); i += cpts_regex[i + 4] == '}' ? 4 : 5; continue; } @@ -855,7 +857,7 @@ std::vector unicode_regex_split(const std::string & text, const std if (cpt == '\\') { if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') { // \s \S // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. 
- regex_expr_categs.emplace_back(i, categ_whitespace); + regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square); i += 1; @@ -875,9 +877,9 @@ std::vector unicode_regex_split(const std::string & text, const std case 't': ++i; cpt = '\t'; break; case 'r': ++i; cpt = '\r'; break; case 'n': ++i; cpt = '\n'; break; - case 'x': GGML_ABORT("TODO"); break; //TODO: hex values - case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values - case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values + case 'x': GGML_ABORT("TODO"); //TODO: hex values + case 'u': GGML_ABORT("TODO"); //TODO: unicode values + case 'U': GGML_ABORT("TODO"); //TODO: unicode values default: // escaped character GGML_ASSERT(!is_cpt_range); cpt = cpts_regex[++i]; diff --git a/src/unicode.h b/src/unicode.h index 75cdb3f4a596f..8a3f4078ca79b 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -149,7 +149,7 @@ struct codepoint_categ { return 0; } const char * p = strchr(subcategs, subcateg); - return p ? (p - subcategs + 1) : 0; + return (uint16_t) (p ? 
(p - subcategs + 1) : 0); }; switch(categ) { case 'C': if(subcateg == 'n') return 0; // undefined From 80f41234e40d4368960c85d7383c7b6a70cb5eac Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Wed, 7 Aug 2024 23:08:04 +0200 Subject: [PATCH 19/29] Update bruteforce test: fix binary search --- tests/test-tokenizer-random.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 4a5773fa535a9..f7c3b140776c8 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -513,14 +513,16 @@ def _compare(text: str): a, b = 0, len(text) step = b while step > 1: - step = step // 2 - if not _compare(text[a : b - step])[0]: - b = b - step + step = (step + 1) // 2 + t = max(a, b - step) + if not _compare(text[a : t])[0]: + b = t step = b while step > 1: - step = step // 2 - if not _compare(text[a + step : b])[0]: - a = a + step + step = (step + 1) // 2 + t = min(a + step, b) + if not _compare(text[t : b])[0]: + a = t ok, ids1, ids2, text1, text2 = _compare(text[a : b]) assert a <= b and not ok # show unique failing texts differences From 7afe6df6a2290fea257414938a2d39110b9e7a33 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Wed, 7 Aug 2024 23:14:36 +0200 Subject: [PATCH 20/29] Unicode data whitespaces as ranges --- scripts/gen-unicode-data.py | 23 +++++++++++----------- src/unicode-data.cpp | 38 ++++++++++++------------------------- src/unicode-data.h | 2 +- src/unicode.cpp | 6 ++++-- 4 files changed, 28 insertions(+), 41 deletions(-) diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index d774fcabe9481..1528a13db4c80 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -85,7 +85,6 @@ def unicode_data_iter(): codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined -table_whitespace = [] table_lowercase = [] table_uppercase = [] table_nfd = [] @@ -111,19 +110,20 @@ def unicode_data_iter(): 
table_nfd.append((cpt, norm)) -# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt -table_whitespace.extend(range(0x0009, 0x000D + 1)) -table_whitespace.extend(range(0x2000, 0x200A + 1)) -table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]) - - # sort by codepoint -table_whitespace.sort() table_lowercase.sort() table_uppercase.sort() table_nfd.sort() +# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +whitespace_ranges: list[tuple[int, int]] = [] # start, last +whitespace_ranges.append((0x0009, 0x000D)) +whitespace_ranges.append((0x2000, 0x200A)) +for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]: + whitespace_ranges.append((whitespace, whitespace)) + + # run length encoding, see unicode_cpt_category() in unicode.cpp assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32) codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length @@ -162,7 +162,6 @@ def out(line=""): #include #include #include -#include """) out("const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length") @@ -170,9 +169,9 @@ def out(line=""): out("0x%04X," % rle) out("};\n") -out("const std::vector unicode_vec_whitespace = {") -for codepoint in table_whitespace: - out("0x%06X," % codepoint) +out("const std::vector> unicode_ranges_whitespace = {") +for (start, last) in whitespace_ranges: + out("{0x%06X, 0x%06X}," % (start, last)) out("};\n") out("const std::unordered_map unicode_map_lowercase = {") diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 2591723ce3172..1a2ceb01739b8 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -4526,32 +4526,18 @@ const std::vector unicode_rle_codepoints_categs = { // run length enc 0x0020, }; -const std::vector unicode_vec_whitespace = { -0x000009, -0x00000A, -0x00000B, -0x00000C, -0x00000D, -0x000020, -0x000085, -0x0000A0, 
-0x001680, -0x002000, -0x002001, -0x002002, -0x002003, -0x002004, -0x002005, -0x002006, -0x002007, -0x002008, -0x002009, -0x00200A, -0x002028, -0x002029, -0x00202F, -0x00205F, -0x003000, +const std::vector> unicode_ranges_whitespace = { +{0x000009, 0x00000D}, +{0x002000, 0x00200A}, +{0x000020, 0x000020}, +{0x000085, 0x000085}, +{0x0000A0, 0x0000A0}, +{0x001680, 0x001680}, +{0x002028, 0x002028}, +{0x002029, 0x002029}, +{0x00202F, 0x00202F}, +{0x00205F, 0x00205F}, +{0x003000, 0x003000}, }; const std::unordered_map unicode_map_lowercase = { diff --git a/src/unicode-data.h b/src/unicode-data.h index 682f79c373749..447826879eaee 100644 --- a/src/unicode-data.h +++ b/src/unicode-data.h @@ -13,7 +13,7 @@ struct range_nfd { static const uint32_t MAX_CODEPOINTS = 0x110000; extern const std::vector unicode_rle_codepoints_categs; -extern const std::vector unicode_vec_whitespace; +extern const std::vector> unicode_ranges_whitespace; extern const std::unordered_map unicode_map_lowercase; extern const std::unordered_map unicode_map_uppercase; extern const std::vector unicode_ranges_nfd; diff --git a/src/unicode.cpp b/src/unicode.cpp index 725476600f2ff..6ebef0ec96e02 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -597,8 +597,10 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { } GGML_ASSERT(cpt == MAX_CODEPOINTS); - for (auto cpt : unicode_vec_whitespace) { - cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + for (auto p : unicode_ranges_whitespace) { + for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) { + cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + } } for (auto p : unicode_map_lowercase) { From c2406383749aec4c3617c90054cd1d64134fcf15 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Thu, 8 Aug 2024 01:35:20 +0200 Subject: [PATCH 21/29] Reimplement unicode_regex_split() --- src/unicode.cpp | 425 +++++++++++++++++++----------------------------- 1 file changed, 171 insertions(+), 254 deletions(-) diff --git a/src/unicode.cpp 
b/src/unicode.cpp index 6ebef0ec96e02..4a5728ed6fd88 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -644,141 +644,128 @@ uint32_t unicode_tolower(uint32_t cp) { } std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { - // std::regex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ... - // std::regex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. - // Generate a "collapsed" representation of the regex, where all unicode categories are replaced by codepoints ranges. - // Generate a "collapsed" representation of the text, where all codepoints are forced to fall into generated category ranges. - // Text codepoints not found in generated category ranges are replaced by a "collapsed" codepoint. - // This implementation generalizes the original implementation adding support to unicode subcategories: - // https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 - - // Definitions: - // - Unicode cagegory: high unicode categories, \p{C}, \p{L}, \p{M}, \p{N}, \p{P}, \p{S}, \p{Z}. - // - Unicode subcagegory: including all unicode categories, \p{Cc}, \p{Cf}, \p{Co}, \p{Cs}, ..., \p{Zs}. - // - Collapsed codepoint: unused codepoint representing a unicode subcategory. - // - Collapsed range: sequence of "collapsed" codepoint, representing one unicode category. - // - Collapsed regex: original regex including "collapsed" codepoints and ranges. - - // (1) Build the "collapsed" regex: - // (1.1) Generate a replacement list of codepoint ranges: - // (1.1.1) For each unicode category. - // (1.1.2) For each unicode subcategory. - // (1.1.3) Expand \s adding unicode whitespaces. - // (1.2) Each list includes its respective "collaped" codepoint/range. - // (1.3) [Optimization] Only build lists of categories present in the regex. - // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists. - // (2) Build a list of codepoint ranges. 
- // (2.1) If a codepoint is not found in this list, then it is "collapsable". - // (2.2) [Optimization] Only build lists of ranges present in the regex. - // (3) For each input text: - // (3.1) Search codepoints in the regex codepoint ranges. - // (3.2) If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy. - // (3.3) If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it. - - //TODO: Refactor optimizations - // Steps (1) and (2) only depends on the regex expression text. - // Step (3) needs 'regex_expr_ranges' for text "collapsing" and 'wregex_collapsed'. - // Optimization: store and reuse 'wregex_collapsed' and 'regex_expr_ranges'. - - // 0xDB80 to 0xDBFF: Private Use High Surrogate (128 range values) - static const uint32_t COLLAPSE_CPT_RANGE_FIRST = 0xDB80; - static const uint32_t COLLAPSE_CPT_RANGE_LAST = 0xDBFF; - - // return the collapsed codepoint of an unicode category or subcategory - auto category_to_collapsed_cpt = [] (const codepoint_categ categ) { - const uint16_t subindex = categ.get_subcategory() >> 7; // subcategory stored in 3 bits - switch(categ.get_category()) { // category fits in other 3 bits - case codepoint_categ::UNDEF: return COLLAPSE_CPT_RANGE_FIRST + ((0 << 3) | subindex); - case codepoint_categ::C: return COLLAPSE_CPT_RANGE_FIRST + ((1 << 3) | subindex); - case codepoint_categ::L: return COLLAPSE_CPT_RANGE_FIRST + ((2 << 3) | subindex); - case codepoint_categ::M: return COLLAPSE_CPT_RANGE_FIRST + ((3 << 3) | subindex); - case codepoint_categ::N: return COLLAPSE_CPT_RANGE_FIRST + ((4 << 3) | subindex); - case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); - case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex); - case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); - default: GGML_ABORT("invalid category"); + // std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} 
... + // std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. + // std::wregex allows full wchar_t 32 bit codepoints, not limited to standard max 0x110000. + // The main idea is to insert unicode category bits into all regex and text codepoints. + // Max unicode codepoint 0x110000 fits in 21 bits. + // Store unicode category and subcategory in 10 bits. + // Set the high bit to zero to keep wchar_t positive (uint32_t codepoints). + // Categorized codepoint: + // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint + // 0b0'XXXXXXX'xxx'ccccccccccccccccccccc + // A "categorized codepoint" re-defines the ordering keeping category hierarchy. + // All high category codepoints \p{X} fall into the range: + // 0b0'XXXXXXX'000'000000000000000000000 + // 0b0'XXXXXXX'111'111111111111111111111 + // All subcategory codepoints \p{Xx} fall into the range: + // 0b0'XXXXXXX'xxx'000000000000000000000 + // 0b0'XXXXXXX'xxx'111111111111111111111 + // Processing steps: + // Build a lists of "categorized codepoints/ranges" for replacing regex \s \w and \d. + // Replace all regex codepoints/ranges with respective "categorized codepoints/ranges". + // Replace all text codepoints with respective "categorized codepoints". + // Caveats: + // Some regex ranges starts and ends with different category/subcategory. + // Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy. + // This forces iterating all ranges and could produce long sub-range sequences. + + //TODO: Regex processing can be cached. 
+ + // insert unicode category and subcategory before codepoint bits + // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits zero + static const auto categorized_prefix = [] (const codepoint_categ categ) -> wchar_t { + static const uint32_t MASK = codepoint_categ::MASK; // category mask + static const uint32_t SUBMASK = codepoint_categ::SUBMASK & ~codepoint_categ::MASK; // subcategory mask + return (wchar_t) (((categ.encoded & MASK) << (21+3)) | ((categ.encoded & SUBMASK) << (21-7))); + }; + + // insert unicode category and subcategory before codepoint bits + // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint + static const auto categorize_codepoint = [] (const uint32_t cpt) -> wchar_t { + GGML_ASSERT(cpt < (1 << 21)); + return categorized_prefix(unicode_cpt_category(cpt)) | (wchar_t)cpt; + }; + + // remove the categorized prefix bits and restore original codepoint bits + static const auto decategorize_codepoint = [] (const wchar_t cpt) -> uint32_t { + return (uint32_t) cpt & ((1 << 21) - 1); + }; + + // returns the respective categorized codepoint range of the category/subcategory + static const auto categorize_range_from_chars = [] (const char categ, const char subcateg) { + const wchar_t range_ini = categorized_prefix(codepoint_categ::from_chars(categ, subcateg)); + const wchar_t range_end = (wchar_t) (range_ini | (subcateg ? 
(1<<21)-1 : (1<<24)-1)); + return std::pair(range_ini, range_end); + }; + + // helper function to append/concat regex expressions + auto wregex_append_subregex = [] (std::wstring & wregex, const std::wstring & subregex, const bool add_squares, const bool negated) { + if (add_squares) { + wregex += '['; + if (negated) { + wregex += '^'; + } + wregex += subregex; + wregex += ']'; + } else { + GGML_ASSERT(!negated); //TODO: negation inside square brackets: \S \W \D + wregex += subregex; } }; - // return the collapsed range of an unicode category (range including all subcategories) - auto category_to_collapsed_range = [&] (const codepoint_categ categ) { - // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes - // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes - // \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no - GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0x7) == 0); - const uint32_t collapsed = category_to_collapsed_cpt(categ); - const uint32_t range = (collapsed & 0x7) ? 0 : 0x7; // has subcategory ? 
- return std::pair(collapsed, collapsed + range); + // \d digits replacement + static const std::wstring wregex_digits = { + categorize_codepoint('0'), '-', categorize_codepoint('9'), }; - GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t)); + // \w words replacement + static const std::wstring wregex_words = { + categorize_codepoint('_'), + categorize_codepoint('0'), '-', categorize_codepoint('9'), + categorize_codepoint('A'), '-', categorize_codepoint('Z'), + categorize_codepoint('a'), '-', categorize_codepoint('z'), + }; - const auto cpts = unicode_cpts_from_utf8(text); + // \s whitespaces replacement + static const std::wstring wregex_whitespaces = [] { + std::wstring wregex_whitespaces; + for (const auto & range : unicode_ranges_whitespace) { + wregex_whitespaces += categorize_codepoint(range.first); + if (range.second > range.first) { + wregex_whitespaces += '-'; + wregex_whitespaces += categorize_codepoint(range.second); + } + } + return wregex_whitespaces; + }(); + + GGML_ASSERT(sizeof(wchar_t) == sizeof(uint32_t)); + std::wstring wtext = unicode_wstring_from_utf8(text); - std::vector bpe_offsets = { cpts.size() }; + std::vector offsets = { wtext.size() }; for (auto & regex_expr : regex_exprs) { // first, see if we have an efficient custom regex implementation - auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets); + auto tmp = unicode_regex_split_custom(text, regex_expr, offsets); if (!tmp.empty()) { - bpe_offsets = std::move(tmp); + offsets = std::move(tmp); continue; } - std::vector> regex_expr_ranges; // start codepoint, last codepoint - std::vector> regex_expr_categs; // offset, codepoint category - std::map map_categ_wregex; // categ --> regex utf32 string - std::wstring wregex_collapsed; - std::wstring wtext_collapsed; + std::wstring wregex; bool inside_square = false; bool is_cpt_range = false; - // (2) Build a list of codepoint ranges - // common ranges: \w \d - regex_expr_ranges.emplace_back('a', 'z'); - 
regex_expr_ranges.emplace_back('A', 'Z'); - regex_expr_ranges.emplace_back('0', '9'); - regex_expr_ranges.emplace_back('_', '_'); - - // (2) Build a list of codepoint ranges - // common ranges: \s - for (uint32_t cpt : unicode_vec_whitespace) { - const auto categ_prev = unicode_cpt_category(regex_expr_ranges.back().second); - const auto categ_last = unicode_cpt_category(cpt); - if (categ_prev == categ_last && regex_expr_ranges.back().second + 1 == cpt) { - regex_expr_ranges.back().second = cpt; - } else { - regex_expr_ranges.emplace_back(cpt, cpt); - } - } - - // (1.1.3) Expand \s adding unicode whitespaces. - // std::wregex \s does not match non-ASCII whitespaces - static const codepoint_categ categ_whitespace(codepoint_categ::MASK + 1); // UNDEF category, subcategory 1 - std::wstring & wregex_whitespaces = map_categ_wregex[categ_whitespace.get_subcategory()]; - wregex_whitespaces += L"\\s"; - for (uint32_t cpt : unicode_vec_whitespace) { - if (cpt >= 0x80) { // non-ASCII whitespaces - if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) { - if (*(wregex_whitespaces.end() - 2) == '-') { - wregex_whitespaces.back() = cpt; - } else { - wregex_whitespaces += '-'; - wregex_whitespaces += cpt; - } - } else { - wregex_whitespaces += (wchar_t) cpt; - } - } - } - const auto cpts_regex = unicode_cpts_from_utf8(regex_expr); + wregex.reserve(2 * cpts_regex.size()); for (size_t i = 0; i < cpts_regex.size(); ++i) { uint32_t cpt = cpts_regex[i]; - // skip regex metacharacters + // parse regex metacharacters + wregex += (wchar_t) cpt; if (inside_square) { switch(cpt) { case '^': @@ -811,6 +798,7 @@ std::vector unicode_regex_split(const std::string & text, const std case '{': while (cpt && cpt != '}') { cpt = cpts_regex[++i]; + wregex += (wchar_t) cpt; } continue; case '}': @@ -819,12 +807,19 @@ std::vector unicode_regex_split(const std::string & text, const std case '(': if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (? 
unicode_regex_split(const std::string & text, const std continue; } } + wregex.pop_back(); - // parse unicode categories and subcategories + // parse unicode categories and subcategories, replace category with the categorized range if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') { GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]); - codepoint_categ categ = {}; + std::pair range; if (cpts_regex[i + 4] == '}') { - categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]); + range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)'\0'); + i += 4; } else { - categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); - GGML_ASSERT(cpts_regex[i + 5] == '}'); + range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); + i += 5; } - // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. - categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' - regex_expr_categs.emplace_back((uint32_t)i, categ); - i += cpts_regex[i + 4] == '}' ? 4 : 5; + GGML_ASSERT(cpts_regex[i] == '}'); + const std::wstring subregex = {range.first, '-', range.second}; + wregex_append_subregex(wregex, subregex, !inside_square, false); continue; } - if (cpt == '\\') { - if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') { // \s \S - // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. 
- regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace); - //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' - regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square); - i += 1; - continue; - } - } - // parse more metcharacters and espaped characters if (cpt == '\\') { switch (cpts_regex[i + 1]) { - case 's': ++i; continue; // \s whitespaces - case 'w': ++i; continue; // \w words - case 'd': ++i; continue; // \d digits - case 'S': ++i; continue; // \S no whitespaces - case 'W': ++i; continue; // \W no words - case 'D': ++i; continue; // \D no digits + case 's': // \s whitespaces + case 'S': // \S no whitespaces + wregex_append_subregex(wregex, wregex_whitespaces, !inside_square, cpts_regex[++i] == 'S'); + continue; + case 'w': // \w words + case 'W': // \W no words + wregex_append_subregex(wregex, wregex_words, !inside_square, cpts_regex[++i] == 'W'); + continue; + case 'd': // \d digits + case 'D': // \D no digits + wregex_append_subregex(wregex, wregex_digits, !inside_square, cpts_regex[++i] == 'D'); + continue; case 't': ++i; cpt = '\t'; break; case 'r': ++i; cpt = '\r'; break; case 'n': ++i; cpt = '\n'; break; @@ -886,139 +877,65 @@ std::vector unicode_regex_split(const std::string & text, const std GGML_ASSERT(!is_cpt_range); cpt = cpts_regex[++i]; GGML_ASSERT(cpt < 0x80); - break; + break; } } - // ensure there is not a collission with any "collapsed" codepoints - GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); - - // (2) Build a list of codepoint ranges if (is_cpt_range) { - is_cpt_range = false; - regex_expr_ranges.back().second = cpt; - } else { - regex_expr_ranges.emplace_back(cpt, cpt); - } - } - - // assign collapsed codepoint to each category regex \p{...} - for (auto offset_categ : regex_expr_categs) { - const uint16_t subcateg = offset_categ.second.get_subcategory(); - auto it = map_categ_wregex.find(subcateg); - if (it == map_categ_wregex.end()) { - // (1.2) Each 
list includes its respective "collaped" codepoint/range. - const auto collapsed_range = category_to_collapsed_range(offset_categ.second); - map_categ_wregex[subcateg] = (wchar_t) collapsed_range.first; - if (collapsed_range.first < collapsed_range.second) { - map_categ_wregex[subcateg] += (wchar_t) '-'; - map_categ_wregex[subcateg] += (wchar_t) collapsed_range.second; - } - } - } - - // copy found regex ranges to each category regex - uint32_t regex_expr_ranges_uniques = 0; - std::pair prev_range = {0, -1}; - std::sort(regex_expr_ranges.begin(), regex_expr_ranges.end()); - for (auto range : regex_expr_ranges) { - range.first = std::max(range.first, prev_range.second + 1); // prevent overlapping //TODO: as error? - if (range.first > range.second) { // skip overlapping and repetitions - continue; - } - // (1.1) Generate a replacement list of codepoint ranges - codepoint_categ categ = unicode_cpt_category(range.first); - GGML_ASSERT(categ == unicode_cpt_category(range.second)); - auto it0 = map_categ_wregex.find(categ.get_category()); - auto it1 = map_categ_wregex.find(categ.get_subcategory()); - for (const auto & it : {it0, it1}) { - if (it != map_categ_wregex.end()) { - it->second += (wchar_t) range.first; - if (range.first < range.second) { - it->second += (wchar_t) '-'; - it->second += (wchar_t) range.second; + // Some regex ranges starts and ends with different category/subcategory. + // Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy. + // Warning: This forces iterating all ranges and could produce long sub-range sequences. 
+ GGML_ASSERT(wregex.size() && wregex.back() == '-'); + wregex.pop_back(); + wchar_t categorized = wregex.back(); + uint32_t range_ini = decategorize_codepoint(categorized); + const uint32_t range_end = cpt; + GGML_ASSERT(range_ini <= range_end); + codepoint_categ range_categ = unicode_cpt_category(range_ini); + for (cpt = range_ini + 1; cpt <= range_end; ++cpt) { + codepoint_categ categ = unicode_cpt_category(cpt); + if (categ == range_categ) { // still same range category ? + ++categorized; + if (cpt == range_ini + 1) { // single step, no need range + wregex += categorized; + } else if (cpt == range_ini + 2) { // need range if +2 step + wregex.back() = '-'; + wregex += categorized; + } else { + wregex.back() = categorized; // keep range growing + } + } else { // new range category + categorized = categorize_codepoint(cpt); + wregex += categorized; + range_categ = categ; + range_ini = cpt; } } - } - prev_range = range; - regex_expr_ranges[regex_expr_ranges_uniques++] = range; - } - regex_expr_ranges.resize(regex_expr_ranges_uniques); - - // replace categories with respective collapsed codepoint and ranges - uint32_t i = 0; - wregex_collapsed.reserve(regex_expr.size()); - for (auto offset_categ : regex_expr_categs) { - while (i < offset_categ.first) { // copy original regex until reaching the category - wregex_collapsed += (wchar_t) cpts_regex[i]; - i++; - } - GGML_ASSERT(cpts_regex[i] == '\\'); - const uint32_t cpt_next = cpts_regex[i + 1]; - const bool is_negated = cpt_next < 'a'; // is uppercase - if (cpt_next == 'p' || cpt_next == 'P') { - GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]); - i += cpts_regex[i + 4] == '}' ? 
5 : 6; - GGML_ASSERT(cpts_regex[i - 1] == '}'); + is_cpt_range = false; } else { - GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d - cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D - i += 2; + wregex += categorize_codepoint(cpt); } - // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists. - const codepoint_categ categ = offset_categ.second; - auto it = map_categ_wregex.find(categ.get_subcategory()); - GGML_ASSERT(it != map_categ_wregex.end()); - if (it != map_categ_wregex.end()) { - if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE - GGML_ASSERT(is_negated == false); - wregex_collapsed += it->second; - } else if(it->second.size() == 1 && !is_negated) { - wregex_collapsed += it->second; - } else { - wregex_collapsed += '['; - if (is_negated) { - wregex_collapsed += '^'; - } - wregex_collapsed += it->second; - wregex_collapsed += ']'; - } - } - } - while (i < (uint32_t)cpts_regex.size()) { - wregex_collapsed += cpts_regex[i]; - i++; } - // collapse text codepoints not included in 'regex_expr_ranges' - wtext_collapsed.reserve(cpts.size()); - for (uint32_t cpt : cpts) { - const codepoint_categ categ = unicode_cpt_category(cpt); - // (3.1) Search codepoints in the regex codepoint ranges. - auto it = std::lower_bound(regex_expr_ranges.begin(), regex_expr_ranges.end(), cpt, - [] (const std::pair range, const uint32_t cpt) { - return range.second < cpt; - } - ); - if (it == regex_expr_ranges.end() || cpt < it->first || it->second < cpt) { - // (3.3) If not found, replace with its "collapsed" codepoint so the "collapsed" regex can process it. 
- cpt = category_to_collapsed_cpt(categ); // not found, collapse to category codepoint + // categorize all wtext codepoints + if (wtext.size() && wtext[0] < MAX_CODEPOINTS) { // if not already categorized + for (size_t i = 0; i < wtext.size(); ++i) { + wtext[i] = categorize_codepoint((uint32_t) wtext[i]); } - // (3.2) If found, it is a valid codepoint (the "collapsed" regex uses it), literal copy. - wtext_collapsed += (wchar_t) cpt; } - bpe_offsets = unicode_regex_split_stl(wtext_collapsed, wregex_collapsed, bpe_offsets); + offsets = unicode_regex_split_stl(wtext, wregex, offsets); } std::vector bpe_words; - bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size + bpe_words.reserve(offsets.size()); // reserve memory for the approximate size size_t start = 0; - for (size_t & offset : bpe_offsets) { + for (size_t & offset : offsets) { bpe_words.emplace_back(); for (size_t i = start; i < start + offset; ++i) { - bpe_words.back() += unicode_cpt_to_utf8(cpts[i]); + const uint32_t cpt = decategorize_codepoint(wtext[i]); + bpe_words.back() += unicode_cpt_to_utf8(cpt); } start += offset; } From 312c4322cc4d607fa4a81724c859672605e03c33 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 16:30:30 +0200 Subject: [PATCH 22/29] Remove invalid assert --- src/llama-vocab.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 5eeae05858ebb..6192fd195746f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -710,7 +710,6 @@ struct llm_tokenizer_wpm { continue; } - assert (!categ.is_S()); if (cpt == 0 || cpt == 0xFFFD || categ.is_C()) { continue; } From b565148cb43b732327a4c515b6b484d55dc53f9b Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 16:42:33 +0200 Subject: [PATCH 23/29] Update codepoint_categ: - Reorganize category/subcategory bits. - Regex flags for \s \w \d. 
--- src/unicode.cpp | 22 +++--- src/unicode.h | 176 ++++++++++++++++++++++++------------------------ 2 files changed, 96 insertions(+), 102 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 4a5728ed6fd88..20c1287c43199 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -209,7 +209,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; + static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1; auto _get_categ = [&] (const size_t pos) -> codepoint_categ { return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL; }; @@ -328,7 +328,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; + static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1; auto _get_categ = [&] (const size_t pos) -> codepoint_categ { return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_category(cpts[pos]) : SENTINEL; }; @@ -589,28 +589,24 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { for (uint16_t rle : unicode_rle_codepoints_categs) { const uint32_t index = rle & 31; const uint32_t count = rle >> 5; - const auto categ = codepoint_categ::from_index(index); - //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); + auto categ = codepoint_categ::from_index(index); + //printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); + categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd()); // \d --> \p{Nd} + categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N()); // \w --> \p{L} \p{N} _ for (uint32_t i = 0; i <= count; ++i) { cpt_categs[cpt++] = categ; } } GGML_ASSERT(cpt == MAX_CODEPOINTS); + cpt_categs['_'].set_flag(codepoint_categ::WORDS); // \w --> \p{L} \p{N} _ + for (auto p : unicode_ranges_whitespace) { for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) { - cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES); } } - for (auto p : unicode_map_lowercase) { - cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE); - } - - for (auto p : unicode_map_uppercase) { - cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE); - } - //for (auto &range : unicode_ranges_nfd) { // start, last, nfd // cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD); //} diff --git a/src/unicode.h b/src/unicode.h index 8a3f4078ca79b..3aeb74771eb7f 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -9,74 +9,71 @@ #include struct codepoint_categ { + // 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory enum _category : uint16_t { - UNDEF = 0, // \p{Cn} Undefined - C = 1 << 0, // \p{C} Control - L = 1 << 1, // \p{L} Letter - M = 1 << 2, // \p{M} Mark - N = 1 << 3, // \p{N} Number - P = 1 << 4, // \p{P} Punctuation - S = 1 << 5, // \p{S} Symbol - Z = 1 << 6, // \p{Z} Separator - MASK = (1 << 7) - 1 
// 7 bits - }; - - enum _subcategory : uint16_t { - Cc = C | (1 << 7), // \p{Cc} Control - Cf = C | (2 << 7), // \p{Cf} Format - Co = C | (3 << 7), // \p{Co} Private Use - Cs = C | (4 << 7), // \p{Cs} Surrrogate - Ll = L | (1 << 7), // \p{Ll} Lowercase Letter - Lm = L | (2 << 7), // \p{Lm} Modifier Letter - Lo = L | (3 << 7), // \p{Lo} Other Letter - Lt = L | (4 << 7), // \p{Lt} Titlecase Letter - Lu = L | (5 << 7), // \p{Lu} Uppercase Letter - Mc = M | (1 << 7), // \p{Mc} Spacing Mark - Me = M | (2 << 7), // \p{Me} Enclosing Mark - Mn = M | (3 << 7), // \p{Mn} Nonspacing Mark - Nd = N | (1 << 7), // \p{Nd} Decimal Number - Nl = N | (2 << 7), // \p{Nl} Letter Number - No = N | (3 << 7), // \p{No} Other Number - Pc = P | (1 << 7), // \p{Pc} Connector Punctuation - Pd = P | (2 << 7), // \p{Pd} Dash Punctuation - Pe = P | (3 << 7), // \p{Pe} Close Punctuation - Pf = P | (4 << 7), // \p{Pf} Final Punctuation - Pi = P | (5 << 7), // \p{Pi} Initial Punctuation - Po = P | (6 << 7), // \p{Po} Other Punctuation - Ps = P | (7 << 7), // \p{Ps} Open Punctuation - Sc = S | (1 << 7), // \p{Sc} Currency Symbol - Sk = S | (2 << 7), // \p{Sk} Modifier Symbol - Sm = S | (3 << 7), // \p{Sm} Math Symbol - So = S | (4 << 7), // \p{So} Other Symbol - Zl = Z | (1 << 7), // \p{Zl} Line Separator - Zp = Z | (2 << 7), // \p{Zp} Paragraph Separator - Zs = Z | (3 << 7), // \p{Zs} Space Separator - SUBMASK = (1 << 10) - 1 // 7+3 bits + UNDEF = 0, // \p{Cn} Undefined + C = 1 << (0 + 3), // \p{C} Control + L = 1 << (1 + 3), // \p{L} Letter + M = 1 << (2 + 3), // \p{M} Mark + N = 1 << (3 + 3), // \p{N} Number + P = 1 << (4 + 3), // \p{P} Punctuation + S = 1 << (5 + 3), // \p{S} Symbol + Z = 1 << (6 + 3), // \p{Z} Separator + Cc = C | 1, // \p{Cc} Control + Cf = C | 2, // \p{Cf} Format + Co = C | 3, // \p{Co} Private Use + Cs = C | 4, // \p{Cs} Surrrogate + Ll = L | 1, // \p{Ll} Lowercase Letter + Lm = L | 2, // \p{Lm} Modifier Letter + Lo = L | 3, // \p{Lo} Other Letter + Lt = L | 4, // \p{Lt} 
Titlecase Letter + Lu = L | 5, // \p{Lu} Uppercase Letter + Mc = M | 1, // \p{Mc} Spacing Mark + Me = M | 2, // \p{Me} Enclosing Mark + Mn = M | 3, // \p{Mn} Nonspacing Mark + Nd = N | 1, // \p{Nd} Decimal Number + Nl = N | 2, // \p{Nl} Letter Number + No = N | 3, // \p{No} Other Number + Pc = P | 1, // \p{Pc} Connector Punctuation + Pd = P | 2, // \p{Pd} Dash Punctuation + Pe = P | 3, // \p{Pe} Close Punctuation + Pf = P | 4, // \p{Pf} Final Punctuation + Pi = P | 5, // \p{Pi} Initial Punctuation + Po = P | 6, // \p{Po} Other Punctuation + Ps = P | 7, // \p{Ps} Open Punctuation + Sc = S | 1, // \p{Sc} Currency Symbol + Sk = S | 2, // \p{Sk} Modifier Symbol + Sm = S | 3, // \p{Sm} Math Symbol + So = S | 4, // \p{So} Other Symbol + Zl = Z | 1, // \p{Zl} Line Separator + Zp = Z | 2, // \p{Zp} Paragraph Separator + Zs = Z | 3, // \p{Zs} Space Separator + SUBMASK = (1 << 3) - 1, // 3 bits 0b000000'0000000'111 + MASK = (1 << 10) - 1, // 7+3 bits 0b000000'1111111'111 }; enum _flags : uint16_t { - WHITESPACE = (1 << 10), // regex: \s - LOWERCASE = (1 << 11), - UPPERCASE = (1 << 12), + WHITESPACES = (1 << 10), // regex: \s + WORDS = (1 << 11), // regex: \w + DIGITS = (1 << 12), // regex: \d //Norm NFD/NFC = ..., }; inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {} inline void set_flag(_flags flags, bool value = true) { - flags = (_flags) (flags & ~SUBMASK); // ignore category bits + flags = (_flags) (flags & ~MASK); // do not modify category bits encoded = value ? 
(encoded | flags) : (encoded & ~flags); } inline uint16_t get_category() const { return encoded & MASK; } - inline uint16_t get_subcategory() const { return encoded & SUBMASK; } inline bool is_undefined() const { return !encoded; } inline bool is_defined() const { return encoded; } - inline uint16_t is_whitespace() const { return encoded & WHITESPACE; } - inline uint16_t is_lowercase() const { return encoded & LOWERCASE; } - inline uint16_t is_uppercase() const { return encoded & UPPERCASE; } + inline uint16_t is_whitespace() const { return encoded & WHITESPACES; } + inline uint16_t is_word() const { return encoded & WORDS; } + inline uint16_t is_digit() const { return encoded & DIGITS; } inline uint16_t is_C() const { return encoded & C; } inline uint16_t is_L() const { return encoded & L; } @@ -86,35 +83,35 @@ struct codepoint_categ { inline uint16_t is_S() const { return encoded & S; } inline uint16_t is_Z() const { return encoded & Z; } - inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; } - inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; } - inline bool is_Co() const { return (encoded & SUBMASK) == Co; } - inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; } - inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; } - inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; } - inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; } - inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; } - inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; } - inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; } - inline bool is_Me() const { return (encoded & SUBMASK) == Me; } - inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; } - inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; } - inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; } - inline bool is_No() const { return (encoded & SUBMASK) == No; } - inline bool is_Pc() const { return (encoded & 
SUBMASK) == Pc; } - inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; } - inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; } - inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; } - inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; } - inline bool is_Po() const { return (encoded & SUBMASK) == Po; } - inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; } - inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; } - inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; } - inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; } - inline bool is_So() const { return (encoded & SUBMASK) == So; } - inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; } - inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; } - inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; } + inline bool is_Cc() const { return (encoded & MASK) == Cc; } + inline bool is_Cf() const { return (encoded & MASK) == Cf; } + inline bool is_Co() const { return (encoded & MASK) == Co; } + inline bool is_Cs() const { return (encoded & MASK) == Cs; } + inline bool is_Ll() const { return (encoded & MASK) == Ll; } + inline bool is_Lm() const { return (encoded & MASK) == Lm; } + inline bool is_Lo() const { return (encoded & MASK) == Lo; } + inline bool is_Lt() const { return (encoded & MASK) == Lt; } + inline bool is_Lu() const { return (encoded & MASK) == Lu; } + inline bool is_Mc() const { return (encoded & MASK) == Mc; } + inline bool is_Me() const { return (encoded & MASK) == Me; } + inline bool is_Mn() const { return (encoded & MASK) == Mn; } + inline bool is_Nd() const { return (encoded & MASK) == Nd; } + inline bool is_Nl() const { return (encoded & MASK) == Nl; } + inline bool is_No() const { return (encoded & MASK) == No; } + inline bool is_Pc() const { return (encoded & MASK) == Pc; } + inline bool is_Pd() const { return (encoded & MASK) == Pd; } + inline bool is_Pe() const { return (encoded & 
MASK) == Pe; } + inline bool is_Pf() const { return (encoded & MASK) == Pf; } + inline bool is_Pi() const { return (encoded & MASK) == Pi; } + inline bool is_Po() const { return (encoded & MASK) == Po; } + inline bool is_Ps() const { return (encoded & MASK) == Ps; } + inline bool is_Sc() const { return (encoded & MASK) == Sc; } + inline bool is_Sk() const { return (encoded & MASK) == Sk; } + inline bool is_Sm() const { return (encoded & MASK) == Sm; } + inline bool is_So() const { return (encoded & MASK) == So; } + inline bool is_Zl() const { return (encoded & MASK) == Zl; } + inline bool is_Zp() const { return (encoded & MASK) == Zp; } + inline bool is_Zs() const { return (encoded & MASK) == Zs; } inline bool operator == (const codepoint_categ other) const { return encoded == other.encoded; @@ -132,7 +129,7 @@ struct codepoint_categ { {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"}, {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"}, }; - const auto it = map.find(encoded & SUBMASK); + const auto it = map.find(encoded & MASK); return it == map.end() ? "INVALID" : it->second; } @@ -149,18 +146,19 @@ struct codepoint_categ { return 0; } const char * p = strchr(subcategs, subcateg); - return (uint16_t) (p ? 
(p - subcategs + 1) : 0); + GGML_ASSERT(p); + return (uint16_t) (p - subcategs + 1); }; switch(categ) { case 'C': if(subcateg == 'n') return 0; // undefined - return C | (_subindex(subcateg, "cfos" ) << 7); - case 'L': return L | (_subindex(subcateg, "lmotu" ) << 7); - case 'M': return M | (_subindex(subcateg, "cen" ) << 7); - case 'N': return N | (_subindex(subcateg, "dlo" ) << 7); - case 'P': return P | (_subindex(subcateg, "cdefios") << 7); - case 'S': return S | (_subindex(subcateg, "ckmo" ) << 7); - case 'Z': return Z | (_subindex(subcateg, "lps" ) << 7); - default: assert (false); return 0; + return C | _subindex(subcateg, "cfos" ); + case 'L': return L | _subindex(subcateg, "lmotu" ); + case 'M': return M | _subindex(subcateg, "cen" ); + case 'N': return N | _subindex(subcateg, "dlo" ); + case 'P': return P | _subindex(subcateg, "cdefios"); + case 'S': return S | _subindex(subcateg, "ckmo" ); + case 'Z': return Z | _subindex(subcateg, "lps" ); + default: GGML_ABORT("invalid category character"); } } From 5a93d2ec504c649ccd7cfb6ff8c23a5bd3105894 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 17:38:46 +0200 Subject: [PATCH 24/29] Reimplement unicode_regex_split(): - Using std::basic_regex. - Custom std::ctype specialization for 32bits codepoints. - Custom std::regex_traits specialization for 32bits codepoints. - Implementing custom 'character class expression' for \p{Xx}. - Single pass regex preparation. 
--- src/unicode.cpp | 577 +++++++++++++++++++++--------------------------- src/unicode.h | 17 ++ 2 files changed, 269 insertions(+), 325 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 20c1287c43199..988dd35e4d3c2 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -451,76 +451,271 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return bpe_offsets; } -// use std::wregex to split the text -static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { - std::wregex expr(regex_expr); - std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - for (auto offset : offsets) { - std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr); - std::wcregex_iterator end; +static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { + std::vector bpe_offsets; - int64_t start_idx = 0; - while (it != end) { - std::wcmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); + if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); + } else if ( + regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || + regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { + + bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + } + + return bpe_offsets; +} + +// Custom std::regex specializations for 32bit unicode codepoints +// std::wregex does not support unicode categories: \p{N}, 
\p{L}, \p{Lu}, \p{Ll} ... +// std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. +// std::wregex supports full 32 bit codepoints, not limited to standard max 0x110000. +namespace std { + using codepoint = uint32_t; // codepoint type for all template specializations + + // Minimal required implementation for std::regex string processing + template<> // custom specialized std::ctype + class ctype { + public: + + using CharT = codepoint; + using char_type = CharT; + + using mask = uint8_t; //NOTE: see std::ctype_base + static const mask digit = 1; // requiered variable names + static const mask xdigit = 2; // user defined values + static const mask alpha = 3; // used to be a bitmask + static const mask upper = 4; // we do not need a bitmask + static const mask lower = 5; // using a sequence instead + + static locale::id id; // required by std::locale::facet + + bool is(mask m, char_type c) const { + switch (m) { + case digit: return ('0' <= c && c <= '9'); + case xdigit: return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F'); + case alpha: return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); + case upper: return ('A' <= c && c <= 'Z'); + case lower: return ('a' <= c && c <= 'z'); + default: return false; } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); - ++it; } - if (start_idx < (int64_t) offset) { - bpe_offsets.emplace_back(offset - start_idx); + char_type toupper(char_type c) const { + return ('a' <= c && c <= 'z') ? c - ('a' - 'A') : c; } - start += offset; + + char_type tolower(char_type c) const { + return ('A' <= c && c <= 'Z') ? c + ('a' - 'A') : c; + } + + char_type widen(char c) const { // char to codepoint + return (char_type) c; + } + + char narrow(char_type c, char dfault) const { // codepoint to char + return (c < 0x80 ? 
(char)c : dfault); + } + }; + + locale::id ctype::id = {}; + + template<> // specialization to use our custom specialized std::ctype + const std::ctype & use_facet>(const std::locale &) { + static std::ctype ctype_uint32 = {}; + return ctype_uint32; } - return bpe_offsets; + template<> // specialization to use our custom specialized std::ctype + const std::ctype & use_facet>(const std::locale & loc) { + return use_facet>(loc); + } + + // Minimal required implementation for std::regex string processing + template<> // custom specialized std::regex_traits + class regex_traits { + public: + + using CharT = codepoint; + using char_type = codepoint; + using size_type = size_t; + using string_type = std::basic_string; + using locale_type = std::locale; + using char_class_type = uint64_t; + + #if (defined(_WIN32) || defined(_WIN64)) // MSVC class _Regex_traits + using _Uelem = CharT; + static const auto _Ch_upper = std::ctype::upper; + static const auto _Ch_alpha = std::ctype::alpha; + #endif + + static size_type length(const CharT * str) { + return std::char_traits::length(str); + } + + CharT translate(CharT c) const { + return c; + } + + CharT translate_nocase(CharT c) const { + return unicode_tolower(c); + } + + template + string_type transform(It first, It last) const { + GGML_ASSERT(false); //TODO: not needed ? 
+ return {first, last}; //TODO: not tested + } + + template + string_type transform_primary(It first, It last) const { + (void) first; + (void) last; + GGML_ASSERT(*first < MAX_CODEPOINTS); // valid codepoint + return {}; + } + + template + string_type lookup_collatename(It first, It last) const { + (void) last; + GGML_ASSERT(*first & (1 << 31)); + return {*first}; + } + + template + char_class_type lookup_classname(It first, It last, bool icase = false) const { + (void) last; + (void) icase; + const uint32_t encoded = *first; + codepoint_categ categ = {}; + switch(encoded) { + case 's': + case 'S': // negation is internally tracked + categ.set_flag(codepoint_categ::WHITESPACES); + return categ.expand_bits(); + case 'w': + case 'W': // negation is internally tracked + categ.set_flag(codepoint_categ::WORDS); + return categ.expand_bits(); + case 'd': + case 'D': // negation is internally tracked + categ.set_flag(codepoint_categ::DIGITS); + return categ.expand_bits(); + default: { // unicode category \p{Xx} encoded in codepoint + GGML_ASSERT(encoded & (1 << 31)); // make sure its our custom codepoint encoding the category + const bool negated = encoded & (1 << 30); // negation of 'character class expression' are not internally tracked + categ = {(uint16_t) encoded}; + return ((uint64_t) negated << 63) | categ.expand_bits(false); + } + } + } + + bool isctype(CharT c, char_class_type mask) const { + const bool negated = mask & (1llu << 63); + mask &= unicode_cpt_category(c).expand_bits(); + return negated ^ (bool) mask; + } + + int value(CharT c, int radix) const { // char to int value + switch (radix) { + case 8: return ('0' <= c && c <= '7') ? (int)c - '0' : -1; + case 10: return ('0' <= c && c <= '9') ? (int)c - '0' : -1; + case 16: return ('0' <= c && c <= '9') ? (int)c - '0' : (('A' <= c && c <= 'F') ? 
(int)c - 'A' + 10 : -1); + default: return -1; + } + } + + const locale_type & imbue(const locale_type &) { // set locale //NOTE: ignoring locales + return std::locale::classic(); + } + + const locale_type & getloc() const { // get locale //NOTE: ignoring locales + return std::locale::classic(); + } + }; +} + +static std::vector unicode_regex_prepare(const std::string & regex) { + std::vector regex_cpts; + regex_cpts.reserve(regex.size() * 12 / 10); // estimate +20% + + size_t offset = 0; + int inside_square = 0; + bool any_positive = false; + bool any_negative = false; + + const size_t size = regex.size(); + while (offset < size) { + inside_square += regex[offset] == '['; + inside_square -= regex[offset] == ']'; + GGML_ASSERT(inside_square >= 0); + if (!inside_square) { + any_positive = false; + any_negative = false; + } + + if (regex[offset] == '\\') { + const size_t i = offset + 1; + if (regex[i] == 'p' || regex[i] == 'P') { + // convert \p{Xx} to custom 'character class expression' [:Xy:] + if (regex[i + 1] == '{' && regex[i + 2] && regex[i + 3]) { + codepoint_categ categ = {}; + if (regex[i + 3] == '}') { + categ = codepoint_categ::from_chars(regex[i + 2]); + offset += 5; + } else if (regex[i + 3] != '}' && regex[i + 4] == '}') { + categ = codepoint_categ::from_chars(regex[i + 2], regex[i + 3]); + offset += 6; + } + bool negated = regex[i] == 'P'; + any_positive |= !negated; + any_negative |= negated; + GGML_ASSERT(any_positive != any_negative); //BUG: can not mix 'p' and 'P' inside [] + GGML_ASSERT(sizeof(categ) <= 2); + // encoded category in 32 bits codepoint + uint32_t cpt_categ = (1 << 31) | (negated << 30) | categ.encoded; + if (inside_square) { + regex_cpts.insert(regex_cpts.end(), {'[', ':', cpt_categ, ':', ']'}); + } else { + regex_cpts.insert(regex_cpts.end(), {'[', '[', ':', cpt_categ, ':', ']', ']'}); + } + continue; + } + } + } + + regex_cpts.push_back(unicode_cpt_from_utf8(regex, offset)); + } + + return regex_cpts; } -// use std::regex to split 
the text -static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::regex expr(regex_expr); +// use std::basic_regex to split the text codepoints +static std::vector unicode_regex_split_stl(const std::vector & text_cpts, const std::vector & regex_cpts, const std::vector & offsets) { + using regex_type = std::basic_regex; + using iter_type = std::regex_iterator; + regex_type regex(regex_cpts.begin(), regex_cpts.end()); + const iter_type end; + std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; + bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size + const uint32_t * text_data = text_cpts.data(); for (auto offset : offsets) { - std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); - std::cregex_iterator end; - + iter_type it(text_data, text_data + offset, regex); int64_t start_idx = 0; while (it != end) { - std::cmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); + if (it->position() > start_idx) { + bpe_offsets.emplace_back(it->position() - start_idx); } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); + bpe_offsets.emplace_back(it->length()); + start_idx = it->position() + it->length(); ++it; } if (start_idx < (int64_t) offset) { bpe_offsets.emplace_back(offset - start_idx); } - start += offset; - } - - return bpe_offsets; -} - -static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::vector bpe_offsets; - - if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { - bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); - } else if ( - regex_expr == 
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || - regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { - - bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + text_data += offset; } return bpe_offsets; @@ -639,288 +834,21 @@ uint32_t unicode_tolower(uint32_t cp) { return it == unicode_map_lowercase.end() ? cp : it->second; } -std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { - // std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ... - // std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. - // std::wregex allows full wchar_t 32 bit codepoints, not limited to standard max 0x110000. - // The main idea is to insert unicode category bits into all regex and text codepoints. - // Max unicode codepoint 0x110000 fits in 21 bits. - // Store unicode category and subcategory in 10 bits. - // Set the high bit to zero to keep wchar_t positive (uint32_t codepoints). - // Categorized codepoint: - // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint - // 0b0'XXXXXXX'xxx'ccccccccccccccccccccc - // A "categorized codepoint" re-defines the ordering keeping category hierarchy. - // All high category codepoints \p{X} fall into the range: - // 0b0'XXXXXXX'000'000000000000000000000 - // 0b0'XXXXXXX'111'111111111111111111111 - // All subcategory codepoints \p{Xx} fall into the range: - // 0b0'XXXXXXX'xxx'000000000000000000000 - // 0b0'XXXXXXX'xxx'111111111111111111111 - // Processing steps: - // Build a lists of "categorized codepoints/ranges" for replacing regex \s \w and \d. - // Replace all regex codepoints/ranges with respective "categorized codepoints/ranges". 
- // Replace all text codepoints with respective "categorized codepoints". - // Caveats: - // Some regex ranges starts and ends with different category/subcategory. - // Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy. - // This forces iterating all ranges and could produce long sub-range sequences. - - //TODO: Regex processing can be cached. - - // insert unicode category and subcategory before codepoint bits - // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits zero - static const auto categorized_prefix = [] (const codepoint_categ categ) -> wchar_t { - static const uint32_t MASK = codepoint_categ::MASK; // category mask - static const uint32_t SUBMASK = codepoint_categ::SUBMASK & ~codepoint_categ::MASK; // subcategory mask - return (wchar_t) (((categ.encoded & MASK) << (21+3)) | ((categ.encoded & SUBMASK) << (21-7))); - }; - - // insert unicode category and subcategory before codepoint bits - // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint - static const auto categorize_codepoint = [] (const uint32_t cpt) -> wchar_t { - GGML_ASSERT(cpt < (1 << 21)); - return categorized_prefix(unicode_cpt_category(cpt)) | (wchar_t)cpt; - }; - - // remove the categorized prefix bits and restore original codepoint bits - static const auto decategorize_codepoint = [] (const wchar_t cpt) -> uint32_t { - return (uint32_t) cpt & ((1 << 21) - 1); - }; - - // returns the respective categorized codepoint range of the category/subcategory - static const auto categorize_range_from_chars = [] (const char categ, const char subcateg) { - const wchar_t range_ini = categorized_prefix(codepoint_categ::from_chars(categ, subcateg)); - const wchar_t range_end = (wchar_t) (range_ini | (subcateg ? 
(1<<21)-1 : (1<<24)-1)); - return std::pair(range_ini, range_end); - }; - - // helper function to append/concat regex expressions - auto wregex_append_subregex = [] (std::wstring & wregex, const std::wstring & subregex, const bool add_squares, const bool negated) { - if (add_squares) { - wregex += '['; - if (negated) { - wregex += '^'; - } - wregex += subregex; - wregex += ']'; - } else { - GGML_ASSERT(!negated); //TODO: negation inside square brackets: \S \W \D - wregex += subregex; - } - }; - - // \d digits replacement - static const std::wstring wregex_digits = { - categorize_codepoint('0'), '-', categorize_codepoint('9'), - }; - - // \w words replacement - static const std::wstring wregex_words = { - categorize_codepoint('_'), - categorize_codepoint('0'), '-', categorize_codepoint('9'), - categorize_codepoint('A'), '-', categorize_codepoint('Z'), - categorize_codepoint('a'), '-', categorize_codepoint('z'), - }; - - // \s whitespaces replacement - static const std::wstring wregex_whitespaces = [] { - std::wstring wregex_whitespaces; - for (const auto & range : unicode_ranges_whitespace) { - wregex_whitespaces += categorize_codepoint(range.first); - if (range.second > range.first) { - wregex_whitespaces += '-'; - wregex_whitespaces += categorize_codepoint(range.second); - } - } - return wregex_whitespaces; - }(); - - GGML_ASSERT(sizeof(wchar_t) == sizeof(uint32_t)); - std::wstring wtext = unicode_wstring_from_utf8(text); - - std::vector offsets = { wtext.size() }; +std::vector unicode_regex_split(const std::string & text_utf8, const std::vector & regex_exprs) { + const std::vector cpts = unicode_cpts_from_utf8(text_utf8); + std::vector offsets = { cpts.size() }; for (auto & regex_expr : regex_exprs) { // first, see if we have an efficient custom regex implementation - auto tmp = unicode_regex_split_custom(text, regex_expr, offsets); + auto tmp = unicode_regex_split_custom(text_utf8, regex_expr, offsets); if (!tmp.empty()) { offsets = std::move(tmp); continue; } - 
std::wstring wregex; - bool inside_square = false; - bool is_cpt_range = false; - - const auto cpts_regex = unicode_cpts_from_utf8(regex_expr); - wregex.reserve(2 * cpts_regex.size()); - - for (size_t i = 0; i < cpts_regex.size(); ++i) { - uint32_t cpt = cpts_regex[i]; - - // parse regex metacharacters - wregex += (wchar_t) cpt; - if (inside_square) { - switch(cpt) { - case '^': - if (cpts_regex[i - 1] != '[') { - break; - } - continue; - case ']': - inside_square = false; - continue; - case '-': - is_cpt_range = true; - continue; - } - } else { - switch(cpt) { - case '^': - if (i > 0) { - break; - } - continue; - case '$': - if (i + 1 < cpts_regex.size()) { - break; - } - continue; - case '[': - inside_square = true; - continue; - case '{': - while (cpt && cpt != '}') { - cpt = cpts_regex[++i]; - wregex += (wchar_t) cpt; - } - continue; - case '}': - case ']': - GGML_ABORT("invalid regex"); - case '(': - if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (? range; - if (cpts_regex[i + 4] == '}') { - range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)'\0'); - i += 4; - } else { - range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); - i += 5; - } - GGML_ASSERT(cpts_regex[i] == '}'); - const std::wstring subregex = {range.first, '-', range.second}; - wregex_append_subregex(wregex, subregex, !inside_square, false); - continue; - } - - // parse more metcharacters and espaped characters - if (cpt == '\\') { - switch (cpts_regex[i + 1]) { - case 's': // \s whitespaces - case 'S': // \S no whitespaces - wregex_append_subregex(wregex, wregex_whitespaces, !inside_square, cpts_regex[++i] == 'S'); - continue; - case 'w': // \w words - case 'W': // \W no words - wregex_append_subregex(wregex, wregex_words, !inside_square, cpts_regex[++i] == 'W'); - continue; - case 'd': // \d digits - case 'D': // \D no digits - wregex_append_subregex(wregex, wregex_digits, !inside_square, cpts_regex[++i] == 'D'); - continue; - case 't': 
++i; cpt = '\t'; break; - case 'r': ++i; cpt = '\r'; break; - case 'n': ++i; cpt = '\n'; break; - case 'x': GGML_ABORT("TODO"); //TODO: hex values - case 'u': GGML_ABORT("TODO"); //TODO: unicode values - case 'U': GGML_ABORT("TODO"); //TODO: unicode values - default: // escaped character - GGML_ASSERT(!is_cpt_range); - cpt = cpts_regex[++i]; - GGML_ASSERT(cpt < 0x80); - break; - } - } - - if (is_cpt_range) { - // Some regex ranges starts and ends with different category/subcategory. - // Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy. - // Warning: This forces iterating all ranges and could produce long sub-range sequences. - GGML_ASSERT(wregex.size() && wregex.back() == '-'); - wregex.pop_back(); - wchar_t categorized = wregex.back(); - uint32_t range_ini = decategorize_codepoint(categorized); - const uint32_t range_end = cpt; - GGML_ASSERT(range_ini <= range_end); - codepoint_categ range_categ = unicode_cpt_category(range_ini); - for (cpt = range_ini + 1; cpt <= range_end; ++cpt) { - codepoint_categ categ = unicode_cpt_category(cpt); - if (categ == range_categ) { // still same range category ? 
- ++categorized; - if (cpt == range_ini + 1) { // single step, no need range - wregex += categorized; - } else if (cpt == range_ini + 2) { // need range if +2 step - wregex.back() = '-'; - wregex += categorized; - } else { - wregex.back() = categorized; // keep range growing - } - } else { // new range category - categorized = categorize_codepoint(cpt); - wregex += categorized; - range_categ = categ; - range_ini = cpt; - } - } - is_cpt_range = false; - } else { - wregex += categorize_codepoint(cpt); - } - } - - // categorize all wtext codepoints - if (wtext.size() && wtext[0] < MAX_CODEPOINTS) { // if not already categorized - for (size_t i = 0; i < wtext.size(); ++i) { - wtext[i] = categorize_codepoint((uint32_t) wtext[i]); - } - } - - offsets = unicode_regex_split_stl(wtext, wregex, offsets); + const auto regex_cpts = unicode_regex_prepare(regex_expr); + offsets = unicode_regex_split_stl(cpts, regex_cpts, offsets); } std::vector bpe_words; @@ -930,8 +858,7 @@ std::vector unicode_regex_split(const std::string & text, const std for (size_t & offset : offsets) { bpe_words.emplace_back(); for (size_t i = start; i < start + offset; ++i) { - const uint32_t cpt = decategorize_codepoint(wtext[i]); - bpe_words.back() += unicode_cpt_to_utf8(cpt); + bpe_words.back() += unicode_cpt_to_utf8(cpts[i]); } start += offset; } diff --git a/src/unicode.h b/src/unicode.h index 3aeb74771eb7f..f2c3e71479975 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -113,6 +113,23 @@ struct codepoint_categ { inline bool is_Zp() const { return (encoded & MASK) == Zp; } inline bool is_Zs() const { return (encoded & MASK) == Zs; } + inline uint64_t expand_bits(const bool add_categ=true) const { // one bit for each category/subcateory and flags + const uint32_t subindex = encoded & SUBMASK; + const uint64_t bits = (encoded & MASK) >> 3; + const uint64_t flags = encoded >> 10; + return (flags << (7 * 8)) | (bits << (7 * subindex)) | (bits * add_categ); + } + + inline bool is_in_range(const 
codepoint_categ other) const { // this.first <= other <= this.last + if (encoded & SUBMASK) { + return encoded == other.encoded; // no range + } + if (encoded & MASK) { + return encoded == (other.encoded & ~SUBMASK); // from 0bffffff'ccccccc'000 to 0bffffff'ccccccc'111 + } + return encoded == (other.encoded & ~MASK); // from 0bffffff'0000000'000 to 0bffffff'1111111'111 + } + inline bool operator == (const codepoint_categ other) const { return encoded == other.encoded; } From 7ff916eae812b1a116fa50a71d61620f77087971 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 17:39:41 +0200 Subject: [PATCH 25/29] Original regex for 'tekken' --- src/llama-vocab.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 6192fd195746f..e4a1cbb29296d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -440,10 +440,8 @@ struct llm_tokenizer_bpe { }; break; case LLAMA_VOCAB_PRE_TYPE_TEKKEN: - // original regex from tokenizer.json - // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" regex_exprs = { - "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; default: From 50e1b1e36d2981d98e335a2065ebf945ee414998 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 19:55:12 +0200 Subject: [PATCH 26/29] Remove unused function --- src/unicode.cpp | 4 ---- 1 file changed, 4 
deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 988dd35e4d3c2..73b757a795514 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -549,10 +549,6 @@ namespace std { static const auto _Ch_alpha = std::ctype::alpha; #endif - static size_type length(const CharT * str) { - return std::char_traits::length(str); - } - CharT translate(CharT c) const { return c; } From dcac74792b036176927c0715f924bf6135620cef Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 19:58:36 +0200 Subject: [PATCH 27/29] Using 32bit wchar_t by default, uint32_t on Windows --- src/unicode.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 73b757a795514..b7c0fc549653c 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -471,8 +471,16 @@ static std::vector unicode_regex_split_custom(const std::string & text, // std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. // std::wregex supports full 32 bit codepoints, not limited to standard max 0x110000. 
namespace std { - using codepoint = uint32_t; // codepoint type for all template specializations +// codepoint type for all template specializations +#if (WCHAR_MAX > 0xFFFF) + using codepoint = wchar_t; // sizeof(wchar_t) == 4 +#else + using codepoint = uint32_t; // Windows: sizeof(wchar_t) == 2 + #define CUSTOM_CTYPE_CODEPOINT +#endif + +#ifdef CUSTOM_CTYPE_CODEPOINT // Minimal required implementation for std::regex string processing template<> // custom specialized std::ctype class ctype { @@ -530,6 +538,7 @@ namespace std { const std::ctype & use_facet>(const std::locale & loc) { return use_facet>(loc); } +#endif // Minimal required implementation for std::regex string processing template<> // custom specialized std::regex_traits From b67c81d1fab3608099e229b916da4dfd2c81d57e Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 20:25:45 +0200 Subject: [PATCH 28/29] Fix previous commit --- src/unicode.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index b7c0fc549653c..2c98676a869b4 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -697,14 +697,17 @@ static std::vector unicode_regex_prepare(const std::string & regex) { // use std::basic_regex to split the text codepoints static std::vector unicode_regex_split_stl(const std::vector & text_cpts, const std::vector & regex_cpts, const std::vector & offsets) { - using regex_type = std::basic_regex; - using iter_type = std::regex_iterator; - regex_type regex(regex_cpts.begin(), regex_cpts.end()); + GGML_ASSERT(sizeof(std::codepoint) == sizeof(uint32_t)); + using regex_type = std::basic_regex; + using iter_type = std::regex_iterator; + + const std::codepoint * text_data = (const std::codepoint *) text_cpts.data(); + const std::codepoint * regex_data = (const std::codepoint *) regex_cpts.data(); + regex_type regex(regex_data, regex_data+regex_cpts.size()); const iter_type end; std::vector bpe_offsets; // store the offset of each word 
bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size - const uint32_t * text_data = text_cpts.data(); for (auto offset : offsets) { iter_type it(text_data, text_data + offset, regex); int64_t start_idx = 0; From db78320b4d20185e1a2155d056d9aa32c93940f8 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 21:19:18 +0200 Subject: [PATCH 29/29] Fix compiler complaints --- src/unicode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 2c98676a869b4..7bd10f50bcf14 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -576,7 +576,7 @@ namespace std { string_type transform_primary(It first, It last) const { (void) first; (void) last; - GGML_ASSERT(*first < MAX_CODEPOINTS); // valid codepoint + GGML_ASSERT((uint32_t) *first < MAX_CODEPOINTS); // check valid codepoint return {}; }