diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index 2d9bde01c3ca7..1528a13db4c80 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -49,53 +49,42 @@ def unicode_data_iter(): yield (cpt, cpt_lower, cpt_upper, categ, bidir) -# see definition in unicode.h -CODEPOINT_FLAG_UNDEFINED = 0x0001 # -CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N} -CODEPOINT_FLAG_LETTER = 0x0004 # \p{L} -CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z} -CODEPOINT_FLAG_MARK = 0x0010 # \p{M} -CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P} -CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S} -CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C} - -UNICODE_CATEGORY_TO_FLAG = { - "Cn": CODEPOINT_FLAG_UNDEFINED, # Undefined - "Cc": CODEPOINT_FLAG_CONTROL, # Control - "Cf": CODEPOINT_FLAG_CONTROL, # Format - "Co": CODEPOINT_FLAG_CONTROL, # Private Use - "Cs": CODEPOINT_FLAG_CONTROL, # Surrrogate - "Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter - "Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter - "Lo": CODEPOINT_FLAG_LETTER, # Other Letter - "Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter - "Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter - "L&": CODEPOINT_FLAG_LETTER, # Cased Letter - "Mc": CODEPOINT_FLAG_MARK, # Spacing Mark - "Me": CODEPOINT_FLAG_MARK, # Enclosing Mark - "Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark - "Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number - "Nl": CODEPOINT_FLAG_NUMBER, # Letter Number - "No": CODEPOINT_FLAG_NUMBER, # Other Number - "Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation - "Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation - "Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation - "Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation - "Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation - "Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation - "Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation - "Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol - "Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol - "Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol - "So": 
CODEPOINT_FLAG_SYMBOL, # Other Symbol - "Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator - "Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator - "Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator +# see codepoint_categ::from_index() in unicode.h +UNICODE_CATEGORY_TO_INDEX = { + "Cn": 0, # \p{Cn} Undefined + "Cc": 1, # \p{Cc} Control + "Cf": 2, # \p{Cf} Format + "Co": 3, # \p{Co} Private Use + "Cs": 4, # \p{Cs} Surrrogate + "Ll": 5, # \p{Ll} Lowercase Letter + "Lm": 6, # \p{Lm} Modifier Letter + "Lo": 7, # \p{Lo} Other Letter + "Lt": 8, # \p{Lt} Titlecase Letter + "Lu": 9, # \p{Lu} Uppercase Letter + "Mc": 10, # \p{Mc} Spacing Mark + "Me": 11, # \p{Me} Enclosing Mark + "Mn": 12, # \p{Mn} Nonspacing Mark + "Nd": 13, # \p{Nd} Decimal Number + "Nl": 14, # \p{Nl} Letter Number + "No": 15, # \p{No} Other Number + "Pc": 16, # \p{Pc} Connector Punctuation + "Pd": 17, # \p{Pd} Dash Punctuation + "Pe": 18, # \p{Pe} Close Punctuation + "Pf": 19, # \p{Pf} Final Punctuation + "Pi": 20, # \p{Pi} Initial Punctuation + "Po": 21, # \p{Po} Other Punctuation + "Ps": 22, # \p{Ps} Open Punctuation + "Sc": 23, # \p{Sc} Currency Symbol + "Sk": 24, # \p{Sk} Modifier Symbol + "Sm": 25, # \p{Sm} Math Symbol + "So": 26, # \p{So} Other Symbol + "Zl": 27, # \p{Zl} Line Separator + "Zp": 28, # \p{Zp} Paragraph Separator + "Zs": 29, # \p{Zs} Space Separator } -codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS -table_whitespace = [] +codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined table_lowercase = [] table_uppercase = [] table_nfd = [] @@ -105,7 +94,7 @@ def unicode_data_iter(): char = chr(cpt) # codepoint category flags - codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ] + codepoint_categs[cpt] = UNICODE_CATEGORY_TO_INDEX[categ] # lowercase conversion if cpt_lower: @@ -121,25 +110,31 @@ def unicode_data_iter(): table_nfd.append((cpt, norm)) -# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt 
-table_whitespace.extend(range(0x0009, 0x000D + 1)) -table_whitespace.extend(range(0x2000, 0x200A + 1)) -table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]) - - # sort by codepoint -table_whitespace.sort() table_lowercase.sort() table_uppercase.sort() table_nfd.sort() -# group ranges with same flags -ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])] # start, flags -for codepoint, flags in enumerate(codepoint_flags): - if flags != ranges_flags[-1][1]: - ranges_flags.append((codepoint, flags)) -ranges_flags.append((MAX_CODEPOINTS, 0x0000)) +# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +whitespace_ranges: list[tuple[int, int]] = [] # start, last +whitespace_ranges.append((0x0009, 0x000D)) +whitespace_ranges.append((0x2000, 0x200A)) +for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]: + whitespace_ranges.append((whitespace, whitespace)) + + +# run length encoding, see unicode_cpt_category() in unicode.cpp +assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32) +codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length +for cpt, categ in enumerate(codepoint_categs[1:], 1): + prev = codepoint_categs_runs[-1] + if prev <= (0xFFFF - 32) and (prev & 31) == categ: + codepoint_categs_runs[-1] += 32 # increment run length + else: + codepoint_categs_runs.append(categ) # new run value + assert (codepoint_categs_runs[-1] < 0xFFFF) +assert (MAX_CODEPOINTS == sum((rle >> 5) + 1 for rle in codepoint_categs_runs)) # group ranges with same nfd @@ -153,7 +148,7 @@ def unicode_data_iter(): # Generate 'unicode-data.cpp': -# python ./scripts//gen-unicode-data.py > unicode-data.cpp +# python ./scripts//gen-unicode-data.py > ./src/unicode-data.cpp def out(line=""): print(line, end='\n') # noqa @@ -167,17 +162,16 @@ def out(line=""): #include #include #include -#include """) -out("const std::vector> unicode_ranges_flags = { // start, flags 
// last=next_start-1") -for codepoint, flags in ranges_flags: - out("{0x%06X, 0x%04X}," % (codepoint, flags)) +out("const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length") +for rle in codepoint_categs_runs: + out("0x%04X," % rle) out("};\n") -out("const std::unordered_set unicode_set_whitespace = {") -for codepoint in table_whitespace: - out("0x%06X," % codepoint) +out("const std::vector> unicode_ranges_whitespace = {") +for (start, last) in whitespace_ranges: + out("{0x%06X, 0x%06X}," % (start, last)) out("};\n") out("const std::unordered_map unicode_map_lowercase = {") diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 133094904c2d2..e4a1cbb29296d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -440,10 +440,8 @@ struct llm_tokenizer_bpe { }; break; case LLAMA_VOCAB_PRE_TYPE_TEKKEN: - // original regex from tokenizer.json - // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" regex_exprs = { - "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; default: @@ -701,22 +699,21 @@ struct llm_tokenizer_wpm { std::vector words(1, ""); for (const uint32_t cpt : cpts_nfd) { - const auto flags = unicode_cpt_flags(cpt); + const auto categ = unicode_cpt_category(cpt); - if (flags.is_whitespace) { + if (categ.is_whitespace()) { if (words.back().size()) { // finish previous word if any 
words.emplace_back(); } continue; } - assert (!flags.is_separator); - if (cpt == 0 || cpt == 0xFFFD || flags.is_control) { + if (cpt == 0 || cpt == 0xFFFD || categ.is_C()) { continue; } const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt)); - if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { + if (categ.is_P() || (cpt < 0x7F && categ.is_S()) || is_chinese_char(cpt)) { if (words.back().size()) { // finish previous word if any words.emplace_back(); } @@ -734,7 +731,7 @@ struct llm_tokenizer_wpm { return words; } - static bool is_chinese_char(uint32_t cpt) { + static bool is_chinese_char(uint32_t cpt) { //TODO: move to unicode-data.cpp? unicode_cpt_category(cpt).is_chinese()? return (cpt >= 0x04E00 && cpt <= 0x09FFF) || (cpt >= 0x03400 && cpt <= 0x04DBF) || diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 02bdf782380fe..1a2ceb01739b8 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -5,2310 +5,4539 @@ #include #include #include -#include -const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1 -{0x000000, 0x0080}, -{0x000020, 0x0008}, -{0x000021, 0x0020}, -{0x000024, 0x0040}, -{0x000025, 0x0020}, -{0x00002B, 0x0040}, -{0x00002C, 0x0020}, -{0x000030, 0x0002}, -{0x00003A, 0x0020}, -{0x00003C, 0x0040}, -{0x00003F, 0x0020}, -{0x000041, 0x0004}, -{0x00005B, 0x0020}, -{0x00005E, 0x0040}, -{0x00005F, 0x0020}, -{0x000060, 0x0040}, -{0x000061, 0x0004}, -{0x00007B, 0x0020}, -{0x00007C, 0x0040}, -{0x00007D, 0x0020}, -{0x00007E, 0x0040}, -{0x00007F, 0x0080}, -{0x0000A0, 0x0008}, -{0x0000A1, 0x0020}, -{0x0000A2, 0x0040}, -{0x0000A7, 0x0020}, -{0x0000A8, 0x0040}, -{0x0000AA, 0x0004}, -{0x0000AB, 0x0020}, -{0x0000AC, 0x0040}, -{0x0000AD, 0x0080}, -{0x0000AE, 0x0040}, -{0x0000B2, 0x0002}, -{0x0000B4, 0x0040}, -{0x0000B5, 0x0004}, -{0x0000B6, 0x0020}, -{0x0000B8, 0x0040}, -{0x0000B9, 0x0002}, -{0x0000BA, 0x0004}, -{0x0000BB, 0x0020}, -{0x0000BC, 0x0002}, -{0x0000BF, 0x0020}, 
-{0x0000C0, 0x0004}, -{0x0000D7, 0x0040}, -{0x0000D8, 0x0004}, -{0x0000F7, 0x0040}, -{0x0000F8, 0x0004}, -{0x0002C2, 0x0040}, -{0x0002C6, 0x0004}, -{0x0002D2, 0x0040}, -{0x0002E0, 0x0004}, -{0x0002E5, 0x0040}, -{0x0002EC, 0x0004}, -{0x0002ED, 0x0040}, -{0x0002EE, 0x0004}, -{0x0002EF, 0x0040}, -{0x000300, 0x0010}, -{0x000370, 0x0004}, -{0x000375, 0x0040}, -{0x000376, 0x0004}, -{0x000378, 0x0001}, -{0x00037A, 0x0004}, -{0x00037E, 0x0020}, -{0x00037F, 0x0004}, -{0x000380, 0x0001}, -{0x000384, 0x0040}, -{0x000386, 0x0004}, -{0x000387, 0x0020}, -{0x000388, 0x0004}, -{0x00038B, 0x0001}, -{0x00038C, 0x0004}, -{0x00038D, 0x0001}, -{0x00038E, 0x0004}, -{0x0003A2, 0x0001}, -{0x0003A3, 0x0004}, -{0x0003F6, 0x0040}, -{0x0003F7, 0x0004}, -{0x000482, 0x0040}, -{0x000483, 0x0010}, -{0x00048A, 0x0004}, -{0x000530, 0x0001}, -{0x000531, 0x0004}, -{0x000557, 0x0001}, -{0x000559, 0x0004}, -{0x00055A, 0x0020}, -{0x000560, 0x0004}, -{0x000589, 0x0020}, -{0x00058B, 0x0001}, -{0x00058D, 0x0040}, -{0x000590, 0x0001}, -{0x000591, 0x0010}, -{0x0005BE, 0x0020}, -{0x0005BF, 0x0010}, -{0x0005C0, 0x0020}, -{0x0005C1, 0x0010}, -{0x0005C3, 0x0020}, -{0x0005C4, 0x0010}, -{0x0005C6, 0x0020}, -{0x0005C7, 0x0010}, -{0x0005C8, 0x0001}, -{0x0005D0, 0x0004}, -{0x0005EB, 0x0001}, -{0x0005EF, 0x0004}, -{0x0005F3, 0x0020}, -{0x0005F5, 0x0001}, -{0x000600, 0x0080}, -{0x000606, 0x0040}, -{0x000609, 0x0020}, -{0x00060B, 0x0040}, -{0x00060C, 0x0020}, -{0x00060E, 0x0040}, -{0x000610, 0x0010}, -{0x00061B, 0x0020}, -{0x00061C, 0x0080}, -{0x00061D, 0x0020}, -{0x000620, 0x0004}, -{0x00064B, 0x0010}, -{0x000660, 0x0002}, -{0x00066A, 0x0020}, -{0x00066E, 0x0004}, -{0x000670, 0x0010}, -{0x000671, 0x0004}, -{0x0006D4, 0x0020}, -{0x0006D5, 0x0004}, -{0x0006D6, 0x0010}, -{0x0006DD, 0x0080}, -{0x0006DE, 0x0040}, -{0x0006DF, 0x0010}, -{0x0006E5, 0x0004}, -{0x0006E7, 0x0010}, -{0x0006E9, 0x0040}, -{0x0006EA, 0x0010}, -{0x0006EE, 0x0004}, -{0x0006F0, 0x0002}, -{0x0006FA, 0x0004}, -{0x0006FD, 0x0040}, -{0x0006FF, 0x0004}, 
-{0x000700, 0x0020}, -{0x00070E, 0x0001}, -{0x00070F, 0x0080}, -{0x000710, 0x0004}, -{0x000711, 0x0010}, -{0x000712, 0x0004}, -{0x000730, 0x0010}, -{0x00074B, 0x0001}, -{0x00074D, 0x0004}, -{0x0007A6, 0x0010}, -{0x0007B1, 0x0004}, -{0x0007B2, 0x0001}, -{0x0007C0, 0x0002}, -{0x0007CA, 0x0004}, -{0x0007EB, 0x0010}, -{0x0007F4, 0x0004}, -{0x0007F6, 0x0040}, -{0x0007F7, 0x0020}, -{0x0007FA, 0x0004}, -{0x0007FB, 0x0001}, -{0x0007FD, 0x0010}, -{0x0007FE, 0x0040}, -{0x000800, 0x0004}, -{0x000816, 0x0010}, -{0x00081A, 0x0004}, -{0x00081B, 0x0010}, -{0x000824, 0x0004}, -{0x000825, 0x0010}, -{0x000828, 0x0004}, -{0x000829, 0x0010}, -{0x00082E, 0x0001}, -{0x000830, 0x0020}, -{0x00083F, 0x0001}, -{0x000840, 0x0004}, -{0x000859, 0x0010}, -{0x00085C, 0x0001}, -{0x00085E, 0x0020}, -{0x00085F, 0x0001}, -{0x000860, 0x0004}, -{0x00086B, 0x0001}, -{0x000870, 0x0004}, -{0x000888, 0x0040}, -{0x000889, 0x0004}, -{0x00088F, 0x0001}, -{0x000890, 0x0080}, -{0x000892, 0x0001}, -{0x000898, 0x0010}, -{0x0008A0, 0x0004}, -{0x0008CA, 0x0010}, -{0x0008E2, 0x0080}, -{0x0008E3, 0x0010}, -{0x000904, 0x0004}, -{0x00093A, 0x0010}, -{0x00093D, 0x0004}, -{0x00093E, 0x0010}, -{0x000950, 0x0004}, -{0x000951, 0x0010}, -{0x000958, 0x0004}, -{0x000962, 0x0010}, -{0x000964, 0x0020}, -{0x000966, 0x0002}, -{0x000970, 0x0020}, -{0x000971, 0x0004}, -{0x000981, 0x0010}, -{0x000984, 0x0001}, -{0x000985, 0x0004}, -{0x00098D, 0x0001}, -{0x00098F, 0x0004}, -{0x000991, 0x0001}, -{0x000993, 0x0004}, -{0x0009A9, 0x0001}, -{0x0009AA, 0x0004}, -{0x0009B1, 0x0001}, -{0x0009B2, 0x0004}, -{0x0009B3, 0x0001}, -{0x0009B6, 0x0004}, -{0x0009BA, 0x0001}, -{0x0009BC, 0x0010}, -{0x0009BD, 0x0004}, -{0x0009BE, 0x0010}, -{0x0009C5, 0x0001}, -{0x0009C7, 0x0010}, -{0x0009C9, 0x0001}, -{0x0009CB, 0x0010}, -{0x0009CE, 0x0004}, -{0x0009CF, 0x0001}, -{0x0009D7, 0x0010}, -{0x0009D8, 0x0001}, -{0x0009DC, 0x0004}, -{0x0009DE, 0x0001}, -{0x0009DF, 0x0004}, -{0x0009E2, 0x0010}, -{0x0009E4, 0x0001}, -{0x0009E6, 0x0002}, -{0x0009F0, 0x0004}, 
-{0x0009F2, 0x0040}, -{0x0009F4, 0x0002}, -{0x0009FA, 0x0040}, -{0x0009FC, 0x0004}, -{0x0009FD, 0x0020}, -{0x0009FE, 0x0010}, -{0x0009FF, 0x0001}, -{0x000A01, 0x0010}, -{0x000A04, 0x0001}, -{0x000A05, 0x0004}, -{0x000A0B, 0x0001}, -{0x000A0F, 0x0004}, -{0x000A11, 0x0001}, -{0x000A13, 0x0004}, -{0x000A29, 0x0001}, -{0x000A2A, 0x0004}, -{0x000A31, 0x0001}, -{0x000A32, 0x0004}, -{0x000A34, 0x0001}, -{0x000A35, 0x0004}, -{0x000A37, 0x0001}, -{0x000A38, 0x0004}, -{0x000A3A, 0x0001}, -{0x000A3C, 0x0010}, -{0x000A3D, 0x0001}, -{0x000A3E, 0x0010}, -{0x000A43, 0x0001}, -{0x000A47, 0x0010}, -{0x000A49, 0x0001}, -{0x000A4B, 0x0010}, -{0x000A4E, 0x0001}, -{0x000A51, 0x0010}, -{0x000A52, 0x0001}, -{0x000A59, 0x0004}, -{0x000A5D, 0x0001}, -{0x000A5E, 0x0004}, -{0x000A5F, 0x0001}, -{0x000A66, 0x0002}, -{0x000A70, 0x0010}, -{0x000A72, 0x0004}, -{0x000A75, 0x0010}, -{0x000A76, 0x0020}, -{0x000A77, 0x0001}, -{0x000A81, 0x0010}, -{0x000A84, 0x0001}, -{0x000A85, 0x0004}, -{0x000A8E, 0x0001}, -{0x000A8F, 0x0004}, -{0x000A92, 0x0001}, -{0x000A93, 0x0004}, -{0x000AA9, 0x0001}, -{0x000AAA, 0x0004}, -{0x000AB1, 0x0001}, -{0x000AB2, 0x0004}, -{0x000AB4, 0x0001}, -{0x000AB5, 0x0004}, -{0x000ABA, 0x0001}, -{0x000ABC, 0x0010}, -{0x000ABD, 0x0004}, -{0x000ABE, 0x0010}, -{0x000AC6, 0x0001}, -{0x000AC7, 0x0010}, -{0x000ACA, 0x0001}, -{0x000ACB, 0x0010}, -{0x000ACE, 0x0001}, -{0x000AD0, 0x0004}, -{0x000AD1, 0x0001}, -{0x000AE0, 0x0004}, -{0x000AE2, 0x0010}, -{0x000AE4, 0x0001}, -{0x000AE6, 0x0002}, -{0x000AF0, 0x0020}, -{0x000AF1, 0x0040}, -{0x000AF2, 0x0001}, -{0x000AF9, 0x0004}, -{0x000AFA, 0x0010}, -{0x000B00, 0x0001}, -{0x000B01, 0x0010}, -{0x000B04, 0x0001}, -{0x000B05, 0x0004}, -{0x000B0D, 0x0001}, -{0x000B0F, 0x0004}, -{0x000B11, 0x0001}, -{0x000B13, 0x0004}, -{0x000B29, 0x0001}, -{0x000B2A, 0x0004}, -{0x000B31, 0x0001}, -{0x000B32, 0x0004}, -{0x000B34, 0x0001}, -{0x000B35, 0x0004}, -{0x000B3A, 0x0001}, -{0x000B3C, 0x0010}, -{0x000B3D, 0x0004}, -{0x000B3E, 0x0010}, -{0x000B45, 0x0001}, 
-{0x000B47, 0x0010}, -{0x000B49, 0x0001}, -{0x000B4B, 0x0010}, -{0x000B4E, 0x0001}, -{0x000B55, 0x0010}, -{0x000B58, 0x0001}, -{0x000B5C, 0x0004}, -{0x000B5E, 0x0001}, -{0x000B5F, 0x0004}, -{0x000B62, 0x0010}, -{0x000B64, 0x0001}, -{0x000B66, 0x0002}, -{0x000B70, 0x0040}, -{0x000B71, 0x0004}, -{0x000B72, 0x0002}, -{0x000B78, 0x0001}, -{0x000B82, 0x0010}, -{0x000B83, 0x0004}, -{0x000B84, 0x0001}, -{0x000B85, 0x0004}, -{0x000B8B, 0x0001}, -{0x000B8E, 0x0004}, -{0x000B91, 0x0001}, -{0x000B92, 0x0004}, -{0x000B96, 0x0001}, -{0x000B99, 0x0004}, -{0x000B9B, 0x0001}, -{0x000B9C, 0x0004}, -{0x000B9D, 0x0001}, -{0x000B9E, 0x0004}, -{0x000BA0, 0x0001}, -{0x000BA3, 0x0004}, -{0x000BA5, 0x0001}, -{0x000BA8, 0x0004}, -{0x000BAB, 0x0001}, -{0x000BAE, 0x0004}, -{0x000BBA, 0x0001}, -{0x000BBE, 0x0010}, -{0x000BC3, 0x0001}, -{0x000BC6, 0x0010}, -{0x000BC9, 0x0001}, -{0x000BCA, 0x0010}, -{0x000BCE, 0x0001}, -{0x000BD0, 0x0004}, -{0x000BD1, 0x0001}, -{0x000BD7, 0x0010}, -{0x000BD8, 0x0001}, -{0x000BE6, 0x0002}, -{0x000BF3, 0x0040}, -{0x000BFB, 0x0001}, -{0x000C00, 0x0010}, -{0x000C05, 0x0004}, -{0x000C0D, 0x0001}, -{0x000C0E, 0x0004}, -{0x000C11, 0x0001}, -{0x000C12, 0x0004}, -{0x000C29, 0x0001}, -{0x000C2A, 0x0004}, -{0x000C3A, 0x0001}, -{0x000C3C, 0x0010}, -{0x000C3D, 0x0004}, -{0x000C3E, 0x0010}, -{0x000C45, 0x0001}, -{0x000C46, 0x0010}, -{0x000C49, 0x0001}, -{0x000C4A, 0x0010}, -{0x000C4E, 0x0001}, -{0x000C55, 0x0010}, -{0x000C57, 0x0001}, -{0x000C58, 0x0004}, -{0x000C5B, 0x0001}, -{0x000C5D, 0x0004}, -{0x000C5E, 0x0001}, -{0x000C60, 0x0004}, -{0x000C62, 0x0010}, -{0x000C64, 0x0001}, -{0x000C66, 0x0002}, -{0x000C70, 0x0001}, -{0x000C77, 0x0020}, -{0x000C78, 0x0002}, -{0x000C7F, 0x0040}, -{0x000C80, 0x0004}, -{0x000C81, 0x0010}, -{0x000C84, 0x0020}, -{0x000C85, 0x0004}, -{0x000C8D, 0x0001}, -{0x000C8E, 0x0004}, -{0x000C91, 0x0001}, -{0x000C92, 0x0004}, -{0x000CA9, 0x0001}, -{0x000CAA, 0x0004}, -{0x000CB4, 0x0001}, -{0x000CB5, 0x0004}, -{0x000CBA, 0x0001}, -{0x000CBC, 0x0010}, 
-{0x000CBD, 0x0004}, -{0x000CBE, 0x0010}, -{0x000CC5, 0x0001}, -{0x000CC6, 0x0010}, -{0x000CC9, 0x0001}, -{0x000CCA, 0x0010}, -{0x000CCE, 0x0001}, -{0x000CD5, 0x0010}, -{0x000CD7, 0x0001}, -{0x000CDD, 0x0004}, -{0x000CDF, 0x0001}, -{0x000CE0, 0x0004}, -{0x000CE2, 0x0010}, -{0x000CE4, 0x0001}, -{0x000CE6, 0x0002}, -{0x000CF0, 0x0001}, -{0x000CF1, 0x0004}, -{0x000CF3, 0x0010}, -{0x000CF4, 0x0001}, -{0x000D00, 0x0010}, -{0x000D04, 0x0004}, -{0x000D0D, 0x0001}, -{0x000D0E, 0x0004}, -{0x000D11, 0x0001}, -{0x000D12, 0x0004}, -{0x000D3B, 0x0010}, -{0x000D3D, 0x0004}, -{0x000D3E, 0x0010}, -{0x000D45, 0x0001}, -{0x000D46, 0x0010}, -{0x000D49, 0x0001}, -{0x000D4A, 0x0010}, -{0x000D4E, 0x0004}, -{0x000D4F, 0x0040}, -{0x000D50, 0x0001}, -{0x000D54, 0x0004}, -{0x000D57, 0x0010}, -{0x000D58, 0x0002}, -{0x000D5F, 0x0004}, -{0x000D62, 0x0010}, -{0x000D64, 0x0001}, -{0x000D66, 0x0002}, -{0x000D79, 0x0040}, -{0x000D7A, 0x0004}, -{0x000D80, 0x0001}, -{0x000D81, 0x0010}, -{0x000D84, 0x0001}, -{0x000D85, 0x0004}, -{0x000D97, 0x0001}, -{0x000D9A, 0x0004}, -{0x000DB2, 0x0001}, -{0x000DB3, 0x0004}, -{0x000DBC, 0x0001}, -{0x000DBD, 0x0004}, -{0x000DBE, 0x0001}, -{0x000DC0, 0x0004}, -{0x000DC7, 0x0001}, -{0x000DCA, 0x0010}, -{0x000DCB, 0x0001}, -{0x000DCF, 0x0010}, -{0x000DD5, 0x0001}, -{0x000DD6, 0x0010}, -{0x000DD7, 0x0001}, -{0x000DD8, 0x0010}, -{0x000DE0, 0x0001}, -{0x000DE6, 0x0002}, -{0x000DF0, 0x0001}, -{0x000DF2, 0x0010}, -{0x000DF4, 0x0020}, -{0x000DF5, 0x0001}, -{0x000E01, 0x0004}, -{0x000E31, 0x0010}, -{0x000E32, 0x0004}, -{0x000E34, 0x0010}, -{0x000E3B, 0x0001}, -{0x000E3F, 0x0040}, -{0x000E40, 0x0004}, -{0x000E47, 0x0010}, -{0x000E4F, 0x0020}, -{0x000E50, 0x0002}, -{0x000E5A, 0x0020}, -{0x000E5C, 0x0001}, -{0x000E81, 0x0004}, -{0x000E83, 0x0001}, -{0x000E84, 0x0004}, -{0x000E85, 0x0001}, -{0x000E86, 0x0004}, -{0x000E8B, 0x0001}, -{0x000E8C, 0x0004}, -{0x000EA4, 0x0001}, -{0x000EA5, 0x0004}, -{0x000EA6, 0x0001}, -{0x000EA7, 0x0004}, -{0x000EB1, 0x0010}, -{0x000EB2, 0x0004}, 
-{0x000EB4, 0x0010}, -{0x000EBD, 0x0004}, -{0x000EBE, 0x0001}, -{0x000EC0, 0x0004}, -{0x000EC5, 0x0001}, -{0x000EC6, 0x0004}, -{0x000EC7, 0x0001}, -{0x000EC8, 0x0010}, -{0x000ECF, 0x0001}, -{0x000ED0, 0x0002}, -{0x000EDA, 0x0001}, -{0x000EDC, 0x0004}, -{0x000EE0, 0x0001}, -{0x000F00, 0x0004}, -{0x000F01, 0x0040}, -{0x000F04, 0x0020}, -{0x000F13, 0x0040}, -{0x000F14, 0x0020}, -{0x000F15, 0x0040}, -{0x000F18, 0x0010}, -{0x000F1A, 0x0040}, -{0x000F20, 0x0002}, -{0x000F34, 0x0040}, -{0x000F35, 0x0010}, -{0x000F36, 0x0040}, -{0x000F37, 0x0010}, -{0x000F38, 0x0040}, -{0x000F39, 0x0010}, -{0x000F3A, 0x0020}, -{0x000F3E, 0x0010}, -{0x000F40, 0x0004}, -{0x000F48, 0x0001}, -{0x000F49, 0x0004}, -{0x000F6D, 0x0001}, -{0x000F71, 0x0010}, -{0x000F85, 0x0020}, -{0x000F86, 0x0010}, -{0x000F88, 0x0004}, -{0x000F8D, 0x0010}, -{0x000F98, 0x0001}, -{0x000F99, 0x0010}, -{0x000FBD, 0x0001}, -{0x000FBE, 0x0040}, -{0x000FC6, 0x0010}, -{0x000FC7, 0x0040}, -{0x000FCD, 0x0001}, -{0x000FCE, 0x0040}, -{0x000FD0, 0x0020}, -{0x000FD5, 0x0040}, -{0x000FD9, 0x0020}, -{0x000FDB, 0x0001}, -{0x001000, 0x0004}, -{0x00102B, 0x0010}, -{0x00103F, 0x0004}, -{0x001040, 0x0002}, -{0x00104A, 0x0020}, -{0x001050, 0x0004}, -{0x001056, 0x0010}, -{0x00105A, 0x0004}, -{0x00105E, 0x0010}, -{0x001061, 0x0004}, -{0x001062, 0x0010}, -{0x001065, 0x0004}, -{0x001067, 0x0010}, -{0x00106E, 0x0004}, -{0x001071, 0x0010}, -{0x001075, 0x0004}, -{0x001082, 0x0010}, -{0x00108E, 0x0004}, -{0x00108F, 0x0010}, -{0x001090, 0x0002}, -{0x00109A, 0x0010}, -{0x00109E, 0x0040}, -{0x0010A0, 0x0004}, -{0x0010C6, 0x0001}, -{0x0010C7, 0x0004}, -{0x0010C8, 0x0001}, -{0x0010CD, 0x0004}, -{0x0010CE, 0x0001}, -{0x0010D0, 0x0004}, -{0x0010FB, 0x0020}, -{0x0010FC, 0x0004}, -{0x001249, 0x0001}, -{0x00124A, 0x0004}, -{0x00124E, 0x0001}, -{0x001250, 0x0004}, -{0x001257, 0x0001}, -{0x001258, 0x0004}, -{0x001259, 0x0001}, -{0x00125A, 0x0004}, -{0x00125E, 0x0001}, -{0x001260, 0x0004}, -{0x001289, 0x0001}, -{0x00128A, 0x0004}, -{0x00128E, 0x0001}, 
-{0x001290, 0x0004}, -{0x0012B1, 0x0001}, -{0x0012B2, 0x0004}, -{0x0012B6, 0x0001}, -{0x0012B8, 0x0004}, -{0x0012BF, 0x0001}, -{0x0012C0, 0x0004}, -{0x0012C1, 0x0001}, -{0x0012C2, 0x0004}, -{0x0012C6, 0x0001}, -{0x0012C8, 0x0004}, -{0x0012D7, 0x0001}, -{0x0012D8, 0x0004}, -{0x001311, 0x0001}, -{0x001312, 0x0004}, -{0x001316, 0x0001}, -{0x001318, 0x0004}, -{0x00135B, 0x0001}, -{0x00135D, 0x0010}, -{0x001360, 0x0020}, -{0x001369, 0x0002}, -{0x00137D, 0x0001}, -{0x001380, 0x0004}, -{0x001390, 0x0040}, -{0x00139A, 0x0001}, -{0x0013A0, 0x0004}, -{0x0013F6, 0x0001}, -{0x0013F8, 0x0004}, -{0x0013FE, 0x0001}, -{0x001400, 0x0020}, -{0x001401, 0x0004}, -{0x00166D, 0x0040}, -{0x00166E, 0x0020}, -{0x00166F, 0x0004}, -{0x001680, 0x0008}, -{0x001681, 0x0004}, -{0x00169B, 0x0020}, -{0x00169D, 0x0001}, -{0x0016A0, 0x0004}, -{0x0016EB, 0x0020}, -{0x0016EE, 0x0002}, -{0x0016F1, 0x0004}, -{0x0016F9, 0x0001}, -{0x001700, 0x0004}, -{0x001712, 0x0010}, -{0x001716, 0x0001}, -{0x00171F, 0x0004}, -{0x001732, 0x0010}, -{0x001735, 0x0020}, -{0x001737, 0x0001}, -{0x001740, 0x0004}, -{0x001752, 0x0010}, -{0x001754, 0x0001}, -{0x001760, 0x0004}, -{0x00176D, 0x0001}, -{0x00176E, 0x0004}, -{0x001771, 0x0001}, -{0x001772, 0x0010}, -{0x001774, 0x0001}, -{0x001780, 0x0004}, -{0x0017B4, 0x0010}, -{0x0017D4, 0x0020}, -{0x0017D7, 0x0004}, -{0x0017D8, 0x0020}, -{0x0017DB, 0x0040}, -{0x0017DC, 0x0004}, -{0x0017DD, 0x0010}, -{0x0017DE, 0x0001}, -{0x0017E0, 0x0002}, -{0x0017EA, 0x0001}, -{0x0017F0, 0x0002}, -{0x0017FA, 0x0001}, -{0x001800, 0x0020}, -{0x00180B, 0x0010}, -{0x00180E, 0x0080}, -{0x00180F, 0x0010}, -{0x001810, 0x0002}, -{0x00181A, 0x0001}, -{0x001820, 0x0004}, -{0x001879, 0x0001}, -{0x001880, 0x0004}, -{0x001885, 0x0010}, -{0x001887, 0x0004}, -{0x0018A9, 0x0010}, -{0x0018AA, 0x0004}, -{0x0018AB, 0x0001}, -{0x0018B0, 0x0004}, -{0x0018F6, 0x0001}, -{0x001900, 0x0004}, -{0x00191F, 0x0001}, -{0x001920, 0x0010}, -{0x00192C, 0x0001}, -{0x001930, 0x0010}, -{0x00193C, 0x0001}, -{0x001940, 0x0040}, 
-{0x001941, 0x0001}, -{0x001944, 0x0020}, -{0x001946, 0x0002}, -{0x001950, 0x0004}, -{0x00196E, 0x0001}, -{0x001970, 0x0004}, -{0x001975, 0x0001}, -{0x001980, 0x0004}, -{0x0019AC, 0x0001}, -{0x0019B0, 0x0004}, -{0x0019CA, 0x0001}, -{0x0019D0, 0x0002}, -{0x0019DB, 0x0001}, -{0x0019DE, 0x0040}, -{0x001A00, 0x0004}, -{0x001A17, 0x0010}, -{0x001A1C, 0x0001}, -{0x001A1E, 0x0020}, -{0x001A20, 0x0004}, -{0x001A55, 0x0010}, -{0x001A5F, 0x0001}, -{0x001A60, 0x0010}, -{0x001A7D, 0x0001}, -{0x001A7F, 0x0010}, -{0x001A80, 0x0002}, -{0x001A8A, 0x0001}, -{0x001A90, 0x0002}, -{0x001A9A, 0x0001}, -{0x001AA0, 0x0020}, -{0x001AA7, 0x0004}, -{0x001AA8, 0x0020}, -{0x001AAE, 0x0001}, -{0x001AB0, 0x0010}, -{0x001ACF, 0x0001}, -{0x001B00, 0x0010}, -{0x001B05, 0x0004}, -{0x001B34, 0x0010}, -{0x001B45, 0x0004}, -{0x001B4D, 0x0001}, -{0x001B50, 0x0002}, -{0x001B5A, 0x0020}, -{0x001B61, 0x0040}, -{0x001B6B, 0x0010}, -{0x001B74, 0x0040}, -{0x001B7D, 0x0020}, -{0x001B7F, 0x0001}, -{0x001B80, 0x0010}, -{0x001B83, 0x0004}, -{0x001BA1, 0x0010}, -{0x001BAE, 0x0004}, -{0x001BB0, 0x0002}, -{0x001BBA, 0x0004}, -{0x001BE6, 0x0010}, -{0x001BF4, 0x0001}, -{0x001BFC, 0x0020}, -{0x001C00, 0x0004}, -{0x001C24, 0x0010}, -{0x001C38, 0x0001}, -{0x001C3B, 0x0020}, -{0x001C40, 0x0002}, -{0x001C4A, 0x0001}, -{0x001C4D, 0x0004}, -{0x001C50, 0x0002}, -{0x001C5A, 0x0004}, -{0x001C7E, 0x0020}, -{0x001C80, 0x0004}, -{0x001C89, 0x0001}, -{0x001C90, 0x0004}, -{0x001CBB, 0x0001}, -{0x001CBD, 0x0004}, -{0x001CC0, 0x0020}, -{0x001CC8, 0x0001}, -{0x001CD0, 0x0010}, -{0x001CD3, 0x0020}, -{0x001CD4, 0x0010}, -{0x001CE9, 0x0004}, -{0x001CED, 0x0010}, -{0x001CEE, 0x0004}, -{0x001CF4, 0x0010}, -{0x001CF5, 0x0004}, -{0x001CF7, 0x0010}, -{0x001CFA, 0x0004}, -{0x001CFB, 0x0001}, -{0x001D00, 0x0004}, -{0x001DC0, 0x0010}, -{0x001E00, 0x0004}, -{0x001F16, 0x0001}, -{0x001F18, 0x0004}, -{0x001F1E, 0x0001}, -{0x001F20, 0x0004}, -{0x001F46, 0x0001}, -{0x001F48, 0x0004}, -{0x001F4E, 0x0001}, -{0x001F50, 0x0004}, -{0x001F58, 0x0001}, 
-{0x001F59, 0x0004}, -{0x001F5A, 0x0001}, -{0x001F5B, 0x0004}, -{0x001F5C, 0x0001}, -{0x001F5D, 0x0004}, -{0x001F5E, 0x0001}, -{0x001F5F, 0x0004}, -{0x001F7E, 0x0001}, -{0x001F80, 0x0004}, -{0x001FB5, 0x0001}, -{0x001FB6, 0x0004}, -{0x001FBD, 0x0040}, -{0x001FBE, 0x0004}, -{0x001FBF, 0x0040}, -{0x001FC2, 0x0004}, -{0x001FC5, 0x0001}, -{0x001FC6, 0x0004}, -{0x001FCD, 0x0040}, -{0x001FD0, 0x0004}, -{0x001FD4, 0x0001}, -{0x001FD6, 0x0004}, -{0x001FDC, 0x0001}, -{0x001FDD, 0x0040}, -{0x001FE0, 0x0004}, -{0x001FED, 0x0040}, -{0x001FF0, 0x0001}, -{0x001FF2, 0x0004}, -{0x001FF5, 0x0001}, -{0x001FF6, 0x0004}, -{0x001FFD, 0x0040}, -{0x001FFF, 0x0001}, -{0x002000, 0x0008}, -{0x00200B, 0x0080}, -{0x002010, 0x0020}, -{0x002028, 0x0008}, -{0x00202A, 0x0080}, -{0x00202F, 0x0008}, -{0x002030, 0x0020}, -{0x002044, 0x0040}, -{0x002045, 0x0020}, -{0x002052, 0x0040}, -{0x002053, 0x0020}, -{0x00205F, 0x0008}, -{0x002060, 0x0080}, -{0x002065, 0x0001}, -{0x002066, 0x0080}, -{0x002070, 0x0002}, -{0x002071, 0x0004}, -{0x002072, 0x0001}, -{0x002074, 0x0002}, -{0x00207A, 0x0040}, -{0x00207D, 0x0020}, -{0x00207F, 0x0004}, -{0x002080, 0x0002}, -{0x00208A, 0x0040}, -{0x00208D, 0x0020}, -{0x00208F, 0x0001}, -{0x002090, 0x0004}, -{0x00209D, 0x0001}, -{0x0020A0, 0x0040}, -{0x0020C1, 0x0001}, -{0x0020D0, 0x0010}, -{0x0020F1, 0x0001}, -{0x002100, 0x0040}, -{0x002102, 0x0004}, -{0x002103, 0x0040}, -{0x002107, 0x0004}, -{0x002108, 0x0040}, -{0x00210A, 0x0004}, -{0x002114, 0x0040}, -{0x002115, 0x0004}, -{0x002116, 0x0040}, -{0x002119, 0x0004}, -{0x00211E, 0x0040}, -{0x002124, 0x0004}, -{0x002125, 0x0040}, -{0x002126, 0x0004}, -{0x002127, 0x0040}, -{0x002128, 0x0004}, -{0x002129, 0x0040}, -{0x00212A, 0x0004}, -{0x00212E, 0x0040}, -{0x00212F, 0x0004}, -{0x00213A, 0x0040}, -{0x00213C, 0x0004}, -{0x002140, 0x0040}, -{0x002145, 0x0004}, -{0x00214A, 0x0040}, -{0x00214E, 0x0004}, -{0x00214F, 0x0040}, -{0x002150, 0x0002}, -{0x002183, 0x0004}, -{0x002185, 0x0002}, -{0x00218A, 0x0040}, -{0x00218C, 0x0001}, 
-{0x002190, 0x0040}, -{0x002308, 0x0020}, -{0x00230C, 0x0040}, -{0x002329, 0x0020}, -{0x00232B, 0x0040}, -{0x002427, 0x0001}, -{0x002440, 0x0040}, -{0x00244B, 0x0001}, -{0x002460, 0x0002}, -{0x00249C, 0x0040}, -{0x0024EA, 0x0002}, -{0x002500, 0x0040}, -{0x002768, 0x0020}, -{0x002776, 0x0002}, -{0x002794, 0x0040}, -{0x0027C5, 0x0020}, -{0x0027C7, 0x0040}, -{0x0027E6, 0x0020}, -{0x0027F0, 0x0040}, -{0x002983, 0x0020}, -{0x002999, 0x0040}, -{0x0029D8, 0x0020}, -{0x0029DC, 0x0040}, -{0x0029FC, 0x0020}, -{0x0029FE, 0x0040}, -{0x002B74, 0x0001}, -{0x002B76, 0x0040}, -{0x002B96, 0x0001}, -{0x002B97, 0x0040}, -{0x002C00, 0x0004}, -{0x002CE5, 0x0040}, -{0x002CEB, 0x0004}, -{0x002CEF, 0x0010}, -{0x002CF2, 0x0004}, -{0x002CF4, 0x0001}, -{0x002CF9, 0x0020}, -{0x002CFD, 0x0002}, -{0x002CFE, 0x0020}, -{0x002D00, 0x0004}, -{0x002D26, 0x0001}, -{0x002D27, 0x0004}, -{0x002D28, 0x0001}, -{0x002D2D, 0x0004}, -{0x002D2E, 0x0001}, -{0x002D30, 0x0004}, -{0x002D68, 0x0001}, -{0x002D6F, 0x0004}, -{0x002D70, 0x0020}, -{0x002D71, 0x0001}, -{0x002D7F, 0x0010}, -{0x002D80, 0x0004}, -{0x002D97, 0x0001}, -{0x002DA0, 0x0004}, -{0x002DA7, 0x0001}, -{0x002DA8, 0x0004}, -{0x002DAF, 0x0001}, -{0x002DB0, 0x0004}, -{0x002DB7, 0x0001}, -{0x002DB8, 0x0004}, -{0x002DBF, 0x0001}, -{0x002DC0, 0x0004}, -{0x002DC7, 0x0001}, -{0x002DC8, 0x0004}, -{0x002DCF, 0x0001}, -{0x002DD0, 0x0004}, -{0x002DD7, 0x0001}, -{0x002DD8, 0x0004}, -{0x002DDF, 0x0001}, -{0x002DE0, 0x0010}, -{0x002E00, 0x0020}, -{0x002E2F, 0x0004}, -{0x002E30, 0x0020}, -{0x002E50, 0x0040}, -{0x002E52, 0x0020}, -{0x002E5E, 0x0001}, -{0x002E80, 0x0040}, -{0x002E9A, 0x0001}, -{0x002E9B, 0x0040}, -{0x002EF4, 0x0001}, -{0x002F00, 0x0040}, -{0x002FD6, 0x0001}, -{0x002FF0, 0x0040}, -{0x003000, 0x0008}, -{0x003001, 0x0020}, -{0x003004, 0x0040}, -{0x003005, 0x0004}, -{0x003007, 0x0002}, -{0x003008, 0x0020}, -{0x003012, 0x0040}, -{0x003014, 0x0020}, -{0x003020, 0x0040}, -{0x003021, 0x0002}, -{0x00302A, 0x0010}, -{0x003030, 0x0020}, -{0x003031, 0x0004}, 
-{0x003036, 0x0040}, -{0x003038, 0x0002}, -{0x00303B, 0x0004}, -{0x00303D, 0x0020}, -{0x00303E, 0x0040}, -{0x003040, 0x0001}, -{0x003041, 0x0004}, -{0x003097, 0x0001}, -{0x003099, 0x0010}, -{0x00309B, 0x0040}, -{0x00309D, 0x0004}, -{0x0030A0, 0x0020}, -{0x0030A1, 0x0004}, -{0x0030FB, 0x0020}, -{0x0030FC, 0x0004}, -{0x003100, 0x0001}, -{0x003105, 0x0004}, -{0x003130, 0x0001}, -{0x003131, 0x0004}, -{0x00318F, 0x0001}, -{0x003190, 0x0040}, -{0x003192, 0x0002}, -{0x003196, 0x0040}, -{0x0031A0, 0x0004}, -{0x0031C0, 0x0040}, -{0x0031E4, 0x0001}, -{0x0031EF, 0x0040}, -{0x0031F0, 0x0004}, -{0x003200, 0x0040}, -{0x00321F, 0x0001}, -{0x003220, 0x0002}, -{0x00322A, 0x0040}, -{0x003248, 0x0002}, -{0x003250, 0x0040}, -{0x003251, 0x0002}, -{0x003260, 0x0040}, -{0x003280, 0x0002}, -{0x00328A, 0x0040}, -{0x0032B1, 0x0002}, -{0x0032C0, 0x0040}, -{0x003400, 0x0004}, -{0x004DC0, 0x0040}, -{0x004E00, 0x0004}, -{0x00A48D, 0x0001}, -{0x00A490, 0x0040}, -{0x00A4C7, 0x0001}, -{0x00A4D0, 0x0004}, -{0x00A4FE, 0x0020}, -{0x00A500, 0x0004}, -{0x00A60D, 0x0020}, -{0x00A610, 0x0004}, -{0x00A620, 0x0002}, -{0x00A62A, 0x0004}, -{0x00A62C, 0x0001}, -{0x00A640, 0x0004}, -{0x00A66F, 0x0010}, -{0x00A673, 0x0020}, -{0x00A674, 0x0010}, -{0x00A67E, 0x0020}, -{0x00A67F, 0x0004}, -{0x00A69E, 0x0010}, -{0x00A6A0, 0x0004}, -{0x00A6E6, 0x0002}, -{0x00A6F0, 0x0010}, -{0x00A6F2, 0x0020}, -{0x00A6F8, 0x0001}, -{0x00A700, 0x0040}, -{0x00A717, 0x0004}, -{0x00A720, 0x0040}, -{0x00A722, 0x0004}, -{0x00A789, 0x0040}, -{0x00A78B, 0x0004}, -{0x00A7CB, 0x0001}, -{0x00A7D0, 0x0004}, -{0x00A7D2, 0x0001}, -{0x00A7D3, 0x0004}, -{0x00A7D4, 0x0001}, -{0x00A7D5, 0x0004}, -{0x00A7DA, 0x0001}, -{0x00A7F2, 0x0004}, -{0x00A802, 0x0010}, -{0x00A803, 0x0004}, -{0x00A806, 0x0010}, -{0x00A807, 0x0004}, -{0x00A80B, 0x0010}, -{0x00A80C, 0x0004}, -{0x00A823, 0x0010}, -{0x00A828, 0x0040}, -{0x00A82C, 0x0010}, -{0x00A82D, 0x0001}, -{0x00A830, 0x0002}, -{0x00A836, 0x0040}, -{0x00A83A, 0x0001}, -{0x00A840, 0x0004}, -{0x00A874, 0x0020}, 
-{0x00A878, 0x0001}, -{0x00A880, 0x0010}, -{0x00A882, 0x0004}, -{0x00A8B4, 0x0010}, -{0x00A8C6, 0x0001}, -{0x00A8CE, 0x0020}, -{0x00A8D0, 0x0002}, -{0x00A8DA, 0x0001}, -{0x00A8E0, 0x0010}, -{0x00A8F2, 0x0004}, -{0x00A8F8, 0x0020}, -{0x00A8FB, 0x0004}, -{0x00A8FC, 0x0020}, -{0x00A8FD, 0x0004}, -{0x00A8FF, 0x0010}, -{0x00A900, 0x0002}, -{0x00A90A, 0x0004}, -{0x00A926, 0x0010}, -{0x00A92E, 0x0020}, -{0x00A930, 0x0004}, -{0x00A947, 0x0010}, -{0x00A954, 0x0001}, -{0x00A95F, 0x0020}, -{0x00A960, 0x0004}, -{0x00A97D, 0x0001}, -{0x00A980, 0x0010}, -{0x00A984, 0x0004}, -{0x00A9B3, 0x0010}, -{0x00A9C1, 0x0020}, -{0x00A9CE, 0x0001}, -{0x00A9CF, 0x0004}, -{0x00A9D0, 0x0002}, -{0x00A9DA, 0x0001}, -{0x00A9DE, 0x0020}, -{0x00A9E0, 0x0004}, -{0x00A9E5, 0x0010}, -{0x00A9E6, 0x0004}, -{0x00A9F0, 0x0002}, -{0x00A9FA, 0x0004}, -{0x00A9FF, 0x0001}, -{0x00AA00, 0x0004}, -{0x00AA29, 0x0010}, -{0x00AA37, 0x0001}, -{0x00AA40, 0x0004}, -{0x00AA43, 0x0010}, -{0x00AA44, 0x0004}, -{0x00AA4C, 0x0010}, -{0x00AA4E, 0x0001}, -{0x00AA50, 0x0002}, -{0x00AA5A, 0x0001}, -{0x00AA5C, 0x0020}, -{0x00AA60, 0x0004}, -{0x00AA77, 0x0040}, -{0x00AA7A, 0x0004}, -{0x00AA7B, 0x0010}, -{0x00AA7E, 0x0004}, -{0x00AAB0, 0x0010}, -{0x00AAB1, 0x0004}, -{0x00AAB2, 0x0010}, -{0x00AAB5, 0x0004}, -{0x00AAB7, 0x0010}, -{0x00AAB9, 0x0004}, -{0x00AABE, 0x0010}, -{0x00AAC0, 0x0004}, -{0x00AAC1, 0x0010}, -{0x00AAC2, 0x0004}, -{0x00AAC3, 0x0001}, -{0x00AADB, 0x0004}, -{0x00AADE, 0x0020}, -{0x00AAE0, 0x0004}, -{0x00AAEB, 0x0010}, -{0x00AAF0, 0x0020}, -{0x00AAF2, 0x0004}, -{0x00AAF5, 0x0010}, -{0x00AAF7, 0x0001}, -{0x00AB01, 0x0004}, -{0x00AB07, 0x0001}, -{0x00AB09, 0x0004}, -{0x00AB0F, 0x0001}, -{0x00AB11, 0x0004}, -{0x00AB17, 0x0001}, -{0x00AB20, 0x0004}, -{0x00AB27, 0x0001}, -{0x00AB28, 0x0004}, -{0x00AB2F, 0x0001}, -{0x00AB30, 0x0004}, -{0x00AB5B, 0x0040}, -{0x00AB5C, 0x0004}, -{0x00AB6A, 0x0040}, -{0x00AB6C, 0x0001}, -{0x00AB70, 0x0004}, -{0x00ABE3, 0x0010}, -{0x00ABEB, 0x0020}, -{0x00ABEC, 0x0010}, -{0x00ABEE, 0x0001}, 
-{0x00ABF0, 0x0002}, -{0x00ABFA, 0x0001}, -{0x00AC00, 0x0004}, -{0x00D7A4, 0x0001}, -{0x00D7B0, 0x0004}, -{0x00D7C7, 0x0001}, -{0x00D7CB, 0x0004}, -{0x00D7FC, 0x0001}, -{0x00D800, 0x0080}, -{0x00F900, 0x0004}, -{0x00FA6E, 0x0001}, -{0x00FA70, 0x0004}, -{0x00FADA, 0x0001}, -{0x00FB00, 0x0004}, -{0x00FB07, 0x0001}, -{0x00FB13, 0x0004}, -{0x00FB18, 0x0001}, -{0x00FB1D, 0x0004}, -{0x00FB1E, 0x0010}, -{0x00FB1F, 0x0004}, -{0x00FB29, 0x0040}, -{0x00FB2A, 0x0004}, -{0x00FB37, 0x0001}, -{0x00FB38, 0x0004}, -{0x00FB3D, 0x0001}, -{0x00FB3E, 0x0004}, -{0x00FB3F, 0x0001}, -{0x00FB40, 0x0004}, -{0x00FB42, 0x0001}, -{0x00FB43, 0x0004}, -{0x00FB45, 0x0001}, -{0x00FB46, 0x0004}, -{0x00FBB2, 0x0040}, -{0x00FBC3, 0x0001}, -{0x00FBD3, 0x0004}, -{0x00FD3E, 0x0020}, -{0x00FD40, 0x0040}, -{0x00FD50, 0x0004}, -{0x00FD90, 0x0001}, -{0x00FD92, 0x0004}, -{0x00FDC8, 0x0001}, -{0x00FDCF, 0x0040}, -{0x00FDD0, 0x0001}, -{0x00FDF0, 0x0004}, -{0x00FDFC, 0x0040}, -{0x00FE00, 0x0010}, -{0x00FE10, 0x0020}, -{0x00FE1A, 0x0001}, -{0x00FE20, 0x0010}, -{0x00FE30, 0x0020}, -{0x00FE53, 0x0001}, -{0x00FE54, 0x0020}, -{0x00FE62, 0x0040}, -{0x00FE63, 0x0020}, -{0x00FE64, 0x0040}, -{0x00FE67, 0x0001}, -{0x00FE68, 0x0020}, -{0x00FE69, 0x0040}, -{0x00FE6A, 0x0020}, -{0x00FE6C, 0x0001}, -{0x00FE70, 0x0004}, -{0x00FE75, 0x0001}, -{0x00FE76, 0x0004}, -{0x00FEFD, 0x0001}, -{0x00FEFF, 0x0080}, -{0x00FF00, 0x0001}, -{0x00FF01, 0x0020}, -{0x00FF04, 0x0040}, -{0x00FF05, 0x0020}, -{0x00FF0B, 0x0040}, -{0x00FF0C, 0x0020}, -{0x00FF10, 0x0002}, -{0x00FF1A, 0x0020}, -{0x00FF1C, 0x0040}, -{0x00FF1F, 0x0020}, -{0x00FF21, 0x0004}, -{0x00FF3B, 0x0020}, -{0x00FF3E, 0x0040}, -{0x00FF3F, 0x0020}, -{0x00FF40, 0x0040}, -{0x00FF41, 0x0004}, -{0x00FF5B, 0x0020}, -{0x00FF5C, 0x0040}, -{0x00FF5D, 0x0020}, -{0x00FF5E, 0x0040}, -{0x00FF5F, 0x0020}, -{0x00FF66, 0x0004}, -{0x00FFBF, 0x0001}, -{0x00FFC2, 0x0004}, -{0x00FFC8, 0x0001}, -{0x00FFCA, 0x0004}, -{0x00FFD0, 0x0001}, -{0x00FFD2, 0x0004}, -{0x00FFD8, 0x0001}, -{0x00FFDA, 0x0004}, 
-{0x00FFDD, 0x0001}, -{0x00FFE0, 0x0040}, -{0x00FFE7, 0x0001}, -{0x00FFE8, 0x0040}, -{0x00FFEF, 0x0001}, -{0x00FFF9, 0x0080}, -{0x00FFFC, 0x0040}, -{0x00FFFE, 0x0001}, -{0x010000, 0x0004}, -{0x01000C, 0x0001}, -{0x01000D, 0x0004}, -{0x010027, 0x0001}, -{0x010028, 0x0004}, -{0x01003B, 0x0001}, -{0x01003C, 0x0004}, -{0x01003E, 0x0001}, -{0x01003F, 0x0004}, -{0x01004E, 0x0001}, -{0x010050, 0x0004}, -{0x01005E, 0x0001}, -{0x010080, 0x0004}, -{0x0100FB, 0x0001}, -{0x010100, 0x0020}, -{0x010103, 0x0001}, -{0x010107, 0x0002}, -{0x010134, 0x0001}, -{0x010137, 0x0040}, -{0x010140, 0x0002}, -{0x010179, 0x0040}, -{0x01018A, 0x0002}, -{0x01018C, 0x0040}, -{0x01018F, 0x0001}, -{0x010190, 0x0040}, -{0x01019D, 0x0001}, -{0x0101A0, 0x0040}, -{0x0101A1, 0x0001}, -{0x0101D0, 0x0040}, -{0x0101FD, 0x0010}, -{0x0101FE, 0x0001}, -{0x010280, 0x0004}, -{0x01029D, 0x0001}, -{0x0102A0, 0x0004}, -{0x0102D1, 0x0001}, -{0x0102E0, 0x0010}, -{0x0102E1, 0x0002}, -{0x0102FC, 0x0001}, -{0x010300, 0x0004}, -{0x010320, 0x0002}, -{0x010324, 0x0001}, -{0x01032D, 0x0004}, -{0x010341, 0x0002}, -{0x010342, 0x0004}, -{0x01034A, 0x0002}, -{0x01034B, 0x0001}, -{0x010350, 0x0004}, -{0x010376, 0x0010}, -{0x01037B, 0x0001}, -{0x010380, 0x0004}, -{0x01039E, 0x0001}, -{0x01039F, 0x0020}, -{0x0103A0, 0x0004}, -{0x0103C4, 0x0001}, -{0x0103C8, 0x0004}, -{0x0103D0, 0x0020}, -{0x0103D1, 0x0002}, -{0x0103D6, 0x0001}, -{0x010400, 0x0004}, -{0x01049E, 0x0001}, -{0x0104A0, 0x0002}, -{0x0104AA, 0x0001}, -{0x0104B0, 0x0004}, -{0x0104D4, 0x0001}, -{0x0104D8, 0x0004}, -{0x0104FC, 0x0001}, -{0x010500, 0x0004}, -{0x010528, 0x0001}, -{0x010530, 0x0004}, -{0x010564, 0x0001}, -{0x01056F, 0x0020}, -{0x010570, 0x0004}, -{0x01057B, 0x0001}, -{0x01057C, 0x0004}, -{0x01058B, 0x0001}, -{0x01058C, 0x0004}, -{0x010593, 0x0001}, -{0x010594, 0x0004}, -{0x010596, 0x0001}, -{0x010597, 0x0004}, -{0x0105A2, 0x0001}, -{0x0105A3, 0x0004}, -{0x0105B2, 0x0001}, -{0x0105B3, 0x0004}, -{0x0105BA, 0x0001}, -{0x0105BB, 0x0004}, -{0x0105BD, 0x0001}, 
-{0x010600, 0x0004}, -{0x010737, 0x0001}, -{0x010740, 0x0004}, -{0x010756, 0x0001}, -{0x010760, 0x0004}, -{0x010768, 0x0001}, -{0x010780, 0x0004}, -{0x010786, 0x0001}, -{0x010787, 0x0004}, -{0x0107B1, 0x0001}, -{0x0107B2, 0x0004}, -{0x0107BB, 0x0001}, -{0x010800, 0x0004}, -{0x010806, 0x0001}, -{0x010808, 0x0004}, -{0x010809, 0x0001}, -{0x01080A, 0x0004}, -{0x010836, 0x0001}, -{0x010837, 0x0004}, -{0x010839, 0x0001}, -{0x01083C, 0x0004}, -{0x01083D, 0x0001}, -{0x01083F, 0x0004}, -{0x010856, 0x0001}, -{0x010857, 0x0020}, -{0x010858, 0x0002}, -{0x010860, 0x0004}, -{0x010877, 0x0040}, -{0x010879, 0x0002}, -{0x010880, 0x0004}, -{0x01089F, 0x0001}, -{0x0108A7, 0x0002}, -{0x0108B0, 0x0001}, -{0x0108E0, 0x0004}, -{0x0108F3, 0x0001}, -{0x0108F4, 0x0004}, -{0x0108F6, 0x0001}, -{0x0108FB, 0x0002}, -{0x010900, 0x0004}, -{0x010916, 0x0002}, -{0x01091C, 0x0001}, -{0x01091F, 0x0020}, -{0x010920, 0x0004}, -{0x01093A, 0x0001}, -{0x01093F, 0x0020}, -{0x010940, 0x0001}, -{0x010980, 0x0004}, -{0x0109B8, 0x0001}, -{0x0109BC, 0x0002}, -{0x0109BE, 0x0004}, -{0x0109C0, 0x0002}, -{0x0109D0, 0x0001}, -{0x0109D2, 0x0002}, -{0x010A00, 0x0004}, -{0x010A01, 0x0010}, -{0x010A04, 0x0001}, -{0x010A05, 0x0010}, -{0x010A07, 0x0001}, -{0x010A0C, 0x0010}, -{0x010A10, 0x0004}, -{0x010A14, 0x0001}, -{0x010A15, 0x0004}, -{0x010A18, 0x0001}, -{0x010A19, 0x0004}, -{0x010A36, 0x0001}, -{0x010A38, 0x0010}, -{0x010A3B, 0x0001}, -{0x010A3F, 0x0010}, -{0x010A40, 0x0002}, -{0x010A49, 0x0001}, -{0x010A50, 0x0020}, -{0x010A59, 0x0001}, -{0x010A60, 0x0004}, -{0x010A7D, 0x0002}, -{0x010A7F, 0x0020}, -{0x010A80, 0x0004}, -{0x010A9D, 0x0002}, -{0x010AA0, 0x0001}, -{0x010AC0, 0x0004}, -{0x010AC8, 0x0040}, -{0x010AC9, 0x0004}, -{0x010AE5, 0x0010}, -{0x010AE7, 0x0001}, -{0x010AEB, 0x0002}, -{0x010AF0, 0x0020}, -{0x010AF7, 0x0001}, -{0x010B00, 0x0004}, -{0x010B36, 0x0001}, -{0x010B39, 0x0020}, -{0x010B40, 0x0004}, -{0x010B56, 0x0001}, -{0x010B58, 0x0002}, -{0x010B60, 0x0004}, -{0x010B73, 0x0001}, -{0x010B78, 0x0002}, 
-{0x010B80, 0x0004}, -{0x010B92, 0x0001}, -{0x010B99, 0x0020}, -{0x010B9D, 0x0001}, -{0x010BA9, 0x0002}, -{0x010BB0, 0x0001}, -{0x010C00, 0x0004}, -{0x010C49, 0x0001}, -{0x010C80, 0x0004}, -{0x010CB3, 0x0001}, -{0x010CC0, 0x0004}, -{0x010CF3, 0x0001}, -{0x010CFA, 0x0002}, -{0x010D00, 0x0004}, -{0x010D24, 0x0010}, -{0x010D28, 0x0001}, -{0x010D30, 0x0002}, -{0x010D3A, 0x0001}, -{0x010E60, 0x0002}, -{0x010E7F, 0x0001}, -{0x010E80, 0x0004}, -{0x010EAA, 0x0001}, -{0x010EAB, 0x0010}, -{0x010EAD, 0x0020}, -{0x010EAE, 0x0001}, -{0x010EB0, 0x0004}, -{0x010EB2, 0x0001}, -{0x010EFD, 0x0010}, -{0x010F00, 0x0004}, -{0x010F1D, 0x0002}, -{0x010F27, 0x0004}, -{0x010F28, 0x0001}, -{0x010F30, 0x0004}, -{0x010F46, 0x0010}, -{0x010F51, 0x0002}, -{0x010F55, 0x0020}, -{0x010F5A, 0x0001}, -{0x010F70, 0x0004}, -{0x010F82, 0x0010}, -{0x010F86, 0x0020}, -{0x010F8A, 0x0001}, -{0x010FB0, 0x0004}, -{0x010FC5, 0x0002}, -{0x010FCC, 0x0001}, -{0x010FE0, 0x0004}, -{0x010FF7, 0x0001}, -{0x011000, 0x0010}, -{0x011003, 0x0004}, -{0x011038, 0x0010}, -{0x011047, 0x0020}, -{0x01104E, 0x0001}, -{0x011052, 0x0002}, -{0x011070, 0x0010}, -{0x011071, 0x0004}, -{0x011073, 0x0010}, -{0x011075, 0x0004}, -{0x011076, 0x0001}, -{0x01107F, 0x0010}, -{0x011083, 0x0004}, -{0x0110B0, 0x0010}, -{0x0110BB, 0x0020}, -{0x0110BD, 0x0080}, -{0x0110BE, 0x0020}, -{0x0110C2, 0x0010}, -{0x0110C3, 0x0001}, -{0x0110CD, 0x0080}, -{0x0110CE, 0x0001}, -{0x0110D0, 0x0004}, -{0x0110E9, 0x0001}, -{0x0110F0, 0x0002}, -{0x0110FA, 0x0001}, -{0x011100, 0x0010}, -{0x011103, 0x0004}, -{0x011127, 0x0010}, -{0x011135, 0x0001}, -{0x011136, 0x0002}, -{0x011140, 0x0020}, -{0x011144, 0x0004}, -{0x011145, 0x0010}, -{0x011147, 0x0004}, -{0x011148, 0x0001}, -{0x011150, 0x0004}, -{0x011173, 0x0010}, -{0x011174, 0x0020}, -{0x011176, 0x0004}, -{0x011177, 0x0001}, -{0x011180, 0x0010}, -{0x011183, 0x0004}, -{0x0111B3, 0x0010}, -{0x0111C1, 0x0004}, -{0x0111C5, 0x0020}, -{0x0111C9, 0x0010}, -{0x0111CD, 0x0020}, -{0x0111CE, 0x0010}, -{0x0111D0, 0x0002}, 
-{0x0111DA, 0x0004}, -{0x0111DB, 0x0020}, -{0x0111DC, 0x0004}, -{0x0111DD, 0x0020}, -{0x0111E0, 0x0001}, -{0x0111E1, 0x0002}, -{0x0111F5, 0x0001}, -{0x011200, 0x0004}, -{0x011212, 0x0001}, -{0x011213, 0x0004}, -{0x01122C, 0x0010}, -{0x011238, 0x0020}, -{0x01123E, 0x0010}, -{0x01123F, 0x0004}, -{0x011241, 0x0010}, -{0x011242, 0x0001}, -{0x011280, 0x0004}, -{0x011287, 0x0001}, -{0x011288, 0x0004}, -{0x011289, 0x0001}, -{0x01128A, 0x0004}, -{0x01128E, 0x0001}, -{0x01128F, 0x0004}, -{0x01129E, 0x0001}, -{0x01129F, 0x0004}, -{0x0112A9, 0x0020}, -{0x0112AA, 0x0001}, -{0x0112B0, 0x0004}, -{0x0112DF, 0x0010}, -{0x0112EB, 0x0001}, -{0x0112F0, 0x0002}, -{0x0112FA, 0x0001}, -{0x011300, 0x0010}, -{0x011304, 0x0001}, -{0x011305, 0x0004}, -{0x01130D, 0x0001}, -{0x01130F, 0x0004}, -{0x011311, 0x0001}, -{0x011313, 0x0004}, -{0x011329, 0x0001}, -{0x01132A, 0x0004}, -{0x011331, 0x0001}, -{0x011332, 0x0004}, -{0x011334, 0x0001}, -{0x011335, 0x0004}, -{0x01133A, 0x0001}, -{0x01133B, 0x0010}, -{0x01133D, 0x0004}, -{0x01133E, 0x0010}, -{0x011345, 0x0001}, -{0x011347, 0x0010}, -{0x011349, 0x0001}, -{0x01134B, 0x0010}, -{0x01134E, 0x0001}, -{0x011350, 0x0004}, -{0x011351, 0x0001}, -{0x011357, 0x0010}, -{0x011358, 0x0001}, -{0x01135D, 0x0004}, -{0x011362, 0x0010}, -{0x011364, 0x0001}, -{0x011366, 0x0010}, -{0x01136D, 0x0001}, -{0x011370, 0x0010}, -{0x011375, 0x0001}, -{0x011400, 0x0004}, -{0x011435, 0x0010}, -{0x011447, 0x0004}, -{0x01144B, 0x0020}, -{0x011450, 0x0002}, -{0x01145A, 0x0020}, -{0x01145C, 0x0001}, -{0x01145D, 0x0020}, -{0x01145E, 0x0010}, -{0x01145F, 0x0004}, -{0x011462, 0x0001}, -{0x011480, 0x0004}, -{0x0114B0, 0x0010}, -{0x0114C4, 0x0004}, -{0x0114C6, 0x0020}, -{0x0114C7, 0x0004}, -{0x0114C8, 0x0001}, -{0x0114D0, 0x0002}, -{0x0114DA, 0x0001}, -{0x011580, 0x0004}, -{0x0115AF, 0x0010}, -{0x0115B6, 0x0001}, -{0x0115B8, 0x0010}, -{0x0115C1, 0x0020}, -{0x0115D8, 0x0004}, -{0x0115DC, 0x0010}, -{0x0115DE, 0x0001}, -{0x011600, 0x0004}, -{0x011630, 0x0010}, -{0x011641, 0x0020}, 
-{0x011644, 0x0004}, -{0x011645, 0x0001}, -{0x011650, 0x0002}, -{0x01165A, 0x0001}, -{0x011660, 0x0020}, -{0x01166D, 0x0001}, -{0x011680, 0x0004}, -{0x0116AB, 0x0010}, -{0x0116B8, 0x0004}, -{0x0116B9, 0x0020}, -{0x0116BA, 0x0001}, -{0x0116C0, 0x0002}, -{0x0116CA, 0x0001}, -{0x011700, 0x0004}, -{0x01171B, 0x0001}, -{0x01171D, 0x0010}, -{0x01172C, 0x0001}, -{0x011730, 0x0002}, -{0x01173C, 0x0020}, -{0x01173F, 0x0040}, -{0x011740, 0x0004}, -{0x011747, 0x0001}, -{0x011800, 0x0004}, -{0x01182C, 0x0010}, -{0x01183B, 0x0020}, -{0x01183C, 0x0001}, -{0x0118A0, 0x0004}, -{0x0118E0, 0x0002}, -{0x0118F3, 0x0001}, -{0x0118FF, 0x0004}, -{0x011907, 0x0001}, -{0x011909, 0x0004}, -{0x01190A, 0x0001}, -{0x01190C, 0x0004}, -{0x011914, 0x0001}, -{0x011915, 0x0004}, -{0x011917, 0x0001}, -{0x011918, 0x0004}, -{0x011930, 0x0010}, -{0x011936, 0x0001}, -{0x011937, 0x0010}, -{0x011939, 0x0001}, -{0x01193B, 0x0010}, -{0x01193F, 0x0004}, -{0x011940, 0x0010}, -{0x011941, 0x0004}, -{0x011942, 0x0010}, -{0x011944, 0x0020}, -{0x011947, 0x0001}, -{0x011950, 0x0002}, -{0x01195A, 0x0001}, -{0x0119A0, 0x0004}, -{0x0119A8, 0x0001}, -{0x0119AA, 0x0004}, -{0x0119D1, 0x0010}, -{0x0119D8, 0x0001}, -{0x0119DA, 0x0010}, -{0x0119E1, 0x0004}, -{0x0119E2, 0x0020}, -{0x0119E3, 0x0004}, -{0x0119E4, 0x0010}, -{0x0119E5, 0x0001}, -{0x011A00, 0x0004}, -{0x011A01, 0x0010}, -{0x011A0B, 0x0004}, -{0x011A33, 0x0010}, -{0x011A3A, 0x0004}, -{0x011A3B, 0x0010}, -{0x011A3F, 0x0020}, -{0x011A47, 0x0010}, -{0x011A48, 0x0001}, -{0x011A50, 0x0004}, -{0x011A51, 0x0010}, -{0x011A5C, 0x0004}, -{0x011A8A, 0x0010}, -{0x011A9A, 0x0020}, -{0x011A9D, 0x0004}, -{0x011A9E, 0x0020}, -{0x011AA3, 0x0001}, -{0x011AB0, 0x0004}, -{0x011AF9, 0x0001}, -{0x011B00, 0x0020}, -{0x011B0A, 0x0001}, -{0x011C00, 0x0004}, -{0x011C09, 0x0001}, -{0x011C0A, 0x0004}, -{0x011C2F, 0x0010}, -{0x011C37, 0x0001}, -{0x011C38, 0x0010}, -{0x011C40, 0x0004}, -{0x011C41, 0x0020}, -{0x011C46, 0x0001}, -{0x011C50, 0x0002}, -{0x011C6D, 0x0001}, -{0x011C70, 0x0020}, 
-{0x011C72, 0x0004}, -{0x011C90, 0x0001}, -{0x011C92, 0x0010}, -{0x011CA8, 0x0001}, -{0x011CA9, 0x0010}, -{0x011CB7, 0x0001}, -{0x011D00, 0x0004}, -{0x011D07, 0x0001}, -{0x011D08, 0x0004}, -{0x011D0A, 0x0001}, -{0x011D0B, 0x0004}, -{0x011D31, 0x0010}, -{0x011D37, 0x0001}, -{0x011D3A, 0x0010}, -{0x011D3B, 0x0001}, -{0x011D3C, 0x0010}, -{0x011D3E, 0x0001}, -{0x011D3F, 0x0010}, -{0x011D46, 0x0004}, -{0x011D47, 0x0010}, -{0x011D48, 0x0001}, -{0x011D50, 0x0002}, -{0x011D5A, 0x0001}, -{0x011D60, 0x0004}, -{0x011D66, 0x0001}, -{0x011D67, 0x0004}, -{0x011D69, 0x0001}, -{0x011D6A, 0x0004}, -{0x011D8A, 0x0010}, -{0x011D8F, 0x0001}, -{0x011D90, 0x0010}, -{0x011D92, 0x0001}, -{0x011D93, 0x0010}, -{0x011D98, 0x0004}, -{0x011D99, 0x0001}, -{0x011DA0, 0x0002}, -{0x011DAA, 0x0001}, -{0x011EE0, 0x0004}, -{0x011EF3, 0x0010}, -{0x011EF7, 0x0020}, -{0x011EF9, 0x0001}, -{0x011F00, 0x0010}, -{0x011F02, 0x0004}, -{0x011F03, 0x0010}, -{0x011F04, 0x0004}, -{0x011F11, 0x0001}, -{0x011F12, 0x0004}, -{0x011F34, 0x0010}, -{0x011F3B, 0x0001}, -{0x011F3E, 0x0010}, -{0x011F43, 0x0020}, -{0x011F50, 0x0002}, -{0x011F5A, 0x0001}, -{0x011FB0, 0x0004}, -{0x011FB1, 0x0001}, -{0x011FC0, 0x0002}, -{0x011FD5, 0x0040}, -{0x011FF2, 0x0001}, -{0x011FFF, 0x0020}, -{0x012000, 0x0004}, -{0x01239A, 0x0001}, -{0x012400, 0x0002}, -{0x01246F, 0x0001}, -{0x012470, 0x0020}, -{0x012475, 0x0001}, -{0x012480, 0x0004}, -{0x012544, 0x0001}, -{0x012F90, 0x0004}, -{0x012FF1, 0x0020}, -{0x012FF3, 0x0001}, -{0x013000, 0x0004}, -{0x013430, 0x0080}, -{0x013440, 0x0010}, -{0x013441, 0x0004}, -{0x013447, 0x0010}, -{0x013456, 0x0001}, -{0x014400, 0x0004}, -{0x014647, 0x0001}, -{0x016800, 0x0004}, -{0x016A39, 0x0001}, -{0x016A40, 0x0004}, -{0x016A5F, 0x0001}, -{0x016A60, 0x0002}, -{0x016A6A, 0x0001}, -{0x016A6E, 0x0020}, -{0x016A70, 0x0004}, -{0x016ABF, 0x0001}, -{0x016AC0, 0x0002}, -{0x016ACA, 0x0001}, -{0x016AD0, 0x0004}, -{0x016AEE, 0x0001}, -{0x016AF0, 0x0010}, -{0x016AF5, 0x0020}, -{0x016AF6, 0x0001}, -{0x016B00, 0x0004}, 
-{0x016B30, 0x0010}, -{0x016B37, 0x0020}, -{0x016B3C, 0x0040}, -{0x016B40, 0x0004}, -{0x016B44, 0x0020}, -{0x016B45, 0x0040}, -{0x016B46, 0x0001}, -{0x016B50, 0x0002}, -{0x016B5A, 0x0001}, -{0x016B5B, 0x0002}, -{0x016B62, 0x0001}, -{0x016B63, 0x0004}, -{0x016B78, 0x0001}, -{0x016B7D, 0x0004}, -{0x016B90, 0x0001}, -{0x016E40, 0x0004}, -{0x016E80, 0x0002}, -{0x016E97, 0x0020}, -{0x016E9B, 0x0001}, -{0x016F00, 0x0004}, -{0x016F4B, 0x0001}, -{0x016F4F, 0x0010}, -{0x016F50, 0x0004}, -{0x016F51, 0x0010}, -{0x016F88, 0x0001}, -{0x016F8F, 0x0010}, -{0x016F93, 0x0004}, -{0x016FA0, 0x0001}, -{0x016FE0, 0x0004}, -{0x016FE2, 0x0020}, -{0x016FE3, 0x0004}, -{0x016FE4, 0x0010}, -{0x016FE5, 0x0001}, -{0x016FF0, 0x0010}, -{0x016FF2, 0x0001}, -{0x017000, 0x0004}, -{0x0187F8, 0x0001}, -{0x018800, 0x0004}, -{0x018CD6, 0x0001}, -{0x018D00, 0x0004}, -{0x018D09, 0x0001}, -{0x01AFF0, 0x0004}, -{0x01AFF4, 0x0001}, -{0x01AFF5, 0x0004}, -{0x01AFFC, 0x0001}, -{0x01AFFD, 0x0004}, -{0x01AFFF, 0x0001}, -{0x01B000, 0x0004}, -{0x01B123, 0x0001}, -{0x01B132, 0x0004}, -{0x01B133, 0x0001}, -{0x01B150, 0x0004}, -{0x01B153, 0x0001}, -{0x01B155, 0x0004}, -{0x01B156, 0x0001}, -{0x01B164, 0x0004}, -{0x01B168, 0x0001}, -{0x01B170, 0x0004}, -{0x01B2FC, 0x0001}, -{0x01BC00, 0x0004}, -{0x01BC6B, 0x0001}, -{0x01BC70, 0x0004}, -{0x01BC7D, 0x0001}, -{0x01BC80, 0x0004}, -{0x01BC89, 0x0001}, -{0x01BC90, 0x0004}, -{0x01BC9A, 0x0001}, -{0x01BC9C, 0x0040}, -{0x01BC9D, 0x0010}, -{0x01BC9F, 0x0020}, -{0x01BCA0, 0x0080}, -{0x01BCA4, 0x0001}, -{0x01CF00, 0x0010}, -{0x01CF2E, 0x0001}, -{0x01CF30, 0x0010}, -{0x01CF47, 0x0001}, -{0x01CF50, 0x0040}, -{0x01CFC4, 0x0001}, -{0x01D000, 0x0040}, -{0x01D0F6, 0x0001}, -{0x01D100, 0x0040}, -{0x01D127, 0x0001}, -{0x01D129, 0x0040}, -{0x01D165, 0x0010}, -{0x01D16A, 0x0040}, -{0x01D16D, 0x0010}, -{0x01D173, 0x0080}, -{0x01D17B, 0x0010}, -{0x01D183, 0x0040}, -{0x01D185, 0x0010}, -{0x01D18C, 0x0040}, -{0x01D1AA, 0x0010}, -{0x01D1AE, 0x0040}, -{0x01D1EB, 0x0001}, -{0x01D200, 0x0040}, 
-{0x01D242, 0x0010}, -{0x01D245, 0x0040}, -{0x01D246, 0x0001}, -{0x01D2C0, 0x0002}, -{0x01D2D4, 0x0001}, -{0x01D2E0, 0x0002}, -{0x01D2F4, 0x0001}, -{0x01D300, 0x0040}, -{0x01D357, 0x0001}, -{0x01D360, 0x0002}, -{0x01D379, 0x0001}, -{0x01D400, 0x0004}, -{0x01D455, 0x0001}, -{0x01D456, 0x0004}, -{0x01D49D, 0x0001}, -{0x01D49E, 0x0004}, -{0x01D4A0, 0x0001}, -{0x01D4A2, 0x0004}, -{0x01D4A3, 0x0001}, -{0x01D4A5, 0x0004}, -{0x01D4A7, 0x0001}, -{0x01D4A9, 0x0004}, -{0x01D4AD, 0x0001}, -{0x01D4AE, 0x0004}, -{0x01D4BA, 0x0001}, -{0x01D4BB, 0x0004}, -{0x01D4BC, 0x0001}, -{0x01D4BD, 0x0004}, -{0x01D4C4, 0x0001}, -{0x01D4C5, 0x0004}, -{0x01D506, 0x0001}, -{0x01D507, 0x0004}, -{0x01D50B, 0x0001}, -{0x01D50D, 0x0004}, -{0x01D515, 0x0001}, -{0x01D516, 0x0004}, -{0x01D51D, 0x0001}, -{0x01D51E, 0x0004}, -{0x01D53A, 0x0001}, -{0x01D53B, 0x0004}, -{0x01D53F, 0x0001}, -{0x01D540, 0x0004}, -{0x01D545, 0x0001}, -{0x01D546, 0x0004}, -{0x01D547, 0x0001}, -{0x01D54A, 0x0004}, -{0x01D551, 0x0001}, -{0x01D552, 0x0004}, -{0x01D6A6, 0x0001}, -{0x01D6A8, 0x0004}, -{0x01D6C1, 0x0040}, -{0x01D6C2, 0x0004}, -{0x01D6DB, 0x0040}, -{0x01D6DC, 0x0004}, -{0x01D6FB, 0x0040}, -{0x01D6FC, 0x0004}, -{0x01D715, 0x0040}, -{0x01D716, 0x0004}, -{0x01D735, 0x0040}, -{0x01D736, 0x0004}, -{0x01D74F, 0x0040}, -{0x01D750, 0x0004}, -{0x01D76F, 0x0040}, -{0x01D770, 0x0004}, -{0x01D789, 0x0040}, -{0x01D78A, 0x0004}, -{0x01D7A9, 0x0040}, -{0x01D7AA, 0x0004}, -{0x01D7C3, 0x0040}, -{0x01D7C4, 0x0004}, -{0x01D7CC, 0x0001}, -{0x01D7CE, 0x0002}, -{0x01D800, 0x0040}, -{0x01DA00, 0x0010}, -{0x01DA37, 0x0040}, -{0x01DA3B, 0x0010}, -{0x01DA6D, 0x0040}, -{0x01DA75, 0x0010}, -{0x01DA76, 0x0040}, -{0x01DA84, 0x0010}, -{0x01DA85, 0x0040}, -{0x01DA87, 0x0020}, -{0x01DA8C, 0x0001}, -{0x01DA9B, 0x0010}, -{0x01DAA0, 0x0001}, -{0x01DAA1, 0x0010}, -{0x01DAB0, 0x0001}, -{0x01DF00, 0x0004}, -{0x01DF1F, 0x0001}, -{0x01DF25, 0x0004}, -{0x01DF2B, 0x0001}, -{0x01E000, 0x0010}, -{0x01E007, 0x0001}, -{0x01E008, 0x0010}, -{0x01E019, 0x0001}, 
-{0x01E01B, 0x0010}, -{0x01E022, 0x0001}, -{0x01E023, 0x0010}, -{0x01E025, 0x0001}, -{0x01E026, 0x0010}, -{0x01E02B, 0x0001}, -{0x01E030, 0x0004}, -{0x01E06E, 0x0001}, -{0x01E08F, 0x0010}, -{0x01E090, 0x0001}, -{0x01E100, 0x0004}, -{0x01E12D, 0x0001}, -{0x01E130, 0x0010}, -{0x01E137, 0x0004}, -{0x01E13E, 0x0001}, -{0x01E140, 0x0002}, -{0x01E14A, 0x0001}, -{0x01E14E, 0x0004}, -{0x01E14F, 0x0040}, -{0x01E150, 0x0001}, -{0x01E290, 0x0004}, -{0x01E2AE, 0x0010}, -{0x01E2AF, 0x0001}, -{0x01E2C0, 0x0004}, -{0x01E2EC, 0x0010}, -{0x01E2F0, 0x0002}, -{0x01E2FA, 0x0001}, -{0x01E2FF, 0x0040}, -{0x01E300, 0x0001}, -{0x01E4D0, 0x0004}, -{0x01E4EC, 0x0010}, -{0x01E4F0, 0x0002}, -{0x01E4FA, 0x0001}, -{0x01E7E0, 0x0004}, -{0x01E7E7, 0x0001}, -{0x01E7E8, 0x0004}, -{0x01E7EC, 0x0001}, -{0x01E7ED, 0x0004}, -{0x01E7EF, 0x0001}, -{0x01E7F0, 0x0004}, -{0x01E7FF, 0x0001}, -{0x01E800, 0x0004}, -{0x01E8C5, 0x0001}, -{0x01E8C7, 0x0002}, -{0x01E8D0, 0x0010}, -{0x01E8D7, 0x0001}, -{0x01E900, 0x0004}, -{0x01E944, 0x0010}, -{0x01E94B, 0x0004}, -{0x01E94C, 0x0001}, -{0x01E950, 0x0002}, -{0x01E95A, 0x0001}, -{0x01E95E, 0x0020}, -{0x01E960, 0x0001}, -{0x01EC71, 0x0002}, -{0x01ECAC, 0x0040}, -{0x01ECAD, 0x0002}, -{0x01ECB0, 0x0040}, -{0x01ECB1, 0x0002}, -{0x01ECB5, 0x0001}, -{0x01ED01, 0x0002}, -{0x01ED2E, 0x0040}, -{0x01ED2F, 0x0002}, -{0x01ED3E, 0x0001}, -{0x01EE00, 0x0004}, -{0x01EE04, 0x0001}, -{0x01EE05, 0x0004}, -{0x01EE20, 0x0001}, -{0x01EE21, 0x0004}, -{0x01EE23, 0x0001}, -{0x01EE24, 0x0004}, -{0x01EE25, 0x0001}, -{0x01EE27, 0x0004}, -{0x01EE28, 0x0001}, -{0x01EE29, 0x0004}, -{0x01EE33, 0x0001}, -{0x01EE34, 0x0004}, -{0x01EE38, 0x0001}, -{0x01EE39, 0x0004}, -{0x01EE3A, 0x0001}, -{0x01EE3B, 0x0004}, -{0x01EE3C, 0x0001}, -{0x01EE42, 0x0004}, -{0x01EE43, 0x0001}, -{0x01EE47, 0x0004}, -{0x01EE48, 0x0001}, -{0x01EE49, 0x0004}, -{0x01EE4A, 0x0001}, -{0x01EE4B, 0x0004}, -{0x01EE4C, 0x0001}, -{0x01EE4D, 0x0004}, -{0x01EE50, 0x0001}, -{0x01EE51, 0x0004}, -{0x01EE53, 0x0001}, -{0x01EE54, 0x0004}, 
-{0x01EE55, 0x0001}, -{0x01EE57, 0x0004}, -{0x01EE58, 0x0001}, -{0x01EE59, 0x0004}, -{0x01EE5A, 0x0001}, -{0x01EE5B, 0x0004}, -{0x01EE5C, 0x0001}, -{0x01EE5D, 0x0004}, -{0x01EE5E, 0x0001}, -{0x01EE5F, 0x0004}, -{0x01EE60, 0x0001}, -{0x01EE61, 0x0004}, -{0x01EE63, 0x0001}, -{0x01EE64, 0x0004}, -{0x01EE65, 0x0001}, -{0x01EE67, 0x0004}, -{0x01EE6B, 0x0001}, -{0x01EE6C, 0x0004}, -{0x01EE73, 0x0001}, -{0x01EE74, 0x0004}, -{0x01EE78, 0x0001}, -{0x01EE79, 0x0004}, -{0x01EE7D, 0x0001}, -{0x01EE7E, 0x0004}, -{0x01EE7F, 0x0001}, -{0x01EE80, 0x0004}, -{0x01EE8A, 0x0001}, -{0x01EE8B, 0x0004}, -{0x01EE9C, 0x0001}, -{0x01EEA1, 0x0004}, -{0x01EEA4, 0x0001}, -{0x01EEA5, 0x0004}, -{0x01EEAA, 0x0001}, -{0x01EEAB, 0x0004}, -{0x01EEBC, 0x0001}, -{0x01EEF0, 0x0040}, -{0x01EEF2, 0x0001}, -{0x01F000, 0x0040}, -{0x01F02C, 0x0001}, -{0x01F030, 0x0040}, -{0x01F094, 0x0001}, -{0x01F0A0, 0x0040}, -{0x01F0AF, 0x0001}, -{0x01F0B1, 0x0040}, -{0x01F0C0, 0x0001}, -{0x01F0C1, 0x0040}, -{0x01F0D0, 0x0001}, -{0x01F0D1, 0x0040}, -{0x01F0F6, 0x0001}, -{0x01F100, 0x0002}, -{0x01F10D, 0x0040}, -{0x01F1AE, 0x0001}, -{0x01F1E6, 0x0040}, -{0x01F203, 0x0001}, -{0x01F210, 0x0040}, -{0x01F23C, 0x0001}, -{0x01F240, 0x0040}, -{0x01F249, 0x0001}, -{0x01F250, 0x0040}, -{0x01F252, 0x0001}, -{0x01F260, 0x0040}, -{0x01F266, 0x0001}, -{0x01F300, 0x0040}, -{0x01F6D8, 0x0001}, -{0x01F6DC, 0x0040}, -{0x01F6ED, 0x0001}, -{0x01F6F0, 0x0040}, -{0x01F6FD, 0x0001}, -{0x01F700, 0x0040}, -{0x01F777, 0x0001}, -{0x01F77B, 0x0040}, -{0x01F7DA, 0x0001}, -{0x01F7E0, 0x0040}, -{0x01F7EC, 0x0001}, -{0x01F7F0, 0x0040}, -{0x01F7F1, 0x0001}, -{0x01F800, 0x0040}, -{0x01F80C, 0x0001}, -{0x01F810, 0x0040}, -{0x01F848, 0x0001}, -{0x01F850, 0x0040}, -{0x01F85A, 0x0001}, -{0x01F860, 0x0040}, -{0x01F888, 0x0001}, -{0x01F890, 0x0040}, -{0x01F8AE, 0x0001}, -{0x01F8B0, 0x0040}, -{0x01F8B2, 0x0001}, -{0x01F900, 0x0040}, -{0x01FA54, 0x0001}, -{0x01FA60, 0x0040}, -{0x01FA6E, 0x0001}, -{0x01FA70, 0x0040}, -{0x01FA7D, 0x0001}, -{0x01FA80, 0x0040}, 
-{0x01FA89, 0x0001}, -{0x01FA90, 0x0040}, -{0x01FABE, 0x0001}, -{0x01FABF, 0x0040}, -{0x01FAC6, 0x0001}, -{0x01FACE, 0x0040}, -{0x01FADC, 0x0001}, -{0x01FAE0, 0x0040}, -{0x01FAE9, 0x0001}, -{0x01FAF0, 0x0040}, -{0x01FAF9, 0x0001}, -{0x01FB00, 0x0040}, -{0x01FB93, 0x0001}, -{0x01FB94, 0x0040}, -{0x01FBCB, 0x0001}, -{0x01FBF0, 0x0002}, -{0x01FBFA, 0x0001}, -{0x020000, 0x0004}, -{0x02A6E0, 0x0001}, -{0x02A700, 0x0004}, -{0x02B73A, 0x0001}, -{0x02B740, 0x0004}, -{0x02B81E, 0x0001}, -{0x02B820, 0x0004}, -{0x02CEA2, 0x0001}, -{0x02CEB0, 0x0004}, -{0x02EBE1, 0x0001}, -{0x02EBF0, 0x0004}, -{0x02EE5E, 0x0001}, -{0x02F800, 0x0004}, -{0x02FA1E, 0x0001}, -{0x030000, 0x0004}, -{0x03134B, 0x0001}, -{0x031350, 0x0004}, -{0x0323B0, 0x0001}, -{0x0E0001, 0x0080}, -{0x0E0002, 0x0001}, -{0x0E0020, 0x0080}, -{0x0E0080, 0x0001}, -{0x0E0100, 0x0010}, -{0x0E01F0, 0x0001}, -{0x0F0000, 0x0080}, -{0x0FFFFE, 0x0001}, -{0x100000, 0x0080}, -{0x10FFFE, 0x0001}, -{0x110000, 0x0000}, +const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length +0x03E1, +0x001D, +0x0055, +0x0017, +0x0055, +0x0016, +0x0012, +0x0015, +0x0019, +0x0015, +0x0011, +0x0035, +0x012D, +0x0035, +0x0059, +0x0035, +0x0329, +0x0016, +0x0015, +0x0012, +0x0018, +0x0010, +0x0018, +0x0325, +0x0016, +0x0019, +0x0012, +0x0019, +0x0401, +0x001D, +0x0015, +0x0077, +0x001A, +0x0015, +0x0018, +0x001A, +0x0007, +0x0014, +0x0019, +0x0002, +0x001A, +0x0018, +0x001A, +0x0019, +0x002F, +0x0018, +0x0005, +0x0035, +0x0018, +0x000F, +0x0007, +0x0013, +0x004F, +0x0015, +0x02C9, +0x0019, +0x00C9, +0x02E5, +0x0019, +0x00E5, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0045, +0x0029, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0049, +0x0025, +0x0069, +0x0005, +0x0029, +0x0005, +0x0049, +0x0045, +0x0029, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0029, +0x0005, +0x0049, +0x0005, +0x0009, +0x0005, +0x0029, +0x0025, +0x0007, +0x0009, +0x0045, +0x0067, +0x0009, +0x0008, +0x0005, +0x0009, +0x0008, +0x0005, +0x0009, +0x0008, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0008, +0x0005, +0x0009, +0x0005, +0x0049, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x00C5, +0x0029, +0x0005, +0x0029, +0x0025, +0x0009, +0x0005, +0x0069, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0885, +0x0007, +0x0345, +0x0226, +0x0078, +0x0166, +0x01B8, +0x0086, +0x00D8, +0x0006, +0x0018, +0x0006, +0x0218, +0x0DEC, +0x0009, +0x0005, +0x0009, +0x0005, +0x0006, +0x0018, +0x0009, +0x0005, +0x0020, +0x0006, +0x0045, +0x0015, +0x0009, +0x0060, +0x0038, +0x0009, +0x0015, +0x0049, +0x0000, +0x0009, +0x0000, +0x0029, +0x0005, +0x0209, +0x0000, +0x0109, +0x0445, +0x0009, +0x0025, +0x0049, +0x0045, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0085, +0x0009, +0x0005, +0x0019, +0x0009, +0x0005, +0x0029, +0x0025, +0x0649, +0x05E5, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x001A, +0x008C, +0x002B, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0000, +0x04A9, +0x0020, +0x0006, +0x00B5, +0x0505, +0x0015, +0x0011, +0x0020, +0x003A, +0x0017, +0x0000, +0x058C, +0x0011, +0x000C, +0x0015, +0x002C, +0x0015, +0x002C, +0x0015, +0x000C, +0x00E0, +0x0347, +0x0060, +0x0067, +0x0035, +0x0140, +0x00A2, +0x0059, +0x0035, +0x0017, +0x0035, +0x003A, +0x014C, +0x0015, +0x0002, +0x0055, +0x03E7, +0x0006, +0x0127, +0x028C, +0x012D, +0x0075, +0x0027, +0x000C, +0x0C47, +0x0015, +0x0007, +0x00CC, +0x0002, +0x001A, +0x00AC, +0x0026, +0x002C, +0x001A, +0x006C, +0x0027, +0x012D, +0x0047, +0x003A, +0x0007, +0x01B5, +0x0000, +0x0002, +0x0007, +0x000C, +0x03A7, +0x034C, +0x0020, +0x0B07, +0x014C, +0x0007, +0x01A0, +0x012D, +0x0407, +0x010C, +0x0026, +0x001A, +0x0055, +0x0006, +0x0020, +0x000C, +0x0037, +0x02A7, +0x006C, +0x0006, +0x010C, +0x0006, +0x004C, +0x0006, +0x008C, +0x0020, +0x01D5, +0x0000, +0x0307, +0x004C, +0x0020, +0x0015, +0x0000, +0x0147, +0x0080, +0x02E7, +0x0018, +0x00A7, +0x0000, +0x0022, +0x00A0, +0x00EC, +0x0507, +0x0006, +0x02EC, +0x0002, +0x03EC, +0x000A, +0x06A7, +0x000C, +0x000A, +0x000C, +0x0007, +0x004A, +0x00EC, +0x006A, +0x000C, +0x002A, +0x0007, +0x00CC, +0x0127, +0x002C, +0x0035, +0x012D, +0x0015, +0x0006, +0x01C7, +0x000C, +0x002A, +0x0000, +0x00E7, +0x0020, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, 
+0x0000, +0x0007, +0x0040, +0x0067, +0x0020, +0x000C, +0x0007, +0x004A, +0x006C, +0x0020, +0x002A, +0x0020, +0x002A, +0x000C, +0x0007, +0x00E0, +0x000A, +0x0060, +0x0027, +0x0000, +0x0047, +0x002C, +0x0020, +0x012D, +0x0027, +0x0037, +0x00AF, +0x001A, +0x0017, +0x0007, +0x0015, +0x000C, +0x0020, +0x002C, +0x000A, +0x0000, +0x00A7, +0x0060, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0027, +0x0000, +0x0027, +0x0020, +0x000C, +0x0000, +0x004A, +0x002C, +0x0060, +0x002C, +0x0020, +0x004C, +0x0040, +0x000C, +0x00C0, +0x0067, +0x0000, +0x0007, +0x00C0, +0x012D, +0x002C, +0x0047, +0x000C, +0x0015, +0x0120, +0x002C, +0x000A, +0x0000, +0x0107, +0x0000, +0x0047, +0x0000, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0087, +0x0020, +0x000C, +0x0007, +0x004A, +0x008C, +0x0000, +0x002C, +0x000A, +0x0000, +0x002A, +0x000C, +0x0020, +0x0007, +0x01C0, +0x0027, +0x002C, +0x0020, +0x012D, +0x0015, +0x0017, +0x00C0, +0x0007, +0x00AC, +0x0000, +0x000C, +0x002A, +0x0000, +0x00E7, +0x0020, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0087, +0x0020, +0x000C, +0x0007, +0x000A, +0x000C, +0x000A, +0x006C, +0x0020, +0x002A, +0x0020, +0x002A, +0x000C, +0x00C0, +0x002C, +0x000A, +0x0060, +0x0027, +0x0000, +0x0047, +0x002C, +0x0020, +0x012D, +0x001A, +0x0007, +0x00AF, +0x0120, +0x000C, +0x0007, +0x0000, +0x00A7, +0x0040, +0x0047, +0x0000, +0x0067, +0x0040, +0x0027, +0x0000, +0x0007, +0x0000, +0x0027, +0x0040, +0x0027, +0x0040, +0x0047, +0x0040, +0x0167, +0x0060, +0x002A, +0x000C, +0x002A, +0x0040, +0x004A, +0x0000, +0x004A, +0x000C, +0x0020, +0x0007, +0x00A0, +0x000A, +0x01A0, +0x012D, +0x004F, +0x00BA, +0x0017, +0x001A, +0x0080, +0x000C, +0x004A, +0x000C, +0x00E7, +0x0000, +0x0047, +0x0000, +0x02C7, +0x0000, +0x01E7, +0x0020, +0x000C, +0x0007, +0x004C, +0x006A, +0x0000, +0x004C, +0x0000, +0x006C, +0x00C0, +0x002C, +0x0000, +0x0047, +0x0020, +0x0007, +0x0020, +0x0027, +0x002C, +0x0020, +0x012D, +0x00C0, +0x0015, +0x00CF, 
+0x001A, +0x0007, +0x000C, +0x002A, +0x0015, +0x00E7, +0x0000, +0x0047, +0x0000, +0x02C7, +0x0000, +0x0127, +0x0000, +0x0087, +0x0020, +0x000C, +0x0007, +0x000A, +0x000C, +0x008A, +0x0000, +0x000C, +0x002A, +0x0000, +0x002A, +0x002C, +0x00C0, +0x002A, +0x00A0, +0x0027, +0x0000, +0x0027, +0x002C, +0x0020, +0x012D, +0x0000, +0x0027, +0x000A, +0x0160, +0x002C, +0x002A, +0x0107, +0x0000, +0x0047, +0x0000, +0x0507, +0x002C, +0x0007, +0x004A, +0x006C, +0x0000, +0x004A, +0x0000, +0x004A, +0x000C, +0x0007, +0x001A, +0x0060, +0x0047, +0x000A, +0x00CF, +0x0047, +0x002C, +0x0020, +0x012D, +0x010F, +0x001A, +0x00A7, +0x0000, +0x000C, +0x002A, +0x0000, +0x0227, +0x0040, +0x02E7, +0x0000, +0x0107, +0x0000, +0x0007, +0x0020, +0x00C7, +0x0040, +0x000C, +0x0060, +0x004A, +0x004C, +0x0000, +0x000C, +0x0000, +0x00EA, +0x00A0, +0x012D, +0x0020, +0x002A, +0x0015, +0x0160, +0x05E7, +0x000C, +0x0027, +0x00CC, +0x0060, +0x0017, +0x00A7, +0x0006, +0x00EC, +0x0015, +0x012D, +0x0035, +0x0480, +0x0027, +0x0000, +0x0007, +0x0000, +0x0087, +0x0000, +0x02E7, +0x0000, +0x0007, +0x0000, +0x0127, +0x000C, +0x0027, +0x010C, +0x0007, +0x0020, +0x0087, +0x0000, +0x0006, +0x0000, +0x00CC, +0x0000, +0x012D, +0x0020, +0x0067, +0x03E0, +0x0007, +0x005A, +0x01D5, +0x001A, +0x0015, +0x005A, +0x002C, +0x00BA, +0x012D, +0x012F, +0x001A, +0x000C, +0x001A, +0x000C, +0x001A, +0x000C, +0x0016, +0x0012, +0x0016, +0x0012, +0x002A, +0x00E7, +0x0000, +0x0467, +0x0060, +0x01AC, +0x000A, +0x008C, +0x0015, +0x002C, +0x0087, +0x014C, +0x0000, +0x046C, +0x0000, +0x00FA, +0x000C, +0x00BA, +0x0000, +0x003A, +0x0095, +0x007A, +0x0035, +0x0480, +0x0547, +0x002A, +0x006C, +0x000A, +0x00AC, +0x000A, +0x002C, +0x002A, +0x002C, +0x0007, +0x012D, +0x00B5, +0x00A7, +0x002A, +0x002C, +0x0067, +0x004C, +0x0007, +0x004A, +0x0027, +0x00CA, +0x0047, +0x006C, +0x0187, +0x000C, +0x002A, +0x002C, +0x00AA, +0x000C, +0x0007, +0x000A, +0x012D, +0x004A, +0x000C, +0x003A, +0x04A9, +0x0000, +0x0009, +0x0080, +0x0009, +0x0020, +0x0545, +0x0015, 
+0x0006, +0x0045, +0x2907, +0x0000, +0x0067, +0x0020, +0x00C7, +0x0000, +0x0007, +0x0000, +0x0067, +0x0020, +0x0507, +0x0000, +0x0067, +0x0020, +0x0407, +0x0000, +0x0067, +0x0020, +0x00C7, +0x0000, +0x0007, +0x0000, +0x0067, +0x0020, +0x01C7, +0x0000, +0x0707, +0x0000, +0x0067, +0x0020, +0x0847, +0x0020, +0x004C, +0x0115, +0x026F, +0x0040, +0x01E7, +0x013A, +0x00A0, +0x0AA9, +0x0020, +0x00A5, +0x0020, +0x0011, +0x4D67, +0x001A, +0x0015, +0x0207, +0x001D, +0x0327, +0x0016, +0x0012, +0x0040, +0x0947, +0x0055, +0x004E, +0x00E7, +0x00C0, +0x0227, +0x004C, +0x000A, +0x0100, +0x0247, +0x002C, +0x000A, +0x0035, +0x0100, +0x0227, +0x002C, +0x0160, +0x0187, +0x0000, +0x0047, +0x0000, +0x002C, +0x0160, +0x0667, +0x002C, +0x000A, +0x00CC, +0x00EA, +0x000C, +0x002A, +0x014C, +0x0055, +0x0006, +0x0055, +0x0017, +0x0007, +0x000C, +0x0020, +0x012D, +0x00A0, +0x012F, +0x00A0, +0x00B5, +0x0011, +0x0075, +0x004C, +0x0002, +0x000C, +0x012D, +0x00A0, +0x0447, +0x0006, +0x0687, +0x00C0, +0x0087, +0x002C, +0x0427, +0x000C, +0x0007, +0x0080, +0x08A7, +0x0120, +0x03C7, +0x0000, +0x004C, +0x006A, +0x002C, +0x004A, +0x0060, +0x002A, +0x000C, +0x00AA, +0x004C, +0x0060, +0x001A, +0x0040, +0x0035, +0x012D, +0x03A7, +0x0020, +0x0087, +0x0140, +0x0567, +0x0060, +0x0327, +0x00A0, +0x012D, +0x000F, +0x0040, +0x043A, +0x02C7, +0x002C, +0x002A, +0x000C, +0x0020, +0x0035, +0x0687, +0x000A, +0x000C, +0x000A, +0x00CC, +0x0000, +0x000C, +0x000A, +0x000C, +0x002A, +0x00EC, +0x00AA, +0x012C, +0x0020, +0x000C, +0x012D, +0x00A0, +0x012D, +0x00A0, +0x00D5, +0x0006, +0x00B5, +0x0020, +0x01AC, +0x000B, +0x01EC, +0x0600, +0x006C, +0x000A, +0x05C7, +0x000C, +0x000A, +0x008C, +0x000A, +0x000C, +0x008A, +0x000C, +0x002A, +0x00E7, +0x0040, +0x012D, +0x00D5, +0x013A, +0x010C, +0x011A, +0x0035, +0x0000, +0x002C, +0x000A, +0x03A7, +0x000A, +0x006C, +0x002A, +0x002C, +0x000A, +0x004C, +0x0027, +0x012D, +0x0567, +0x000C, +0x000A, +0x002C, +0x004A, +0x000C, +0x000A, +0x004C, +0x002A, +0x00E0, +0x0075, +0x0467, +0x00EA, 
+0x00EC, +0x002A, +0x002C, +0x0040, +0x0095, +0x012D, +0x0040, +0x0047, +0x012D, +0x03A7, +0x00A6, +0x0035, +0x0105, +0x00C0, +0x0549, +0x0020, +0x0049, +0x00F5, +0x00E0, +0x004C, +0x0015, +0x018C, +0x000A, +0x00CC, +0x0067, +0x000C, +0x00A7, +0x000C, +0x0027, +0x000A, +0x002C, +0x0007, +0x0080, +0x0565, +0x07C6, +0x0185, +0x0006, +0x0425, +0x0486, +0x07EC, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0105, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, 
+0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0105, +0x00E9, +0x00A5, +0x0020, +0x00A9, +0x0020, +0x00E5, +0x00E9, +0x00E5, +0x00E9, +0x00A5, +0x0020, +0x00A9, +0x0020, +0x00E5, +0x0000, +0x0009, +0x0000, +0x0009, +0x0000, +0x0009, +0x0000, +0x0009, +0x00E5, +0x00E9, +0x01A5, +0x0020, +0x00E5, +0x00E8, +0x00E5, +0x00E8, +0x00E5, +0x00E8, +0x0085, +0x0000, +0x0025, +0x0069, +0x0008, +0x0018, +0x0005, +0x0058, +0x0045, +0x0000, +0x0025, +0x0069, +0x0008, +0x0058, +0x0065, +0x0020, +0x0025, +0x0069, +0x0000, +0x0058, +0x00E5, +0x0089, +0x0058, +0x0020, +0x0045, +0x0000, +0x0025, +0x0069, +0x0008, +0x0038, +0x0000, +0x015D, +0x0082, +0x00B1, +0x0035, +0x0014, +0x0013, +0x0016, +0x0034, +0x0013, +0x0016, +0x0014, +0x00F5, +0x001B, +0x001C, +0x0082, +0x001D, +0x0115, +0x0014, +0x0013, +0x0075, +0x0030, +0x0055, +0x0019, +0x0016, +0x0012, +0x0155, +0x0019, +0x0015, +0x0010, +0x0135, +0x001D, +0x0082, +0x0000, +0x0122, +0x000F, +0x0006, +0x0020, +0x00AF, +0x0059, +0x0016, +0x0012, +0x0006, +0x012F, +0x0059, +0x0016, +0x0012, +0x0000, +0x0186, +0x0040, +0x0417, +0x01C0, +0x018C, +0x006B, +0x000C, +0x004B, +0x016C, +0x01C0, +0x003A, +0x0009, +0x007A, +0x0009, +0x003A, +0x0005, +0x0049, +0x0025, +0x0049, +0x0005, +0x001A, +0x0009, +0x003A, +0x0019, +0x0089, +0x00BA, +0x0009, +0x001A, +0x0009, +0x001A, +0x0009, +0x001A, +0x0069, +0x001A, +0x0005, +0x0069, +0x0005, +0x0067, +0x0005, +0x003A, +0x0025, +0x0029, +0x0099, +0x0009, +0x0065, +0x001A, 
+0x0019, +0x003A, +0x0005, +0x001A, +0x01EF, +0x044E, +0x0009, +0x0005, +0x006E, +0x000F, +0x003A, +0x0060, +0x0099, +0x009A, +0x0039, +0x007A, +0x0019, +0x003A, +0x0019, +0x003A, +0x0019, +0x00DA, +0x0019, +0x03DA, +0x0039, +0x003A, +0x0019, +0x001A, +0x0019, +0x03DA, +0x2179, +0x00FA, +0x0016, +0x0012, +0x0016, +0x0012, +0x027A, +0x0039, +0x00DA, +0x0016, +0x0012, +0x0A1A, +0x0019, +0x03BA, +0x0319, +0x04FA, +0x00B9, +0x089A, +0x0300, +0x015A, +0x0280, +0x076F, +0x09BA, +0x02AF, +0x16DA, +0x0019, +0x011A, +0x0019, +0x06BA, +0x00F9, +0x0DDA, +0x0019, +0x1EFA, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x03AF, +0x057A, +0x0099, +0x0016, +0x0012, +0x03D9, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x01F9, +0x1FFA, +0x1059, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x07D9, +0x0016, +0x0012, +0x0016, +0x0012, +0x03F9, +0x0016, +0x0012, +0x2039, +0x05FA, +0x0299, +0x003A, +0x00B9, +0x04DA, +0x0020, +0x03FA, +0x0000, +0x0D1A, +0x05E9, +0x05E5, +0x0009, +0x0005, +0x0049, +0x0025, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0069, +0x0005, +0x0009, +0x0025, +0x0009, +0x00A5, +0x0026, +0x0049, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0025, +0x00BA, +0x0009, +0x0005, +0x0009, +0x0005, +0x004C, +0x0009, +0x0005, +0x0080, +0x0075, +0x000F, +0x0035, +0x04A5, +0x0000, +0x0005, +0x0080, +0x0005, +0x0020, +0x06E7, +0x00C0, +0x0006, +0x0015, +0x01A0, +0x000C, +0x02C7, +0x0100, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x03EC, +0x0035, +0x0014, +0x0013, +0x0014, +0x0013, +0x0055, +0x0014, +0x0013, +0x0015, +0x0014, +0x0013, +0x0115, +0x0011, +0x0035, +0x0011, +0x0015, +0x0014, +0x0013, +0x0035, +0x0014, +0x0013, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0095, +0x0006, +0x0135, +0x0031, +0x0075, +0x0011, +0x0015, +0x0016, +0x0195, +0x003A, +0x0055, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0011, +0x0420, +0x033A, +0x0000, +0x0B1A, +0x0160, +0x1ABA, +0x0320, +0x01FA, +0x001D, +0x0055, +0x001A, +0x0006, +0x0007, +0x000E, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x003A, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0011, +0x0016, +0x0032, +0x001A, +0x010E, +0x006C, +0x002A, +0x0011, +0x0086, +0x003A, +0x004E, +0x0006, +0x0007, +0x0015, +0x003A, +0x0000, +0x0AA7, +0x0020, +0x002C, +0x0038, +0x0026, +0x0007, +0x0011, +0x0B27, +0x0015, +0x0046, +0x0007, +0x0080, +0x0547, +0x0000, +0x0BA7, +0x0000, +0x003A, +0x006F, +0x013A, +0x03E7, +0x047A, +0x0140, +0x001A, +0x01E7, +0x03DA, +0x0000, +0x012F, +0x03BA, +0x00EF, +0x001A, +0x01CF, +0x03FA, +0x012F, +0x04DA, +0x01CF, +0x27FA, +0xFFE7, +0xFFE7, +0xFFE7, +0x37E7, +0x07FA, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, 
+0xFFE7, +0xFFE7, +0xFFE7, +0x4287, +0x0006, +0x8EC7, +0x0040, +0x06DA, +0x0100, +0x04E7, +0x00A6, +0x0035, +0x2167, +0x0006, +0x0055, +0x01E7, +0x012D, +0x0027, +0x0260, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0007, +0x000C, +0x004B, +0x0015, +0x012C, +0x0015, +0x0006, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0026, +0x002C, +0x08A7, +0x012E, +0x002C, +0x00B5, +0x00E0, +0x02D8, +0x0106, +0x0038, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0045, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0006, +0x00E5, +0x0009, +0x0005, +0x0009, +0x0005, +0x0029, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0006, +0x0038, +0x0009, +0x0005, +0x0009, +0x0005, +0x0007, +0x0009, +0x0005, +0x0009, +0x0045, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, 
+0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0089, +0x0005, +0x0089, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x0069, +0x0005, +0x0009, +0x0005, +0x0080, +0x0009, +0x0005, +0x0000, +0x0005, +0x0000, +0x0005, +0x0009, +0x0005, +0x0009, +0x0005, +0x02E0, +0x0046, +0x0009, +0x0005, +0x0007, +0x0026, +0x0005, +0x00C7, +0x000C, +0x0047, +0x000C, +0x0067, +0x000C, +0x02C7, +0x002A, +0x002C, +0x000A, +0x007A, +0x000C, +0x0040, +0x00AF, +0x003A, +0x0017, +0x001A, +0x00A0, +0x0667, +0x0075, +0x00E0, +0x002A, +0x0627, +0x01EA, +0x002C, +0x00E0, +0x0035, +0x012D, +0x00A0, +0x022C, +0x00A7, +0x0055, +0x0007, +0x0015, +0x0027, +0x000C, +0x012D, +0x0367, +0x00EC, +0x0035, +0x02C7, +0x014C, +0x002A, +0x0140, +0x0015, +0x0387, +0x0040, +0x004C, +0x000A, +0x05C7, +0x000C, +0x002A, +0x006C, +0x002A, +0x002C, +0x004A, +0x0195, +0x0000, +0x0006, +0x012D, +0x0060, +0x0035, +0x0087, +0x000C, +0x0006, +0x0107, +0x012D, +0x0087, +0x0000, +0x0507, +0x00AC, +0x002A, +0x002C, +0x002A, +0x002C, +0x0100, +0x0047, +0x000C, +0x00E7, +0x000C, +0x000A, +0x0020, +0x012D, +0x0020, +0x0075, +0x01E7, +0x0006, +0x00A7, +0x005A, +0x0007, +0x000A, +0x000C, +0x000A, +0x0627, +0x000C, +0x0007, +0x004C, +0x0027, +0x002C, +0x0087, +0x002C, +0x0007, +0x000C, +0x0007, +0x02E0, +0x0027, +0x0006, +0x0035, +0x0147, +0x000A, +0x002C, +0x002A, +0x0035, +0x0007, +0x0026, +0x000A, +0x000C, +0x0120, +0x00A7, +0x0020, +0x00A7, +0x0020, +0x00A7, +0x0100, +0x00C7, +0x0000, +0x00C7, +0x0000, +0x0545, +0x0018, +0x0066, +0x0105, +0x0006, +0x0038, +0x0060, +0x09E5, +0x0447, +0x002A, +0x000C, +0x002A, +0x000C, +0x002A, +0x0015, +0x000A, +0x000C, +0x0020, +0x012D, +0x00A0, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0x7467, +0x0160, +0x02C7, +0x0060, +0x0607, +0x0060, +0xFFE4, +0xFFE3, +0xFFE3, +0xFFE3, +0x1FE3, +0x2DA7, +0x0020, +0x0D27, +0x04A0, +0x00C5, +0x0160, +0x0085, 
+0x0080, +0x0007, +0x000C, +0x0127, +0x0019, +0x0187, +0x0000, +0x0087, +0x0000, +0x0007, +0x0000, +0x0027, +0x0000, +0x0027, +0x0000, +0x0D67, +0x0218, +0x01E0, +0x2D47, +0x0012, +0x0016, +0x01FA, +0x07E7, +0x0020, +0x06A7, +0x00C0, +0x001A, +0x03E0, +0x0167, +0x0017, +0x005A, +0x01EC, +0x00D5, +0x0016, +0x0012, +0x0015, +0x00A0, +0x01EC, +0x0015, +0x0031, +0x0030, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0035, +0x0016, +0x0012, +0x0075, +0x0050, +0x0055, +0x0000, +0x0075, +0x0011, +0x0016, +0x0012, +0x0016, +0x0012, +0x0016, +0x0012, +0x0055, +0x0019, +0x0011, +0x0059, +0x0000, +0x0015, +0x0017, +0x0035, +0x0060, +0x0087, +0x0000, +0x10C7, +0x0020, +0x0002, +0x0000, +0x0055, +0x0017, +0x0055, +0x0016, +0x0012, +0x0015, +0x0019, +0x0015, +0x0011, +0x0035, +0x012D, +0x0035, +0x0059, +0x0035, +0x0329, +0x0016, +0x0015, +0x0012, +0x0018, +0x0010, +0x0018, +0x0325, +0x0016, +0x0019, +0x0012, +0x0019, +0x0016, +0x0012, +0x0015, +0x0016, +0x0012, +0x0035, +0x0127, +0x0006, +0x0587, +0x0026, +0x03C7, +0x0040, +0x00A7, +0x0020, +0x00A7, +0x0020, +0x00A7, +0x0020, +0x0047, +0x0040, +0x0037, +0x0019, +0x0018, +0x001A, +0x0037, +0x0000, +0x001A, +0x0079, +0x003A, +0x0120, +0x0042, +0x003A, +0x0020, +0x0167, +0x0000, +0x0327, +0x0000, +0x0247, +0x0000, +0x0027, +0x0000, +0x01C7, +0x0020, +0x01A7, +0x0420, +0x0F47, +0x0080, +0x0055, +0x0060, +0x058F, +0x0040, +0x011A, +0x068E, +0x006F, +0x021A, +0x002F, +0x005A, +0x0000, +0x019A, +0x0040, +0x001A, +0x05C0, +0x059A, +0x000C, +0x1020, +0x0387, +0x0040, +0x0607, +0x01C0, +0x000C, +0x034F, +0x0060, +0x03E7, +0x006F, +0x0100, +0x0267, +0x000E, +0x00E7, +0x000E, +0x0080, +0x04A7, +0x008C, +0x0080, +0x03A7, +0x0000, +0x0015, +0x0467, +0x0060, +0x00E7, +0x0015, +0x008E, +0x0520, +0x04E9, +0x04E5, +0x09A7, +0x0020, +0x012D, +0x00A0, +0x0469, +0x0060, +0x0465, +0x0060, +0x04E7, +0x00E0, +0x0667, +0x0140, +0x0015, +0x0149, +0x0000, 
+0x01C9, +0x0000, +0x00C9, +0x0000, +0x0029, +0x0000, +0x0145, +0x0000, +0x01C5, +0x0000, +0x00C5, +0x0000, +0x0025, +0x0840, +0x26C7, +0x0100, +0x02A7, +0x0120, +0x00E7, +0x02E0, +0x00A6, +0x0000, +0x0526, +0x0000, +0x0106, +0x0880, +0x00A7, +0x0020, +0x0007, +0x0000, +0x0567, +0x0000, +0x0027, +0x0040, +0x0007, +0x0020, +0x02C7, +0x0000, +0x0015, +0x00EF, +0x02C7, +0x003A, +0x00CF, +0x03C7, +0x00E0, +0x010F, +0x05E0, +0x0247, +0x0000, +0x0027, +0x0080, +0x008F, +0x02A7, +0x00AF, +0x0040, +0x0015, +0x0327, +0x0080, +0x0015, +0x07E0, +0x06E7, +0x0060, +0x002F, +0x0027, +0x01EF, +0x0020, +0x05AF, +0x0007, +0x004C, +0x0000, +0x002C, +0x0080, +0x006C, +0x0067, +0x0000, +0x0047, +0x0000, +0x0387, +0x0020, +0x004C, +0x0060, +0x000C, +0x010F, +0x00C0, +0x0115, +0x00C0, +0x0387, +0x002F, +0x0015, +0x0387, +0x004F, +0x03E0, +0x00E7, +0x001A, +0x0367, +0x002C, +0x0060, +0x008F, +0x00D5, +0x0100, +0x06A7, +0x0040, +0x00D5, +0x02A7, +0x0020, +0x00EF, +0x0247, +0x0080, +0x00EF, +0x0227, +0x00C0, +0x0075, +0x0160, +0x00CF, +0x09E0, +0x0907, +0x06C0, +0x0649, +0x0180, +0x0645, +0x00C0, +0x00AF, +0x0467, +0x006C, +0x00E0, +0x012D, +0x24A0, +0x03CF, +0x0000, +0x0527, +0x0000, +0x002C, +0x0011, +0x0020, +0x0027, +0x0940, +0x004C, +0x0387, +0x012F, +0x0007, +0x00E0, +0x02A7, +0x014C, +0x006F, +0x0095, +0x02A0, +0x0227, +0x006C, +0x0075, +0x04A0, +0x0287, +0x00CF, +0x0260, +0x02C7, +0x0100, +0x000A, +0x000C, +0x000A, +0x0687, +0x01CC, +0x00D5, +0x0060, +0x026F, +0x012D, +0x000C, +0x0027, +0x002C, +0x0007, +0x0100, +0x004C, +0x000A, +0x0587, +0x004A, +0x006C, +0x002A, +0x002C, +0x0035, +0x0002, +0x0075, +0x000C, +0x0120, +0x0002, +0x0020, +0x0307, +0x00C0, +0x012D, +0x00A0, +0x004C, +0x0467, +0x008C, +0x000A, +0x00EC, +0x0000, +0x012D, +0x0075, +0x0007, +0x002A, +0x0007, +0x00E0, +0x0447, +0x000C, +0x0035, +0x0007, +0x0100, +0x002C, +0x000A, +0x05E7, +0x004A, +0x010C, +0x002A, +0x0067, +0x0075, +0x006C, +0x0015, +0x000A, +0x000C, +0x012D, +0x0007, +0x0015, +0x0007, +0x0055, +0x0000, 
+0x026F, +0x0140, +0x0227, +0x0000, +0x0307, +0x004A, +0x004C, +0x002A, +0x000C, +0x000A, +0x002C, +0x00B5, +0x000C, +0x0027, +0x000C, +0x07A0, +0x00C7, +0x0000, +0x0007, +0x0000, +0x0067, +0x0000, +0x01C7, +0x0000, +0x0127, +0x0015, +0x00A0, +0x05C7, +0x000C, +0x004A, +0x00EC, +0x0080, +0x012D, +0x00A0, +0x002C, +0x002A, +0x0000, +0x00E7, +0x0020, +0x0027, +0x0020, +0x02A7, +0x0000, +0x00C7, +0x0000, +0x0027, +0x0000, +0x0087, +0x0000, +0x002C, +0x0007, +0x002A, +0x000C, +0x006A, +0x0020, +0x002A, +0x0020, +0x004A, +0x0020, +0x0007, +0x00A0, +0x000A, +0x0080, +0x0087, +0x002A, +0x0020, +0x00CC, +0x0040, +0x008C, +0x1140, +0x0687, +0x004A, +0x00EC, +0x002A, +0x004C, +0x000A, +0x000C, +0x0067, +0x0095, +0x012D, +0x0035, +0x0000, +0x0015, +0x000C, +0x0047, +0x03A0, +0x05E7, +0x004A, +0x00AC, +0x000A, +0x000C, +0x006A, +0x002C, +0x000A, +0x002C, +0x0027, +0x0015, +0x0007, +0x00E0, +0x012D, +0x14A0, +0x05C7, +0x004A, +0x006C, +0x0020, +0x006A, +0x002C, +0x000A, +0x002C, +0x02D5, +0x0067, +0x002C, +0x0420, +0x05E7, +0x004A, +0x00EC, +0x002A, +0x000C, +0x000A, +0x002C, +0x0055, +0x0007, +0x0140, +0x012D, +0x00A0, +0x0195, +0x0240, +0x0547, +0x000C, +0x000A, +0x000C, +0x002A, +0x00AC, +0x000A, +0x000C, +0x0007, +0x0015, +0x00A0, +0x012D, +0x06A0, +0x0347, +0x0020, +0x004C, +0x002A, +0x006C, +0x000A, +0x008C, +0x0060, +0x012D, +0x002F, +0x0055, +0x001A, +0x00C7, +0x1700, +0x0567, +0x004A, +0x010C, +0x000A, +0x002C, +0x0015, +0x0C60, +0x03E9, +0x03E5, +0x012D, +0x010F, +0x0160, +0x00E7, +0x0020, +0x0007, +0x0020, +0x00E7, +0x0000, +0x0027, +0x0000, +0x02E7, +0x00AA, +0x0000, +0x002A, +0x0020, +0x002C, +0x000A, +0x000C, +0x0007, +0x000A, +0x0007, +0x000A, +0x000C, +0x0055, +0x0100, +0x012D, +0x08A0, +0x00E7, +0x0020, +0x04C7, +0x004A, +0x006C, +0x0020, +0x002C, +0x006A, +0x000C, +0x0007, +0x0015, +0x0007, +0x000A, +0x0340, +0x0007, +0x012C, +0x04E7, +0x00AC, +0x000A, +0x0007, +0x006C, +0x00F5, +0x000C, +0x00E0, +0x0007, +0x00AC, +0x002A, +0x004C, +0x05A7, +0x018C, +0x000A, 
+0x002C, +0x0055, +0x0007, +0x0095, +0x0180, +0x0907, +0x00C0, +0x0135, +0x1EA0, +0x0107, +0x0000, +0x0487, +0x000A, +0x00CC, +0x0000, +0x00AC, +0x000A, +0x000C, +0x0007, +0x0095, +0x0120, +0x012D, +0x024F, +0x0040, +0x0035, +0x03A7, +0x0020, +0x02AC, +0x0000, +0x000A, +0x00CC, +0x000A, +0x002C, +0x000A, +0x002C, +0x0900, +0x00C7, +0x0000, +0x0027, +0x0000, +0x04A7, +0x00AC, +0x0040, +0x000C, +0x0000, +0x002C, +0x0000, +0x00CC, +0x0007, +0x000C, +0x00E0, +0x012D, +0x00A0, +0x00A7, +0x0000, +0x0027, +0x0000, +0x03E7, +0x008A, +0x0000, +0x002C, +0x0000, +0x002A, +0x000C, +0x000A, +0x000C, +0x0007, +0x00C0, +0x012D, +0x26A0, +0x0247, +0x002C, +0x002A, +0x0035, +0x00C0, +0x002C, +0x0007, +0x000A, +0x0187, +0x0000, +0x0427, +0x002A, +0x008C, +0x0040, +0x002A, +0x000C, +0x000A, +0x000C, +0x0195, +0x012D, +0x0AA0, +0x0007, +0x01C0, +0x028F, +0x00FA, +0x0077, +0x021A, +0x0180, +0x0015, +0x7327, +0x0CA0, +0x0DCE, +0x0000, +0x0095, +0x0140, +0x1867, +0xFFE0, +0x4960, +0x0C07, +0x0035, +0x0180, +0x85E7, +0x01E2, +0x000C, +0x00A7, +0x01CC, +0xFFE0, +0xF520, +0x48C7, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0x3700, +0x4707, +0x00C0, +0x03C7, +0x0000, +0x012D, +0x0060, +0x0035, +0x09C7, +0x0000, +0x012D, +0x00A0, +0x03A7, +0x0020, +0x008C, +0x0015, +0x0120, +0x05E7, +0x00CC, +0x0095, +0x007A, +0x0066, +0x0015, +0x001A, +0x0120, +0x012D, +0x0000, +0x00CF, +0x0000, +0x0287, +0x0080, +0x0247, +0x55E0, +0x03E9, +0x03E5, +0x02CF, +0x0075, +0x0C80, +0x0947, +0x0060, +0x000C, +0x0007, +0x06CA, +0x00C0, +0x006C, +0x0186, +0x07E0, +0x0026, +0x0015, +0x0006, +0x000C, +0x0140, +0x002A, +0x01A0, +0xFFE7, +0xFFE7, +0xFEE7, +0x00E0, +0x9AA7, +0x0520, +0x0107, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0x5CC0, +0x0066, +0x0000, +0x00C6, +0x0000, +0x0026, +0x0000, +0x2447, +0x01C0, +0x0007, +0x0380, +0x0047, +0x0020, +0x0007, +0x01A0, +0x0067, +0x00E0, +0x3167, +0xFFE0, +0x2060, +0x0D47, +0x0080, +0x0187, +0x0040, +0x0107, +0x00C0, +0x0127, +0x0020, +0x001A, +0x002C, +0x0015, +0x0062, +0xFFE0, +0xFFE0, 
+0x4B60, +0x05AC, +0x0020, +0x02CC, +0x0100, +0x0E7A, +0x0760, +0x1EBA, +0x0120, +0x04DA, +0x0020, +0x077A, +0x002A, +0x004C, +0x005A, +0x00AA, +0x00E2, +0x00EC, +0x003A, +0x00CC, +0x03BA, +0x006C, +0x079A, +0x0280, +0x083A, +0x004C, +0x001A, +0x0F20, +0x026F, +0x0160, +0x026F, +0x0160, +0x0ADA, +0x0100, +0x030F, +0x10C0, +0x0329, +0x0325, +0x0329, +0x00C5, +0x0000, +0x0225, +0x0329, +0x0325, +0x0009, +0x0000, +0x0029, +0x0020, +0x0009, +0x0020, +0x0029, +0x0020, +0x0069, +0x0000, +0x00E9, +0x0065, +0x0000, +0x0005, +0x0000, +0x00C5, +0x0000, +0x0145, +0x0329, +0x0325, +0x0029, +0x0000, +0x0069, +0x0020, +0x00E9, +0x0000, +0x00C9, +0x0000, +0x0325, +0x0029, +0x0000, +0x0069, +0x0000, +0x0089, +0x0000, +0x0009, +0x0040, +0x00C9, +0x0000, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0325, +0x0329, +0x0365, +0x0020, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0309, +0x0019, +0x0305, +0x0019, +0x00A5, +0x0009, +0x0005, +0x0020, +0x062D, +0x3FFA, +0x06CC, +0x007A, +0x062C, +0x00FA, +0x000C, +0x01BA, +0x000C, +0x003A, +0x0095, +0x01C0, +0x008C, +0x0000, +0x01CC, +0x89E0, +0x0125, +0x0007, +0x0265, +0x00A0, +0x00A5, +0x1A80, +0x00CC, +0x0000, +0x020C, +0x0020, +0x00CC, +0x0000, +0x002C, +0x0000, +0x008C, +0x0080, +0x07A6, +0x0400, +0x000C, +0x0DE0, +0x0587, +0x0040, +0x00CC, +0x00C6, +0x0020, +0x012D, +0x0060, +0x0007, +0x001A, +0x27E0, +0x03A7, +0x000C, +0x0200, +0x0567, +0x006C, +0x012D, +0x0080, +0x0017, +0x39E0, +0x0347, +0x0006, +0x006C, +0x012D, +0x5CA0, +0x00C7, +0x0000, +0x0067, +0x0000, +0x0027, +0x0000, +0x01C7, +0x0000, +0x1887, +0x0020, +0x010F, +0x00CC, +0x0500, +0x0429, +0x0425, +0x00CC, +0x0006, +0x0060, +0x012D, +0x0060, +0x0035, +0x6200, +0x074F, +0x001A, +0x004F, +0x0017, +0x006F, +0x0960, +0x058F, +0x001A, +0x01CF, +0x1820, +0x0067, +0x0000, +0x0347, +0x0000, +0x0027, 
+0x0000, +0x0007, +0x0020, +0x0007, +0x0000, +0x0127, +0x0000, +0x0067, +0x0000, +0x0007, +0x0000, +0x0007, +0x00A0, +0x0007, +0x0060, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0047, +0x0000, +0x0027, +0x0000, +0x0007, +0x0020, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0007, +0x0000, +0x0027, +0x0000, +0x0007, +0x0020, +0x0067, +0x0000, +0x00C7, +0x0000, +0x0067, +0x0000, +0x0067, +0x0000, +0x0007, +0x0000, +0x0127, +0x0000, +0x0207, +0x0080, +0x0047, +0x0000, +0x0087, +0x0000, +0x0207, +0x0660, +0x0039, +0x21A0, +0x057A, +0x0060, +0x0C7A, +0x0160, +0x01DA, +0x0020, +0x01DA, +0x0000, +0x01DA, +0x0000, +0x049A, +0x0120, +0x018F, +0x141A, +0x06E0, +0x039A, +0x0180, +0x057A, +0x0060, +0x011A, +0x00C0, +0x003A, +0x01A0, +0x00BA, +0x1320, +0x1F5A, +0x0098, +0x5AFA, +0x0060, +0x021A, +0x0040, +0x019A, +0x0040, +0x0EDA, +0x0060, +0x0BDA, +0x00A0, +0x017A, +0x0060, +0x001A, +0x01C0, +0x017A, +0x0060, +0x06FA, +0x00E0, +0x013A, +0x00A0, +0x04FA, +0x00E0, +0x03BA, +0x0020, +0x003A, +0x09A0, +0x2A7A, +0x0160, +0x01BA, +0x0020, +0x019A, +0x0040, +0x011A, +0x00C0, +0x05BA, +0x0000, +0x00DA, +0x00E0, +0x01BA, +0x0060, +0x011A, +0x00C0, +0x011A, +0x00C0, +0x125A, +0x0000, +0x06DA, +0x0480, +0x012D, +0x80A0, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xFFE7, +0xDBE7, +0x03E0, +0xFFE7, +0xFFE7, +0x0727, +0x00A0, +0x1BA7, +0x0020, +0xFFE7, +0xFFE7, +0xD027, +0x01A0, +0xFFE7, +0xFFE7, +0xFFE7, +0xA607, +0x01C0, +0x4DA7, +0xFFE0, +0x3420, +0x43A7, +0xBC20, +0xFFE7, +0xFFE7, +0x6947, +0x0080, +0xFFE7, +0xFFE7, +0x0BE7, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, 
+0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, 
+0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0x8A00, +0x0002, +0x03A0, +0x0BE2, +0x0FE0, +0x1DEC, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xFFE0, +0xC1E0, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFA3, +0x0020, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFE3, +0xFFA3, +0x0020, }; -const std::unordered_set unicode_set_whitespace = { -0x000009, -0x00000A, -0x00000B, -0x00000C, -0x00000D, -0x000020, -0x000085, -0x0000A0, -0x001680, -0x002000, -0x002001, -0x002002, -0x002003, -0x002004, -0x002005, -0x002006, 
-0x002007, -0x002008, -0x002009, -0x00200A, -0x002028, -0x002029, -0x00202F, -0x00205F, -0x003000, +const std::vector> unicode_ranges_whitespace = { +{0x000009, 0x00000D}, +{0x002000, 0x00200A}, +{0x000020, 0x000020}, +{0x000085, 0x000085}, +{0x0000A0, 0x0000A0}, +{0x001680, 0x001680}, +{0x002028, 0x002028}, +{0x002029, 0x002029}, +{0x00202F, 0x00202F}, +{0x00205F, 0x00205F}, +{0x003000, 0x003000}, }; const std::unordered_map unicode_map_lowercase = { @@ -7030,3 +9259,4 @@ const std::vector unicode_ranges_nfd = { // start, last, nfd {0x02FA1C, 0x02FA1C, 0x009F3B}, {0x02FA1D, 0x02FA1D, 0x02A600}, }; + diff --git a/src/unicode-data.h b/src/unicode-data.h index e27fe1770710a..447826879eaee 100644 --- a/src/unicode-data.h +++ b/src/unicode-data.h @@ -3,7 +3,6 @@ #include #include #include -#include struct range_nfd { uint32_t first; @@ -13,8 +12,8 @@ struct range_nfd { static const uint32_t MAX_CODEPOINTS = 0x110000; -extern const std::vector> unicode_ranges_flags; -extern const std::unordered_set unicode_set_whitespace; +extern const std::vector unicode_rle_codepoints_categs; +extern const std::vector> unicode_ranges_whitespace; extern const std::unordered_map unicode_map_lowercase; extern const std::unordered_map unicode_map_uppercase; extern const std::vector unicode_ranges_nfd; diff --git a/src/unicode.cpp b/src/unicode.cpp index 46650bff06d15..7bd10f50bcf14 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -2,10 +2,10 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#include "ggml.h" #include "unicode.h" #include "unicode-data.h" -#include #include #include #include @@ -119,38 +119,6 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { // return result; //} -static std::vector unicode_cpt_flags_array() { - std::vector cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); - - assert (unicode_ranges_flags.front().first == 0); - assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS); - for (size_t i = 1; i < 
unicode_ranges_flags.size(); ++i) { - const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags - const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags - for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) { - cpt_flags[cpt] = range_ini.second; - } - } - - for (auto cpt : unicode_set_whitespace) { - cpt_flags[cpt].is_whitespace = true; - } - - for (auto p : unicode_map_lowercase) { - cpt_flags[p.second].is_lowercase = true; - } - - for (auto p : unicode_map_uppercase) { - cpt_flags[p.second].is_uppercase = true; - } - - for (auto &range : unicode_ranges_nfd) { // start, last, nfd - cpt_flags[range.nfd].is_nfd = true; - } - - return cpt_flags; -} - static std::unordered_map unicode_byte_to_utf8_map() { std::unordered_map map; for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~' @@ -233,7 +201,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t for (auto offset : offsets) { const size_t offset_ini = start; const size_t offset_end = start + offset; - assert(offset_end <= cpts.size()); + GGML_ASSERT(offset_end <= cpts.size()); start = offset_end; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; @@ -241,13 +209,14 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - auto _get_flags = [&] (const size_t pos) -> codepoint_flags { - return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; + static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1; + auto _get_categ = [&] (const size_t pos) -> codepoint_categ { + return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_category(cpts[pos]) : SENTINEL; }; size_t _prev_end = offset_ini; auto _add_token = [&] (const size_t end) -> size_t { - assert(_prev_end <= end && end <= offset_end); + GGML_ASSERT(_prev_end <= end && end <= offset_end); size_t len = end - _prev_end; if (len > 0) { bpe_offsets.push_back(len); @@ -264,7 +233,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { const uint32_t cpt = _get_cpt(pos); - const auto flags = _get_flags(pos); + const auto categ = _get_categ(pos); // regex: 's|'t|'re|'ve|'m|'ll|'d if (cpt == '\'' && pos+1 < offset_end) { @@ -284,37 +253,37 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t } } - auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); + auto categ2 = (cpt == ' ' ? _get_categ(pos+1) : categ); // regex: ?\p{L}+ - if (flags2.is_letter) { + if (categ2.is_L()) { pos += (cpt == ' '); - while (flags2.is_letter) { - flags2 = _get_flags(++pos); + while (categ2.is_L()) { + categ2 = _get_categ(++pos); } _add_token(pos); continue; } // regex: ?\p{N}+ - if (flags2.is_number) { + if (categ2.is_N()) { pos += (cpt == ' '); - while (flags2.is_number) { - flags2 = _get_flags(++pos); + while (categ2.is_N()) { + categ2 = _get_categ(++pos); } _add_token(pos); continue; } // regex: ?[^\s\p{L}\p{N}]+ - if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { + if (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { pos += (cpt == ' '); - while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { - flags2 = _get_flags(++pos); + while (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { + categ2 = _get_categ(++pos); } _add_token(pos); continue; } size_t num_whitespaces = 0; - while (_get_flags(pos+num_whitespaces).is_whitespace) { + while (_get_categ(pos+num_whitespaces).is_whitespace()) { 
num_whitespaces++; } @@ -351,7 +320,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & for (auto offset : offsets) { const size_t offset_ini = start; const size_t offset_end = start + offset; - assert(offset_end <= cpts.size()); + GGML_ASSERT(offset_end <= cpts.size()); start = offset_end; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; @@ -359,13 +328,14 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - auto _get_flags = [&] (const size_t pos) -> codepoint_flags { - return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; + static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1; + auto _get_categ = [&] (const size_t pos) -> codepoint_categ { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL; }; size_t _prev_end = offset_ini; auto _add_token = [&] (const size_t end) -> size_t { - assert(_prev_end <= end && end <= offset_end); + GGML_ASSERT(_prev_end <= end && end <= offset_end); size_t len = end - _prev_end; if (len > 0) { bpe_offsets.push_back(len); @@ -382,7 +352,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { const uint32_t cpt = _get_cpt(pos); - const auto flags = _get_flags(pos); + const auto categ = _get_categ(pos); // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive if (cpt == '\'' && pos+1 < offset_end) { @@ -403,10 +373,10 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: [^\r\n\p{L}\p{N}]?\p{L}+ - if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) { - if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters + if (!(cpt == '\r' || cpt == '\n' || categ.is_N())) { + if (categ.is_L() || _get_categ(pos+1).is_L()) { // one or more letters pos++; - while 
(_get_flags(pos).is_letter) { + while (_get_categ(pos).is_L()) { pos++; } _add_token(pos); @@ -415,9 +385,9 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: \p{N}{1,3} - if (flags.is_number) { + if (categ.is_N()) { size_t ini = pos; - while (_get_flags(pos).is_number) { + while (_get_categ(pos).is_N()) { if (++pos - ini >= 3 ) { _add_token(pos); ini = pos; @@ -428,11 +398,11 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: ?[^\s\p{L}\p{N}]+[\r\n]* - auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); - if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) { + auto categ2 = (cpt == ' ' ? _get_categ(pos+1) : categ); + if (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { pos += (cpt == ' '); - while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { - flags2 = _get_flags(++pos); + while (!(categ2.is_whitespace() | categ2.is_L() | categ2.is_N()) && categ2 != SENTINEL) { + categ2 = _get_categ(++pos); } uint32_t cpt2 = _get_cpt(pos); while (cpt2 == '\r' || cpt2 == '\n') { @@ -444,7 +414,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & size_t num_whitespaces = 0; size_t last_end_r_or_n = 0; - while (_get_flags(pos+num_whitespaces).is_whitespace) { + while (_get_categ(pos+num_whitespaces).is_whitespace()) { uint32_t cpt2 = _get_cpt(pos+num_whitespaces); if (cpt2 == '\r' || cpt2 == '\n') { last_end_r_or_n = pos + num_whitespaces + 1; @@ -481,76 +451,279 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return bpe_offsets; } -// use std::wregex to split the text -static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { - std::wregex expr(regex_expr); - std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory 
for the approximate size - size_t start = 0; - for (auto offset : offsets) { - std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr); - std::wcregex_iterator end; +static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { + std::vector bpe_offsets; - int64_t start_idx = 0; - while (it != end) { - std::wcmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); + if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); + } else if ( + regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || + regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { + + bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + } + + return bpe_offsets; +} + +// Custom std::regex specializations for 32bit unicode codepoints +// std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ... +// std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000. +// std::wregex supports full 32 bit codepoints, not limited to standard max 0x110000. 
+namespace std { + +// codepoint type for all template specializations +#if (WCHAR_MAX > 0xFFFF) + using codepoint = wchar_t; // sizeof(wchar_t) == 4 +#else + using codepoint = uint32_t; // Windows: sizeof(wchar_t) == 2 + #define CUSTOM_CTYPE_CODEPOINT +#endif + +#ifdef CUSTOM_CTYPE_CODEPOINT + // Minimal required implementation for std::regex string processing + template<> // custom specialized std::ctype + class ctype { + public: + + using CharT = codepoint; + using char_type = CharT; + + using mask = uint8_t; //NOTE: see std::ctype_base + static const mask digit = 1; // requiered variable names + static const mask xdigit = 2; // user defined values + static const mask alpha = 3; // used to be a bitmask + static const mask upper = 4; // we do not need a bitmask + static const mask lower = 5; // using a sequence instead + + static locale::id id; // required by std::locale::facet + + bool is(mask m, char_type c) const { + switch (m) { + case digit: return ('0' <= c && c <= '9'); + case xdigit: return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F'); + case alpha: return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); + case upper: return ('A' <= c && c <= 'Z'); + case lower: return ('a' <= c && c <= 'z'); + default: return false; } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); - ++it; } - if (start_idx < (int64_t) offset) { - bpe_offsets.emplace_back(offset - start_idx); + char_type toupper(char_type c) const { + return ('a' <= c && c <= 'z') ? c - ('a' - 'A') : c; } - start += offset; + + char_type tolower(char_type c) const { + return ('A' <= c && c <= 'Z') ? c + ('a' - 'A') : c; + } + + char_type widen(char c) const { // char to codepoint + return (char_type) c; + } + + char narrow(char_type c, char dfault) const { // codepoint to char + return (c < 0x80 ? 
(char)c : dfault); + } + }; + + locale::id ctype::id = {}; + + template<> // specialization to use our custom specialized std::ctype + const std::ctype & use_facet>(const std::locale &) { + static std::ctype ctype_uint32 = {}; + return ctype_uint32; } - return bpe_offsets; + template<> // specialization to use our custom specialized std::ctype + const std::ctype & use_facet>(const std::locale & loc) { + return use_facet>(loc); + } +#endif + + // Minimal required implementation for std::regex string processing + template<> // custom specialized std::regex_traits + class regex_traits { + public: + + using CharT = codepoint; + using char_type = codepoint; + using size_type = size_t; + using string_type = std::basic_string; + using locale_type = std::locale; + using char_class_type = uint64_t; + + #if (defined(_WIN32) || defined(_WIN64)) // MSVC class _Regex_traits + using _Uelem = CharT; + static const auto _Ch_upper = std::ctype::upper; + static const auto _Ch_alpha = std::ctype::alpha; + #endif + + CharT translate(CharT c) const { + return c; + } + + CharT translate_nocase(CharT c) const { + return unicode_tolower(c); + } + + template + string_type transform(It first, It last) const { + GGML_ASSERT(false); //TODO: not needed ? 
+ return {first, last}; //TODO: not tested + } + + template + string_type transform_primary(It first, It last) const { + (void) first; + (void) last; + GGML_ASSERT((uint32_t) *first < MAX_CODEPOINTS); // check valid codepoint + return {}; + } + + template + string_type lookup_collatename(It first, It last) const { + (void) last; + GGML_ASSERT(*first & (1 << 31)); + return {*first}; + } + + template + char_class_type lookup_classname(It first, It last, bool icase = false) const { + (void) last; + (void) icase; + const uint32_t encoded = *first; + codepoint_categ categ = {}; + switch(encoded) { + case 's': + case 'S': // negation is internally tracked + categ.set_flag(codepoint_categ::WHITESPACES); + return categ.expand_bits(); + case 'w': + case 'W': // negation is internally tracked + categ.set_flag(codepoint_categ::WORDS); + return categ.expand_bits(); + case 'd': + case 'D': // negation is internally tracked + categ.set_flag(codepoint_categ::DIGITS); + return categ.expand_bits(); + default: { // unicode category \p{Xx} encoded in codepoint + GGML_ASSERT(encoded & (1 << 31)); // make sure its our custom codepoint encoding the category + const bool negated = encoded & (1 << 30); // negation of 'character class expression' are not internally tracked + categ = {(uint16_t) encoded}; + return ((uint64_t) negated << 63) | categ.expand_bits(false); + } + } + } + + bool isctype(CharT c, char_class_type mask) const { + const bool negated = mask & (1llu << 63); + mask &= unicode_cpt_category(c).expand_bits(); + return negated ^ (bool) mask; + } + + int value(CharT c, int radix) const { // char to int value + switch (radix) { + case 8: return ('0' <= c && c <= '7') ? (int)c - '0' : -1; + case 10: return ('0' <= c && c <= '9') ? (int)c - '0' : -1; + case 16: return ('0' <= c && c <= '9') ? (int)c - '0' : (('A' <= c && c <= 'F') ? 
(int)c - 'A' + 10 : -1); + default: return -1; + } + } + + const locale_type & imbue(const locale_type &) { // set locale //NOTE: ignoring locales + return std::locale::classic(); + } + + const locale_type & getloc() const { // get locale //NOTE: ignoring locales + return std::locale::classic(); + } + }; } -// use std::regex to split the text -static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::regex expr(regex_expr); +static std::vector unicode_regex_prepare(const std::string & regex) { + std::vector regex_cpts; + regex_cpts.reserve(regex.size() * 12 / 10); // estimate +20% + + size_t offset = 0; + int inside_square = 0; + bool any_positive = false; + bool any_negative = false; + + const size_t size = regex.size(); + while (offset < size) { + inside_square += regex[offset] == '['; + inside_square -= regex[offset] == ']'; + GGML_ASSERT(inside_square >= 0); + if (!inside_square) { + any_positive = false; + any_negative = false; + } + + if (regex[offset] == '\\') { + const size_t i = offset + 1; + if (regex[i] == 'p' || regex[i] == 'P') { + // convert \p{Xx} to custom 'character class expression' [:Xy:] + if (regex[i + 1] == '{' && regex[i + 2] && regex[i + 3]) { + codepoint_categ categ = {}; + if (regex[i + 3] == '}') { + categ = codepoint_categ::from_chars(regex[i + 2]); + offset += 5; + } else if (regex[i + 3] != '}' && regex[i + 4] == '}') { + categ = codepoint_categ::from_chars(regex[i + 2], regex[i + 3]); + offset += 6; + } + bool negated = regex[i] == 'P'; + any_positive |= !negated; + any_negative |= negated; + GGML_ASSERT(any_positive != any_negative); //BUG: can not mix 'p' and 'P' inside [] + GGML_ASSERT(sizeof(categ) <= 2); + // encoded category in 32 bits codepoint + uint32_t cpt_categ = (1 << 31) | (negated << 30) | categ.encoded; + if (inside_square) { + regex_cpts.insert(regex_cpts.end(), {'[', ':', cpt_categ, ':', ']'}); + } else { + 
regex_cpts.insert(regex_cpts.end(), {'[', '[', ':', cpt_categ, ':', ']', ']'}); + } + continue; + } + } + } + + regex_cpts.push_back(unicode_cpt_from_utf8(regex, offset)); + } + + return regex_cpts; +} + +// use std::basic_regex to split the text codepoints +static std::vector unicode_regex_split_stl(const std::vector & text_cpts, const std::vector & regex_cpts, const std::vector & offsets) { + GGML_ASSERT(sizeof(std::codepoint) == sizeof(uint32_t)); + using regex_type = std::basic_regex; + using iter_type = std::regex_iterator; + + const std::codepoint * text_data = (const std::codepoint *) text_cpts.data(); + const std::codepoint * regex_data = (const std::codepoint *) regex_cpts.data(); + regex_type regex(regex_data, regex_data+regex_cpts.size()); + const iter_type end; + std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; + bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size for (auto offset : offsets) { - std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); - std::cregex_iterator end; - + iter_type it(text_data, text_data + offset, regex); int64_t start_idx = 0; while (it != end) { - std::cmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); + if (it->position() > start_idx) { + bpe_offsets.emplace_back(it->position() - start_idx); } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); + bpe_offsets.emplace_back(it->length()); + start_idx = it->position() + it->length(); ++it; } if (start_idx < (int64_t) offset) { bpe_offsets.emplace_back(offset - start_idx); } - start += offset; - } - - return bpe_offsets; -} - -static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::vector bpe_offsets; - - if (regex_expr == 
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { - bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); - } else if ( - regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || - regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { - - bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + text_data += offset; } return bpe_offsets; @@ -612,19 +785,46 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { return result; } -codepoint_flags unicode_cpt_flags(const uint32_t cp) { - static const codepoint_flags undef(codepoint_flags::UNDEFINED); - static const auto cpt_flags = unicode_cpt_flags_array(); - return cp < cpt_flags.size() ? cpt_flags[cp] : undef; +codepoint_categ unicode_cpt_category(const uint32_t cp) { + static const std::vector cpt_categs = [] { + std::vector cpt_categs(MAX_CODEPOINTS, codepoint_categ::UNDEF); + uint32_t cpt = 0; + for (uint16_t rle : unicode_rle_codepoints_categs) { + const uint32_t index = rle & 31; + const uint32_t count = rle >> 5; + auto categ = codepoint_categ::from_index(index); + //printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); + categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd()); // \d --> \p{Nd} + categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N()); // \w --> \p{L} \p{N} _ + for (uint32_t i = 0; i <= count; ++i) { + cpt_categs[cpt++] = categ; + } + } + GGML_ASSERT(cpt == MAX_CODEPOINTS); + + cpt_categs['_'].set_flag(codepoint_categ::WORDS); // \w --> \p{L} \p{N} _ + + for (auto p : unicode_ranges_whitespace) { + for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) { + cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES); + } + } + + //for (auto &range : unicode_ranges_nfd) { // start, 
last, nfd + // cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD); + //} + + return cpt_categs; + }(); + return cp < cpt_categs.size() ? cpt_categs[cp] : codepoint_categ{}; } -codepoint_flags unicode_cpt_flags(const std::string & utf8) { - static const codepoint_flags undef(codepoint_flags::UNDEFINED); +codepoint_categ unicode_cpt_category(const std::string & utf8) { if (utf8.empty()) { - return undef; // undefined + return codepoint_categ{}; // undefined } size_t offset = 0; - return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset)); + return unicode_cpt_category(unicode_cpt_from_utf8(utf8, offset)); } std::string unicode_byte_to_utf8(uint8_t byte) { @@ -642,171 +842,28 @@ uint32_t unicode_tolower(uint32_t cp) { return it == unicode_map_lowercase.end() ? cp : it->second; } -std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { - // unicode categories - static const std::map k_ucat_enum = { - { "\\p{N}", codepoint_flags::NUMBER }, - { "\\p{L}", codepoint_flags::LETTER }, - { "\\p{P}", codepoint_flags::PUNCTUATION }, - }; - - static const std::map k_ucat_cpt = { - { codepoint_flags::NUMBER, 0xD1 }, - { codepoint_flags::LETTER, 0xD2 }, - { codepoint_flags::PUNCTUATION, 0xD3 }, - }; - - static const std::map k_ucat_map = { - { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9 - { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z - { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} - }; - - // compute collapsed codepoints only if needed by at least one regex - bool need_collapse = false; - for (auto & regex_expr : regex_exprs) { - // search for unicode categories - for (const auto & ucat : k_ucat_enum) { - if (std::string::npos != regex_expr.find(ucat.first)) { - need_collapse = true; - break; - } - } - } - - const auto cpts = unicode_cpts_from_utf8(text); - - // generate a "collapsed" representation of the text, where all 
codepoints are replaced by a single byte - // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 - std::string text_collapsed; - if (need_collapse) { - // collapse all unicode categories - text_collapsed.resize(cpts.size()); - - for (size_t i = 0; i < cpts.size(); ++i) { - // keep single-byte codepoints as is - if (cpts[i] < 128) { - text_collapsed[i] = cpts[i]; - continue; - } - - const auto flags = unicode_cpt_flags(cpts[i]); - - if (flags.is_whitespace) { - //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does. - //text_collapsed[i] = (char) 0x85; // as whitespace fallback - text_collapsed[i] = (char) 0x0B; // as whitespace fallback - } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); - } else { - text_collapsed[i] = (char) 0xD0; // fallback - } - } - } - - std::vector bpe_offsets = { cpts.size() }; +std::vector unicode_regex_split(const std::string & text_utf8, const std::vector & regex_exprs) { + const std::vector cpts = unicode_cpts_from_utf8(text_utf8); + std::vector offsets = { cpts.size() }; for (auto & regex_expr : regex_exprs) { // first, see if we have an efficient custom regex implementation - auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets); + auto tmp = unicode_regex_split_custom(text_utf8, regex_expr, offsets); if (!tmp.empty()) { - bpe_offsets = std::move(tmp); + offsets = std::move(tmp); continue; } - // fallback to general-purpose std::regex / std::wregex - try { - // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category - // with the corresponding collapsed representation - bool use_collapsed = false; - for (auto & ucat : k_ucat_enum) { - if (std::string::npos != regex_expr.find(ucat.first)) { - use_collapsed = true; - break; - } - } - - if (use_collapsed) { - // sanity-check that the original regex does not contain any non-ASCII characters - const 
auto cpts_regex = unicode_cpts_from_utf8(regex_expr); - for (size_t i = 0; i < cpts_regex.size(); ++i) { - if (cpts_regex[i] >= 128) { - throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported"); - } - } - - // generate a collapsed representation of the regex - std::string regex_expr_collapsed; - - // track if we are inside [], because nested [] are not allowed - bool inside = false; - for (size_t i = 0; i < regex_expr.size(); ++i) { - if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) { - regex_expr_collapsed += '['; - inside = true; - continue; - } - - if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') { - regex_expr_collapsed += ']'; - inside = false; - continue; - } - - if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() && - regex_expr[i + 1] == 'p' && - regex_expr[i + 2] == '{' && - regex_expr[i + 4] == '}') { - const std::string pat = regex_expr.substr(i, 5); - if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { - if (!inside) { - regex_expr_collapsed += '['; - } - regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat)); - regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat)); - if (!inside) { - regex_expr_collapsed += ']'; - } - i += 4; - continue; - } - } - - regex_expr_collapsed += regex_expr[i]; - } - - //printf("text_collapsed: %s\n", text_collapsed.c_str()); - //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str()); - bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); - } else { - // no unicode category used, we can use std::wregex directly - const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); - - // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback - std::wstring wtext(cpts.begin(), cpts.end()); - for (size_t i = 0; i < wtext.size(); ++i) { - if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) { - wtext[i] = 0x0B; - } - } - - //printf("text: %s\n", 
text.c_str()); - //printf("regex_expr: %s\n", regex_expr.c_str()); - bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); - } - } catch (std::regex_error & e) { - fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str()); - fprintf(stderr, "Regex error: %s\n", e.what()); - throw std::runtime_error("Failed to process regex"); - } + const auto regex_cpts = unicode_regex_prepare(regex_expr); + offsets = unicode_regex_split_stl(cpts, regex_cpts, offsets); } std::vector bpe_words; - bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size + bpe_words.reserve(offsets.size()); // reserve memory for the approximate size size_t start = 0; - for (size_t & offset : bpe_offsets) { + for (size_t & offset : offsets) { bpe_words.emplace_back(); for (size_t i = start; i < start + offset; ++i) { bpe_words.back() += unicode_cpt_to_utf8(cpts[i]); diff --git a/src/unicode.h b/src/unicode.h index 008532a242ab8..f2c3e71479975 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -1,51 +1,185 @@ #pragma once #include +#include +#include #include #include +#include +#include -// TODO: prefix all symbols with "llama_" - -struct codepoint_flags { - enum { - UNDEFINED = 0x0001, - NUMBER = 0x0002, // regex: \p{N} - LETTER = 0x0004, // regex: \p{L} - SEPARATOR = 0x0008, // regex: \p{Z} - ACCENT_MARK = 0x0010, // regex: \p{M} - PUNCTUATION = 0x0020, // regex: \p{P} - SYMBOL = 0x0040, // regex: \p{S} - CONTROL = 0x0080, // regex: \p{C} - MASK_CATEGORIES = 0x00FF, +struct codepoint_categ { + // 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory + enum _category : uint16_t { + UNDEF = 0, // \p{Cn} Undefined + C = 1 << (0 + 3), // \p{C} Control + L = 1 << (1 + 3), // \p{L} Letter + M = 1 << (2 + 3), // \p{M} Mark + N = 1 << (3 + 3), // \p{N} Number + P = 1 << (4 + 3), // \p{P} Punctuation + S = 1 << (5 + 3), // \p{S} Symbol + Z = 1 << (6 + 3), // \p{Z} Separator + Cc = C | 1, // \p{Cc} Control + Cf = C | 2, // \p{Cf} 
Format + Co = C | 3, // \p{Co} Private Use + Cs = C | 4, // \p{Cs} Surrrogate + Ll = L | 1, // \p{Ll} Lowercase Letter + Lm = L | 2, // \p{Lm} Modifier Letter + Lo = L | 3, // \p{Lo} Other Letter + Lt = L | 4, // \p{Lt} Titlecase Letter + Lu = L | 5, // \p{Lu} Uppercase Letter + Mc = M | 1, // \p{Mc} Spacing Mark + Me = M | 2, // \p{Me} Enclosing Mark + Mn = M | 3, // \p{Mn} Nonspacing Mark + Nd = N | 1, // \p{Nd} Decimal Number + Nl = N | 2, // \p{Nl} Letter Number + No = N | 3, // \p{No} Other Number + Pc = P | 1, // \p{Pc} Connector Punctuation + Pd = P | 2, // \p{Pd} Dash Punctuation + Pe = P | 3, // \p{Pe} Close Punctuation + Pf = P | 4, // \p{Pf} Final Punctuation + Pi = P | 5, // \p{Pi} Initial Punctuation + Po = P | 6, // \p{Po} Other Punctuation + Ps = P | 7, // \p{Ps} Open Punctuation + Sc = S | 1, // \p{Sc} Currency Symbol + Sk = S | 2, // \p{Sk} Modifier Symbol + Sm = S | 3, // \p{Sm} Math Symbol + So = S | 4, // \p{So} Other Symbol + Zl = Z | 1, // \p{Zl} Line Separator + Zp = Z | 2, // \p{Zp} Paragraph Separator + Zs = Z | 3, // \p{Zs} Space Separator + SUBMASK = (1 << 3) - 1, // 3 bits 0b000000'0000000'111 + MASK = (1 << 10) - 1, // 7+3 bits 0b000000'1111111'111 }; - // codepoint type - uint16_t is_undefined : 1; - uint16_t is_number : 1; // regex: \p{N} - uint16_t is_letter : 1; // regex: \p{L} - uint16_t is_separator : 1; // regex: \p{Z} - uint16_t is_accent_mark : 1; // regex: \p{M} - uint16_t is_punctuation : 1; // regex: \p{P} - uint16_t is_symbol : 1; // regex: \p{S} - uint16_t is_control : 1; // regex: \p{C} - // helper flags - uint16_t is_whitespace : 1; // regex: \s - uint16_t is_lowercase : 1; - uint16_t is_uppercase : 1; - uint16_t is_nfd : 1; - - // decode from uint16 - inline codepoint_flags(const uint16_t flags=0) { - *reinterpret_cast(this) = flags; + enum _flags : uint16_t { + WHITESPACES = (1 << 10), // regex: \s + WORDS = (1 << 11), // regex: \w + DIGITS = (1 << 12), // regex: \d + //Norm NFD/NFC = ..., + }; + + inline 
codepoint_categ(const uint16_t categ=0) : encoded{categ} {} + + inline void set_flag(_flags flags, bool value = true) { + flags = (_flags) (flags & ~MASK); // do not modify category bits + encoded = value ? (encoded | flags) : (encoded & ~flags); + } + + inline uint16_t get_category() const { return encoded & MASK; } + + inline bool is_undefined() const { return !encoded; } + inline bool is_defined() const { return encoded; } + + inline uint16_t is_whitespace() const { return encoded & WHITESPACES; } + inline uint16_t is_word() const { return encoded & WORDS; } + inline uint16_t is_digit() const { return encoded & DIGITS; } + + inline uint16_t is_C() const { return encoded & C; } + inline uint16_t is_L() const { return encoded & L; } + inline uint16_t is_M() const { return encoded & M; } + inline uint16_t is_N() const { return encoded & N; } + inline uint16_t is_P() const { return encoded & P; } + inline uint16_t is_S() const { return encoded & S; } + inline uint16_t is_Z() const { return encoded & Z; } + + inline bool is_Cc() const { return (encoded & MASK) == Cc; } + inline bool is_Cf() const { return (encoded & MASK) == Cf; } + inline bool is_Co() const { return (encoded & MASK) == Co; } + inline bool is_Cs() const { return (encoded & MASK) == Cs; } + inline bool is_Ll() const { return (encoded & MASK) == Ll; } + inline bool is_Lm() const { return (encoded & MASK) == Lm; } + inline bool is_Lo() const { return (encoded & MASK) == Lo; } + inline bool is_Lt() const { return (encoded & MASK) == Lt; } + inline bool is_Lu() const { return (encoded & MASK) == Lu; } + inline bool is_Mc() const { return (encoded & MASK) == Mc; } + inline bool is_Me() const { return (encoded & MASK) == Me; } + inline bool is_Mn() const { return (encoded & MASK) == Mn; } + inline bool is_Nd() const { return (encoded & MASK) == Nd; } + inline bool is_Nl() const { return (encoded & MASK) == Nl; } + inline bool is_No() const { return (encoded & MASK) == No; } + inline bool is_Pc() const { 
return (encoded & MASK) == Pc; } + inline bool is_Pd() const { return (encoded & MASK) == Pd; } + inline bool is_Pe() const { return (encoded & MASK) == Pe; } + inline bool is_Pf() const { return (encoded & MASK) == Pf; } + inline bool is_Pi() const { return (encoded & MASK) == Pi; } + inline bool is_Po() const { return (encoded & MASK) == Po; } + inline bool is_Ps() const { return (encoded & MASK) == Ps; } + inline bool is_Sc() const { return (encoded & MASK) == Sc; } + inline bool is_Sk() const { return (encoded & MASK) == Sk; } + inline bool is_Sm() const { return (encoded & MASK) == Sm; } + inline bool is_So() const { return (encoded & MASK) == So; } + inline bool is_Zl() const { return (encoded & MASK) == Zl; } + inline bool is_Zp() const { return (encoded & MASK) == Zp; } + inline bool is_Zs() const { return (encoded & MASK) == Zs; } + + inline uint64_t expand_bits(const bool add_categ=true) const { // one bit for each category/subcateory and flags + const uint32_t subindex = encoded & SUBMASK; + const uint64_t bits = (encoded & MASK) >> 3; + const uint64_t flags = encoded >> 10; + return (flags << (7 * 8)) | (bits << (7 * subindex)) | (bits * add_categ); } - inline uint16_t as_uint() const { - return *reinterpret_cast(this); + inline bool is_in_range(const codepoint_categ other) const { // this.first <= other <= this.last + if (encoded & SUBMASK) { + return encoded == other.encoded; // no range + } + if (encoded & MASK) { + return encoded == (other.encoded & ~SUBMASK); // from 0bffffff'ccccccc'000 to 0bffffff'ccccccc'111 + } + return encoded == (other.encoded & ~MASK); // from 0bffffff'0000000'000 to 0bffffff'1111111'111 } - inline uint16_t category_flag() const { - return this->as_uint() & MASK_CATEGORIES; + inline bool operator == (const codepoint_categ other) const { + return encoded == other.encoded; } + + inline bool operator != (const codepoint_categ other) const { + return encoded != other.encoded; + } + + const char * c_str() const { + static const 
std::map map = { + {UNDEF, "UNDEF"}, {C, "C"}, {L, "L"}, {M, "M"}, {N, "N"}, {P, "P"}, {S, "S"}, {Z, "Z"}, + {Cc, "Cc"}, {Cf, "Cf"}, {Co, "Co"}, {Cs, "Cs"}, {Ll, "Ll"}, {Lm, "Lm"}, {Lo, "Lo"}, {Lt, "Lt"}, + {Lu, "Lu"}, {Mc, "Mc"}, {Me, "Me"}, {Mn, "Mn"}, {Nd, "Nd"}, {Nl, "Nl"}, {No, "No"}, {Pc, "Pc"}, + {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"}, + {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"}, + }; + const auto it = map.find(encoded & MASK); + return it == map.end() ? "INVALID" : it->second; + } + + static codepoint_categ from_index(int index) { + static const std::array table = { + UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF + }; + return (size_t)index < table.size() ? table[index] : table[0]; + } + + static codepoint_categ from_chars(const char categ, const char subcateg = '\0') { + auto _subindex = [] (const char subcateg, const char subcategs[]) -> uint16_t { + if (!subcateg) { + return 0; + } + const char * p = strchr(subcategs, subcateg); + GGML_ASSERT(p); + return (uint16_t) (p - subcategs + 1); + }; + switch(categ) { + case 'C': if(subcateg == 'n') return 0; // undefined + return C | _subindex(subcateg, "cfos" ); + case 'L': return L | _subindex(subcateg, "lmotu" ); + case 'M': return M | _subindex(subcateg, "cen" ); + case 'N': return N | _subindex(subcateg, "dlo" ); + case 'P': return P | _subindex(subcateg, "cdefios"); + case 'S': return S | _subindex(subcateg, "ckmo" ); + case 'Z': return Z | _subindex(subcateg, "lps" ); + default: GGML_ABORT("invalid category character"); + } + } + + uint16_t encoded; }; size_t unicode_len_utf8(char src); @@ -56,8 +190,8 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8); std::vector unicode_cpts_normalize_nfd(const std::vector & cpts); -codepoint_flags unicode_cpt_flags(const uint32_t cp); -codepoint_flags unicode_cpt_flags(const std::string & utf8); 
def get_vocab(self, detokenize=False) -> list[str]:
    """Return the text of every token, indexed by token id.

    Args:
        detokenize: When true, each id is passed through ``self.detokenize``
            (keeping special tokens and unparsing them). When false, the
            token text is read with ``llama_token_get_text`` and decoded as
            UTF-8.

    Returns:
        A list with one string per id in ``range(llama_n_vocab(model))``.
    """
    num_tokens = self.lib.llama_n_vocab(self.model)
    # The mode is fixed for the whole call, so branch once instead of per token.
    if detokenize:
        return [
            self.detokenize([token_id], remove_special=False, unparse_special=True)
            for token_id in range(num_tokens)
        ]
    vocab: list[str] = []
    for token_id in range(num_tokens):  # named token_id so the builtin id() is not shadowed
        raw = self.lib.llama_token_get_text(self.model, token_id)
        # replace errors with '\uFFFD'
        vocab.append(str(cast(bytes, self.ffi.string(raw)), encoding="utf-8", errors="replace"))
    return vocab
def generator_byte_tokens() -> Iterator[str]:
    """Brute force check common byte encoding.

    For each delimiter pair ``<>``, ``[]``, ``()`` and a bare backslash
    prefix, yields every byte value 0..255 in eleven numeric spellings
    (decimal, hex, ``x``- and ``0x``-prefixed hex, in both letter cases,
    unpadded and zero-padded) followed by the raw character ``chr(value)``.
    """
    # One entry per numeric spelling, in the order the texts are produced.
    spellings = (
        "{}",
        "{:x}",
        "{:X}",
        "x{:x}",
        "x{:X}",
        "x{:02x}",
        "x{:02X}",
        "0x{:x}",
        "0x{:X}",
        "0x{:02x}",
        "0x{:02X}",
    )
    delimiters = (("<", ">"), ("[", "]"), ("(", ")"), ("\\", ""))
    for opening, closing in delimiters:
        for spelling in spellings:
            for value in range(256):
                yield opening + spelling.format(value) + closing
        # finally the byte as a literal character
        for value in range(256):
            yield opening + chr(value) + closing
return True @@ -434,9 +472,11 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_decode1 = 0 t_decode2 = 0 t_start = time.perf_counter() + total_tests = 0 + failing_texts = set() encode_errors = 0 decode_errors = 0 - MAX_ERRORS = 10 + MAX_ERRORS = 5 logger.info("%s: %s" % (generator.__qualname__, "ini")) for text in generator: @@ -455,31 +495,100 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool: t_encode2 += t2 - t1 t_decode1 += t3 - t2 t_decode2 += t4 - t3 - if encode_errors < MAX_ERRORS and ids1 != ids2: - i = find_first_mismatch(ids1, ids2) - ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] - ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] - logger.error(" Expected: " + str(ids1)) - logger.error(" Result: " + str(ids2)) - encode_errors += 1 - logger.error(f" {encode_errors=}") - if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2): - i = find_first_mismatch(text1, text2) - text1 = list(text1[max(0, i - 2) : i + 5 + 1]) - text2 = list(text2[max(0, i - 2) : i + 5 + 1]) - logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1)) - logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2)) - decode_errors += 1 - logger.error(f" {decode_errors=}") - if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: - logger.error(f" EXIT: {encode_errors=} {decode_errors=}") - # raise Exception() - break + total_tests += 1 + # compare + encode_ok = ids1 == ids2 + decode_ok = check_detokenizer(text, text1, text2) + if not (encode_ok and decode_ok): + def _compare(text: str): + ids1 = tokenizer1.encode(text) + ids2 = tokenizer2.encode(text) + text1 = tokenizer1.decode(ids1) + text2 = tokenizer2.decode(ids1) + encode_ok = ids1 == ids2 + decode_ok = check_detokenizer(text, text1, text2) + ok = encode_ok and decode_ok + return ok, ids1, ids2, text1, text2 + # binary search upper and lower failing range + a, b = 0, len(text) + step = b + while step > 1: + step = (step + 1) // 2 + t = max(a, b - step) + if 
def compare_vocabs(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp):
    """Compare the vocabularies of both tokenizers id by id and log mismatches.

    The comparison runs twice: once on the raw token texts
    (``get_vocab(False)``) and once on the detokenized texts
    (``get_vocab(True)``). ``tokenizer1`` supplies the expected text and
    ``tokenizer2`` the result.

    Before an entry is counted as an error, ``"[UNUSED_TOKEN_"`` is rewritten
    to ``"[PAD"`` on both sides, and a ``[PAD...`` entry is accepted when the
    other vocabulary has no entry at that id. At most ``MAX_PRINT_ERRORS``
    mismatches are logged per pass, followed by the total count.

    Nothing is returned or raised for mismatches; they are only logged.
    """

    MAX_PRINT_ERRORS = 10

    logger.info("compare_vocabs: ini")

    t_start = time.perf_counter()

    for detokenize in (False, True):
        vocab1 = tokenizer1.get_vocab(detokenize)
        vocab2 = tokenizer2.get_vocab(detokenize)
        if vocab1 == vocab2:
            continue
        num_errors = 0
        for i in range(max(len(vocab1), len(vocab2))):
            # None marks "this vocabulary has no entry for id i"
            text1 = vocab1[i] if i < len(vocab1) else None
            text2 = vocab2[i] if i < len(vocab2) else None
            if text1 == text2:
                continue
            # is "[UNUSED_TOKEN_" and "[PAD" valid for all models ?  #TODO: use toktypes
            if text1 is not None:
                text1 = text1.replace("[UNUSED_TOKEN_", "[PAD")
            if text2 is not None:
                text2 = text2.replace("[UNUSED_TOKEN_", "[PAD")
            # a padding entry on one side matches a missing entry on the other
            if text1 is None and (text2 or "").startswith('[PAD'):
                text2 = None
            if text2 is None and (text1 or "").startswith('[PAD'):
                text1 = None
            if text1 == text2:
                continue
            num_errors += 1
            # num_errors was already incremented, so "<=" prints exactly the
            # first MAX_PRINT_ERRORS mismatches ("<" stopped one short).
            if num_errors <= MAX_PRINT_ERRORS:
                logger.error(f" {detokenize=} id={i} expected={repr(text1)} result={repr(text2)}")
        if num_errors:
            logger.error(f" {num_errors=}")

    t_total = time.perf_counter() - t_start
    logger.info(f"compare_vocabs: end, {t_total=:.3f}")
# compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000)) - # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000)) + compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000)) tokenizer2.model.free() @@ -533,21 +645,19 @@ def main(argv: list[str] | None = None): "phi-3", # SPM "gemma", # SPM "gemma-2", # SPM - "baichuan", # SPM + # "baichuan", # SPM "bert-bge", # WPM "jina-v2-en", # WPM + # "t5", # UGM "llama-bpe", # BPE "phi-2", # BPE "deepseek-llm", # BPE "deepseek-coder", # BPE "falcon", # BPE - "mpt", # BPE "starcoder", # BPE "gpt-2", # BPE "stablelm2", # BPE "refact", # BPE - "qwen2", # BPE - "olmo", # BPE "jina-v2-es", # BPE "jina-v2-de", # BPE "smaug-bpe", # BPE @@ -555,6 +665,14 @@ def main(argv: list[str] | None = None): "jina-v2-code", # BPE "viking", # BPE "jais", # BPE + "codeshell", # BPE + "tekken", # BPE + "smollm", # BPE + "mpt", # BPE NFC + "command-r", # BPE NFC + "qwen2", # BPE NFC + "olmo", # BPE NFC + "gpt-neox", # BPE NFC ] logger.info("=" * 50)