diff --git a/devel/1113.md b/devel/1113.md index 9923df1317..46b47cfd09 100644 --- a/devel/1113.md +++ b/devel/1113.md @@ -8,6 +8,11 @@ - `src/Data/String/converter.hpp` - `src/Data/String/converter.cpp` - `tests/Data/String/converter_test.cpp` +- `lolly/lolly/data/herk.hpp` +- `lolly/lolly/data/herk.cpp` +- `lolly/tests/lolly/data/herk_test.cpp` +- `src/Scheme/L2/glue_lolly.lua` +- `src/Scheme/L3/glue_moebius.lua` ## 如何测试 @@ -15,6 +20,8 @@ ```bash xmake b converter_test xmake r converter_test +xmake b lolly_tests +xmake test lolly_tests/herk_test ``` ### 预期输出 @@ -31,6 +38,8 @@ Totals: 6 passed, 0 failed, 0 skipped, 0 blacklisted, 9ms ********* Finished testing of TestConverter ********* ``` +lolly 的 `herk_test` 应同样全部通过。 + ## 性能评估 使用临时基准测试(10000 次迭代,混合 ASCII 与 CJK/特殊字符输入)对比修改前后: @@ -46,9 +55,11 @@ Totals: 6 passed, 0 failed, 0 skipped, 0 blacklisted, 9ms 提交前执行以下最少步骤: ```bash -gf fmt --changed-since=main +gf fmt xmake b converter_test xmake r converter_test +xmake b lolly_tests +xmake test lolly_tests/herk_test ``` ## What @@ -73,5 +84,12 @@ Herk 编码是 TMU 序列化的核心编码,其转换正确性直接影响文 2. 重写 `utf8_to_herk` 与 `herk_to_utf8`,仅使用静态表完成转换。 3. 从 `converter_rep::load()` 中删除 `Hex-Cork` 相关分支。 4. 保留 `tests/Data/String/converter_test.cpp` 中已有的完整单元测试作为回归验证。 +5. 将 `utf8_to_herk` 与 `herk_to_utf8` 的实现迁移到 `lolly/lolly/data/herk.cpp`, + 在 `lolly::data` 命名空间中提供相同签名的函数;`src/Data/String/converter.cpp` + 保留为对 `lolly::data` 的薄封装,保持现有 C++ 调用点不变。 +6. 将对应的 Scheme glue(`utf8->herk`、`herk->utf8`)从 `glue_moebius.lua` + 迁移到 `glue_lolly.lua`,使其绑定到 `lolly::data::utf8_to_herk` 与 + `lolly::data::herk_to_utf8`。 +7. 
在 `lolly/tests/lolly/data/herk_test.cpp` 中添加针对 lolly 实现的单元测试。
 
 整理时间:2026/06/25
diff --git a/lolly/lolly/data/herk.cpp b/lolly/lolly/data/herk.cpp
new file mode 100644
index 0000000000..d943fb1490
--- /dev/null
+++ b/lolly/lolly/data/herk.cpp
@@ -0,0 +1,208 @@
+
+/******************************************************************************
+ * MODULE     : herk.cpp
+ * DESCRIPTION: Herk encoding conversions
+ * COPYRIGHT  : (C) 2026 Darcy Shen
+ *******************************************************************************
+ * This software falls under the GNU general public license version 3 or later.
+ * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
+ * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
+ ******************************************************************************/
+
+#include "herk.hpp"
+#include "numeral.hpp"
+#include "unicode.hpp"
+
+namespace lolly {
+namespace data {
+
+// Unicode code point for each Herk byte value (index = Herk byte).
+static const int herk_to_utf8_code[256]= {
+  0x0060, 0x00B4, 0x02C6, 0x02DC, 0x00A8, 0x02DD, 0x02DA, 0x02C7, 0x02D8,
+  0x00AF, 0x02D9, 0x00B8, 0x02DB, 0x201A, 0x2039, 0x203A, 0x201C, 0x201D,
+  0x201E, 0x00AB, 0x00BB, 0x2013, 0x2014, 0x200B, 0x2080, 0x0131, 0x0237,
+  0xFB00, 0xFB01, 0xFB02, 0xFB03, 0xFB04, 0x0020, 0x0021, 0x0022, 0x0023,
+  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C,
+  0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
+  0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E,
+  0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+  0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050,
+  0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
+  0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x2018, 0x0061, 0x0062,
+  0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B,
+  0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
+  0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D,
+  0x007E, 0x00AD, 0x0102, 0x0104, 0x0106, 0x010C, 0x010E, 0x011A, 0x0118,
+  0x011E, 0x0139, 0x013D, 0x0141, 0x0143, 0x0147, 0x014A, 0x0150, 0x0154,
+  0x0158, 0x015A, 0x0160, 0x015E, 0x0164, 0x0162, 0x0170, 0x016E, 0x0178,
+  0x0179, 0x017D, 0x017B, 0x0132, 0x0130, 0x0111, 0x00A7, 0x0103, 0x0105,
+  0x0107, 0x010D, 0x010F, 0x011B, 0x0119, 0x011F, 0x013A, 0x013E, 0x0142,
+  0x0144, 0x0148, 0x014B, 0x0151, 0x0155, 0x0159, 0x015B, 0x0161, 0x015F,
+  0x0165, 0x0163, 0x0171, 0x016F, 0x00FF, 0x017A, 0x017E, 0x017C, 0x0133,
+  0x00A1, 0x00BF, 0x00A3, 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5,
+  0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE,
+  0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0152,
+  0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x1E9E, 0x00E0,
+  0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9,
+  0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x00F0, 0x00F1, 0x00F2,
+  0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0153, 0x00F8, 0x00F9, 0x00FA, 0x00FB,
+  0x00FC, 0x00FD, 0x00FE, 0x00DF,
+};
+
+// Herk byte for each code point U+0000..U+00FF (index = code point); -1 marks
+// code points that have no single-byte Herk form in this table.
+static const int utf8_to_herk_byte[256]= {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  -1, -1, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+  45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+  60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+  75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+  90, 91, 92, 93, 94, 95, 0, 97, 98, 99, 100, 101, 102, 103, 104,
+  105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+  120, 121, 122, 123, 124, 125, 126, -1, -1, -1, -1, -1, -1, -1, -1,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 189, -1, 191, -1,
+  -1, -1, 159, 4, -1, -1, 19, -1, 127, -1, 9, -1, -1, -1, -1,
+  1, -1, -1, -1, 11, -1, -1, 20, -1, -1, -1, 190, 192, 193, 194,
+  195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+  210, 211, 212, 213, 214, -1, 216, 217, 218, 219, 220, 221, 222, 255, 224,
+  225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+  240, 241, 242, 243, 244, 245, 246, -1, 248, 249, 250, 251, 252, 253, 254,
+  184,
+};
+
+// {code point, Herk byte} pairs for code points above U+00FF, kept sorted by
+// code point so that lookup_utf8_to_herk_high can binary-search them.
+static const int utf8_to_herk_high[][2]= {
+  {0x0102, 0x80}, {0x0103, 0xA0}, {0x0104, 0x81}, {0x0105, 0xA1},
+  {0x0106, 0x82}, {0x0107, 0xA2}, {0x010C, 0x83}, {0x010D, 0xA3},
+  {0x010E, 0x84}, {0x010F, 0xA4}, {0x0111, 0x9E}, {0x0118, 0x86},
+  {0x0119, 0xA6}, {0x011A, 0x85}, {0x011B, 0xA5}, {0x011E, 0x87},
+  {0x011F, 0xA7}, {0x0130, 0x9D}, {0x0131, 0x19}, {0x0132, 0x9C},
+  {0x0133, 0xBC}, {0x0139, 0x88}, {0x013A, 0xA8}, {0x013D, 0x89},
+  {0x013E, 0xA9}, {0x0141, 0x8A}, {0x0142, 0xAA}, {0x0143, 0x8B},
+  {0x0144, 0xAB}, {0x0147, 0x8C}, {0x0148, 0xAC}, {0x014A, 0x8D},
+  {0x014B, 0xAD}, {0x0150, 0x8E}, {0x0151, 0xAE}, {0x0152, 0xD7},
+  {0x0153, 0xF7}, {0x0154, 0x8F}, {0x0155, 0xAF}, {0x0158, 0x90},
+  {0x0159, 0xB0}, {0x015A, 0x91}, {0x015B, 0xB1}, {0x015E, 0x93},
+  {0x015F, 0xB3}, {0x0160, 0x92}, {0x0161, 0xB2}, {0x0162, 0x95},
+  {0x0163, 0xB5}, {0x0164, 0x94}, {0x0165, 0xB4}, {0x016E, 0x97},
+  {0x016F, 0xB7}, {0x0170, 0x96}, {0x0171, 0xB6}, {0x0178, 0x98},
+  {0x0179, 0x99}, {0x017A, 0xB9}, {0x017B, 0x9B}, {0x017C, 0xBB},
+  {0x017D, 0x9A}, {0x017E, 0xBA}, {0x0237, 0x1A}, {0x02C6, 0x02},
+  {0x02C7, 0x07}, {0x02D8, 0x08}, {0x02D9, 0x0A}, {0x02DA, 0x06},
+  {0x02DB, 0x0C}, {0x02DC, 0x03}, {0x02DD, 0x05}, {0x1E9E, 0xDF},
+  {0x200B, 0x17}, {0x2013, 0x15}, {0x2014, 0x16}, {0x2018, 0x60},
+  {0x201A, 0x0D}, {0x201C, 0x10}, {0x201D, 0x11}, {0x201E, 0x12},
+  {0x2039, 0x0E}, {0x203A, 0x0F}, {0x2080, 0x18}, {0xFB00, 0x1B},
+  {0xFB01, 0x1C}, {0xFB02, 0x1D}, {0xFB03, 0x1E}, {0xFB04, 0x1F},
+};
+
+// Derived from the table itself so the count cannot drift from its contents.
+static const int utf8_to_herk_high_count=
+    (int) (sizeof (utf8_to_herk_high) / sizeof (utf8_to_herk_high[0]));
+
+/**
+ * @brief Binary-search utf8_to_herk_high for a code point.
+ * @param code The Unicode code point to look up.
+ * @return The Herk byte paired with code, or -1 if code is not in the table.
+ */
+static int
+lookup_utf8_to_herk_high (uint32_t code) {
+  int lo= 0, hi= utf8_to_herk_high_count - 1;
+  while (lo <= hi) {
+    int mid= lo + (hi - lo) / 2;
+    if (utf8_to_herk_high[mid][0] == (int) code)
+      return utf8_to_herk_high[mid][1];
+    if (utf8_to_herk_high[mid][0] < (int) code) lo= mid + 1;
+    else hi= mid - 1;
+  }
+  return -1;
+}
+
+/**
+ * @brief Append the UTF-8 encoding of a code point to r.
+ * @param r    The string that receives the encoded bytes.
+ * @param code The code point to encode.
+ *
+ * Code points at or above U+10000 need four bytes; a <#...> escape can carry
+ * such a value, so that form is emitted here as well. The value is not
+ * validated against the Unicode range.
+ */
+static inline void
+append_utf8_code (string& r, int code) {
+  if (code < 0x80) {
+    r << (char) code;
+  }
+  else if (code < 0x800) {
+    r << (char) (0xC0 | (code >> 6));
+    r << (char) (0x80 | (code & 0x3F));
+  }
+  else if (code < 0x10000) {
+    r << (char) (0xE0 | (code >> 12));
+    r << (char) (0x80 | ((code >> 6) & 0x3F));
+    r << (char) (0x80 | (code & 0x3F));
+  }
+  else {
+    r << (char) (0xF0 | (code >> 18));
+    r << (char) (0x80 | ((code >> 12) & 0x3F));
+    r << (char) (0x80 | ((code >> 6) & 0x3F));
+    r << (char) (0x80 | (code & 0x3F));
+  }
+}
+
+/**
+ * Encodes each decoded code point as one Herk byte when a table maps it.
+ * Unmapped code points below U+0020 or at/above U+0080 become <#HEX> escapes
+ * (code points below U+0010 get a leading 0); the rest are copied as-is.
+ */
+string
+utf8_to_herk (string_u8 input) {
+  int    i, n= N (input);
+  string output;
+  for (i= 0; i < n;) {
+    uint32_t code  = decode_from_utf8 (input, i);
+    int      mapped= (code <= 0xFF) ? utf8_to_herk_byte[code] : -1;
+    if (mapped == -1) mapped= lookup_utf8_to_herk_high (code);
+    if (mapped >= 0) {
+      output << string ((char) mapped, 1);
+    }
+    else {
+      if (code < 16) output << "<#0" * to_Hex (code) * ">";
+      else if (code < 32 || code >= 128) output << "<#" * to_Hex (code) * ">";
+      // NOTE(review): this branch is reached only by U+007F, which is written
+      // as byte 0x7F; herk_to_utf8_code maps 0x7F to U+00AD -- confirm intent.
+      else output << string ((char) code, 1);
+    }
+  }
+  return output;
+}
+
+/**
+ * Bytes outside <#HEX> escapes are mapped through herk_to_utf8_code; the hex
+ * digits of each escape are parsed with from_hex and encoded directly. An
+ * escape with no closing '>' extends to the end of the input.
+ */
+string_u8
+herk_to_utf8 (string input) {
+  int    start= 0, i, n= N (input);
+  string r;
+  for (i= 0; i < n; i++)
+    if (input[i] == '<' && i + 1 < n && input[i + 1] == '#') {
+      // Flush the plain bytes collected before this escape.
+      for (int j= start; j < i; j++)
+        append_utf8_code (r, herk_to_utf8_code[(unsigned char) input[j]]);
+      start= i= i + 2;
+      while (i < n && input[i] != '>')
+        i++;
+      append_utf8_code (r, from_hex (input (start, i)));
+      start= i + 1;
+    }
+  for (int j= start; j < n; j++)
+    append_utf8_code (r, herk_to_utf8_code[(unsigned char) input[j]]);
+  return r;
+}
+
+} // namespace data
+} // namespace lolly
diff --git a/lolly/lolly/data/herk.hpp b/lolly/lolly/data/herk.hpp
new file mode 100644
index 0000000000..ed32280ce6
--- /dev/null
+++ b/lolly/lolly/data/herk.hpp
@@ -0,0 +1,34 @@
+
+/******************************************************************************
+ * MODULE     : herk.hpp
+ * DESCRIPTION: Herk encoding conversions
+ * COPYRIGHT  : (C) 2026 Darcy Shen
+ *******************************************************************************
+ * This software falls under the GNU general public license version 3 or later.
+ * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
+ * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
+ ******************************************************************************/
+
+#pragma once
+
+#include "string.hpp"
+
+namespace lolly {
+namespace data {
+
+/**
+ * @brief Convert a Herk-encoded string to UTF-8.
+ * @param input The Herk-encoded string.
+ * @return The UTF-8 string.
+ */
+string_u8 herk_to_utf8 (string input);
+
+/**
+ * @brief Convert a UTF-8 string to Herk encoding.
+ * @param input The UTF-8 string.
+ * @return The Herk-encoded string.
+ */
+string utf8_to_herk (string_u8 input);
+
+} // namespace data
+} // namespace lolly
diff --git a/lolly/tests/lolly/data/herk_test.cpp b/lolly/tests/lolly/data/herk_test.cpp
new file mode 100644
index 0000000000..af89ab3af7
--- /dev/null
+++ b/lolly/tests/lolly/data/herk_test.cpp
@@ -0,0 +1,764 @@
+/******************************************************************************
+ * MODULE     : herk_test.cpp
+ * DESCRIPTION: tests on Herk encoding conversions
+ * COPYRIGHT  : (C) 2026 Darcy Shen
+ *******************************************************************************
+ * This software falls under the GNU general public license version 3 or later.
+ * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
+ * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
+ ******************************************************************************/
+
+#include "a_lolly_test.hpp"
+#include "lolly/data/herk.hpp"
+
+using lolly::data::herk_to_utf8;
+using lolly::data::utf8_to_herk;
+
+// Helper: build a single-byte Herk string, even for the NUL byte.
+static string +herk_byte (int byte) { + return string ((char) byte, 1); +} + +TEST_CASE ("herk_to_utf8_0x") { + string_eq (herk_to_utf8 (herk_byte (0x00)), "`"); // U+0060 + string_eq (utf8_to_herk ("`"), herk_byte (0x00)); // U+0060 + string_eq (herk_to_utf8 (herk_byte (0x01)), "´"); // U+00B4 + string_eq (herk_to_utf8 (herk_byte (0x02)), "ˆ"); // U+02C6 + string_eq (herk_to_utf8 (herk_byte (0x03)), "˜"); // U+02DC + string_eq (herk_to_utf8 (herk_byte (0x04)), "¨"); // U+00A8 + string_eq (herk_to_utf8 (herk_byte (0x05)), "˝"); // U+02DD + string_eq (herk_to_utf8 (herk_byte (0x06)), "˚"); // U+02DA + string_eq (herk_to_utf8 (herk_byte (0x07)), "ˇ"); // U+02C7 + string_eq (herk_to_utf8 (herk_byte (0x08)), "˘"); // U+02D8 + string_eq (herk_to_utf8 (herk_byte (0x09)), "¯"); // U+00AF + string_eq (herk_to_utf8 (herk_byte (0x0A)), "˙"); // U+02D9 + string_eq (herk_to_utf8 (herk_byte (0x0B)), "¸"); // U+00B8 + string_eq (herk_to_utf8 (herk_byte (0x0C)), "˛"); // U+02DB + string_eq (herk_to_utf8 (herk_byte (0x0D)), "‚"); // U+201A + string_eq (herk_to_utf8 (herk_byte (0x0E)), "‹"); // U+2039 + string_eq (herk_to_utf8 (herk_byte (0x0F)), "›"); // U+203A +} + +TEST_CASE ("herk_to_utf8_1x") { + string_eq (herk_to_utf8 (herk_byte (0x10)), "“"); // U+201C + string_eq (herk_to_utf8 (herk_byte (0x11)), "”"); // U+201D + string_eq (herk_to_utf8 (herk_byte (0x12)), "„"); // U+201E + string_eq (herk_to_utf8 (herk_byte (0x13)), "«"); // U+00AB + string_eq (herk_to_utf8 (herk_byte (0x14)), "»"); // U+00BB + string_eq (herk_to_utf8 (herk_byte (0x15)), "–"); // U+2013 + string_eq (herk_to_utf8 (herk_byte (0x16)), "—"); // U+2014 + string_eq (herk_to_utf8 (herk_byte (0x17)), "\xE2\x80\x8B"); // U+200B + string_eq (herk_to_utf8 (herk_byte (0x18)), "₀"); // U+2080 + string_eq (herk_to_utf8 (herk_byte (0x19)), "ı"); // U+0131 + string_eq (herk_to_utf8 (herk_byte (0x1A)), "ȷ"); // U+0237 + string_eq (herk_to_utf8 (herk_byte (0x1B)), "ff"); // U+FB00 + string_eq (herk_to_utf8 (herk_byte 
(0x1C)), "fi"); // U+FB01 + string_eq (herk_to_utf8 (herk_byte (0x1D)), "fl"); // U+FB02 + string_eq (herk_to_utf8 (herk_byte (0x1E)), "ffi"); // U+FB03 + string_eq (herk_to_utf8 (herk_byte (0x1F)), "ffl"); // U+FB04 +} + +TEST_CASE ("herk_to_utf8_2x") { + string_eq (herk_to_utf8 (herk_byte (0x20)), " "); // U+0020 + string_eq (herk_to_utf8 (herk_byte (0x21)), "!"); // U+0021 + string_eq (herk_to_utf8 (herk_byte (0x22)), "\""); // U+0022 + string_eq (herk_to_utf8 (herk_byte (0x23)), "#"); // U+0023 + string_eq (herk_to_utf8 (herk_byte (0x24)), "$"); // U+0024 + string_eq (herk_to_utf8 (herk_byte (0x25)), "%"); // U+0025 + string_eq (herk_to_utf8 (herk_byte (0x26)), "&"); // U+0026 + string_eq (herk_to_utf8 (herk_byte (0x27)), "'"); // U+0027 + string_eq (herk_to_utf8 (herk_byte (0x28)), "("); // U+0028 + string_eq (herk_to_utf8 (herk_byte (0x29)), ")"); // U+0029 + string_eq (herk_to_utf8 (herk_byte (0x2A)), "*"); // U+002A + string_eq (herk_to_utf8 (herk_byte (0x2B)), "+"); // U+002B + string_eq (herk_to_utf8 (herk_byte (0x2C)), ","); // U+002C + string_eq (herk_to_utf8 (herk_byte (0x2D)), "-"); // U+002D + string_eq (herk_to_utf8 (herk_byte (0x2E)), "."); // U+002E + string_eq (herk_to_utf8 (herk_byte (0x2F)), "/"); // U+002F +} + +TEST_CASE ("herk_to_utf8_3x") { + string_eq (herk_to_utf8 (herk_byte (0x30)), "0"); // U+0030 + string_eq (herk_to_utf8 (herk_byte (0x31)), "1"); // U+0031 + string_eq (herk_to_utf8 (herk_byte (0x32)), "2"); // U+0032 + string_eq (herk_to_utf8 (herk_byte (0x33)), "3"); // U+0033 + string_eq (herk_to_utf8 (herk_byte (0x34)), "4"); // U+0034 + string_eq (herk_to_utf8 (herk_byte (0x35)), "5"); // U+0035 + string_eq (herk_to_utf8 (herk_byte (0x36)), "6"); // U+0036 + string_eq (herk_to_utf8 (herk_byte (0x37)), "7"); // U+0037 + string_eq (herk_to_utf8 (herk_byte (0x38)), "8"); // U+0038 + string_eq (herk_to_utf8 (herk_byte (0x39)), "9"); // U+0039 + string_eq (herk_to_utf8 (herk_byte (0x3A)), ":"); // U+003A + string_eq (herk_to_utf8 
(herk_byte (0x3B)), ";"); // U+003B + string_eq (herk_to_utf8 (herk_byte (0x3C)), "<"); // U+003C + string_eq (herk_to_utf8 (herk_byte (0x3D)), "="); // U+003D + string_eq (herk_to_utf8 (herk_byte (0x3E)), ">"); // U+003E + string_eq (herk_to_utf8 (herk_byte (0x3F)), "?"); // U+003F +} + +TEST_CASE ("herk_to_utf8_4x") { + string_eq (herk_to_utf8 (herk_byte (0x40)), "@"); // U+0040 + string_eq (herk_to_utf8 (herk_byte (0x41)), "A"); // U+0041 + string_eq (herk_to_utf8 (herk_byte (0x42)), "B"); // U+0042 + string_eq (herk_to_utf8 (herk_byte (0x43)), "C"); // U+0043 + string_eq (herk_to_utf8 (herk_byte (0x44)), "D"); // U+0044 + string_eq (herk_to_utf8 (herk_byte (0x45)), "E"); // U+0045 + string_eq (herk_to_utf8 (herk_byte (0x46)), "F"); // U+0046 + string_eq (herk_to_utf8 (herk_byte (0x47)), "G"); // U+0047 + string_eq (herk_to_utf8 (herk_byte (0x48)), "H"); // U+0048 + string_eq (herk_to_utf8 (herk_byte (0x49)), "I"); // U+0049 + string_eq (herk_to_utf8 (herk_byte (0x4A)), "J"); // U+004A + string_eq (herk_to_utf8 (herk_byte (0x4B)), "K"); // U+004B + string_eq (herk_to_utf8 (herk_byte (0x4C)), "L"); // U+004C + string_eq (herk_to_utf8 (herk_byte (0x4D)), "M"); // U+004D + string_eq (herk_to_utf8 (herk_byte (0x4E)), "N"); // U+004E + string_eq (herk_to_utf8 (herk_byte (0x4F)), "O"); // U+004F +} + +TEST_CASE ("herk_to_utf8_5x") { + string_eq (herk_to_utf8 (herk_byte (0x50)), "P"); // U+0050 + string_eq (herk_to_utf8 (herk_byte (0x51)), "Q"); // U+0051 + string_eq (herk_to_utf8 (herk_byte (0x52)), "R"); // U+0052 + string_eq (herk_to_utf8 (herk_byte (0x53)), "S"); // U+0053 + string_eq (herk_to_utf8 (herk_byte (0x54)), "T"); // U+0054 + string_eq (herk_to_utf8 (herk_byte (0x55)), "U"); // U+0055 + string_eq (herk_to_utf8 (herk_byte (0x56)), "V"); // U+0056 + string_eq (herk_to_utf8 (herk_byte (0x57)), "W"); // U+0057 + string_eq (herk_to_utf8 (herk_byte (0x58)), "X"); // U+0058 + string_eq (herk_to_utf8 (herk_byte (0x59)), "Y"); // U+0059 + string_eq (herk_to_utf8 
(herk_byte (0x5A)), "Z"); // U+005A + string_eq (herk_to_utf8 (herk_byte (0x5B)), "["); // U+005B + string_eq (herk_to_utf8 (herk_byte (0x5C)), "\\"); // U+005C + string_eq (herk_to_utf8 (herk_byte (0x5D)), "]"); // U+005D + string_eq (herk_to_utf8 (herk_byte (0x5E)), "^"); // U+005E + string_eq (herk_to_utf8 (herk_byte (0x5F)), "_"); // U+005F +} + +TEST_CASE ("herk_to_utf8_6x") { + string_eq (herk_to_utf8 (herk_byte (0x60)), "‘"); // U+2018 + string_eq (herk_to_utf8 (herk_byte (0x61)), "a"); // U+0061 + string_eq (herk_to_utf8 (herk_byte (0x62)), "b"); // U+0062 + string_eq (herk_to_utf8 (herk_byte (0x63)), "c"); // U+0063 + string_eq (herk_to_utf8 (herk_byte (0x64)), "d"); // U+0064 + string_eq (herk_to_utf8 (herk_byte (0x65)), "e"); // U+0065 + string_eq (herk_to_utf8 (herk_byte (0x66)), "f"); // U+0066 + string_eq (herk_to_utf8 (herk_byte (0x67)), "g"); // U+0067 + string_eq (herk_to_utf8 (herk_byte (0x68)), "h"); // U+0068 + string_eq (herk_to_utf8 (herk_byte (0x69)), "i"); // U+0069 + string_eq (herk_to_utf8 (herk_byte (0x6A)), "j"); // U+006A + string_eq (herk_to_utf8 (herk_byte (0x6B)), "k"); // U+006B + string_eq (herk_to_utf8 (herk_byte (0x6C)), "l"); // U+006C + string_eq (herk_to_utf8 (herk_byte (0x6D)), "m"); // U+006D + string_eq (herk_to_utf8 (herk_byte (0x6E)), "n"); // U+006E + string_eq (herk_to_utf8 (herk_byte (0x6F)), "o"); // U+006F +} + +TEST_CASE ("herk_to_utf8_7x") { + string_eq (herk_to_utf8 (herk_byte (0x70)), "p"); // U+0070 + string_eq (herk_to_utf8 (herk_byte (0x71)), "q"); // U+0071 + string_eq (herk_to_utf8 (herk_byte (0x72)), "r"); // U+0072 + string_eq (herk_to_utf8 (herk_byte (0x73)), "s"); // U+0073 + string_eq (herk_to_utf8 (herk_byte (0x74)), "t"); // U+0074 + string_eq (herk_to_utf8 (herk_byte (0x75)), "u"); // U+0075 + string_eq (herk_to_utf8 (herk_byte (0x76)), "v"); // U+0076 + string_eq (herk_to_utf8 (herk_byte (0x77)), "w"); // U+0077 + string_eq (herk_to_utf8 (herk_byte (0x78)), "x"); // U+0078 + string_eq (herk_to_utf8 
(herk_byte (0x79)), "y"); // U+0079 + string_eq (herk_to_utf8 (herk_byte (0x7A)), "z"); // U+007A + string_eq (herk_to_utf8 (herk_byte (0x7B)), "{"); // U+007B + string_eq (herk_to_utf8 (herk_byte (0x7C)), "|"); // U+007C + string_eq (herk_to_utf8 (herk_byte (0x7D)), "}"); // U+007D + string_eq (herk_to_utf8 (herk_byte (0x7E)), "~"); // U+007E + string_eq (herk_to_utf8 (herk_byte (0x7F)), "\xC2\xAD"); // U+00AD +} + +TEST_CASE ("herk_to_utf8_8x") { + string_eq (herk_to_utf8 (herk_byte (0x80)), "Ă"); // U+0102 + string_eq (herk_to_utf8 (herk_byte (0x81)), "Ą"); // U+0104 + string_eq (herk_to_utf8 (herk_byte (0x82)), "Ć"); // U+0106 + string_eq (herk_to_utf8 (herk_byte (0x83)), "Č"); // U+010C + string_eq (herk_to_utf8 (herk_byte (0x84)), "Ď"); // U+010E + string_eq (herk_to_utf8 (herk_byte (0x85)), "Ě"); // U+011A + string_eq (herk_to_utf8 (herk_byte (0x86)), "Ę"); // U+0118 + string_eq (herk_to_utf8 (herk_byte (0x87)), "Ğ"); // U+011E + string_eq (herk_to_utf8 (herk_byte (0x88)), "Ĺ"); // U+0139 + string_eq (herk_to_utf8 (herk_byte (0x89)), "Ľ"); // U+013D + string_eq (herk_to_utf8 (herk_byte (0x8A)), "Ł"); // U+0141 + string_eq (herk_to_utf8 (herk_byte (0x8B)), "Ń"); // U+0143 + string_eq (herk_to_utf8 (herk_byte (0x8C)), "Ň"); // U+0147 + string_eq (herk_to_utf8 (herk_byte (0x8D)), "Ŋ"); // U+014A + string_eq (herk_to_utf8 (herk_byte (0x8E)), "Ő"); // U+0150 + string_eq (herk_to_utf8 (herk_byte (0x8F)), "Ŕ"); // U+0154 +} + +TEST_CASE ("herk_to_utf8_9x") { + string_eq (herk_to_utf8 (herk_byte (0x90)), "Ř"); // U+0158 + string_eq (herk_to_utf8 (herk_byte (0x91)), "Ś"); // U+015A + string_eq (herk_to_utf8 (herk_byte (0x92)), "Š"); // U+0160 + string_eq (herk_to_utf8 (herk_byte (0x93)), "Ş"); // U+015E + string_eq (herk_to_utf8 (herk_byte (0x94)), "Ť"); // U+0164 + string_eq (herk_to_utf8 (herk_byte (0x95)), "Ţ"); // U+0162 + string_eq (herk_to_utf8 (herk_byte (0x96)), "Ű"); // U+0170 + string_eq (herk_to_utf8 (herk_byte (0x97)), "Ů"); // U+016E + string_eq 
(herk_to_utf8 (herk_byte (0x98)), "Ÿ"); // U+0178 + string_eq (herk_to_utf8 (herk_byte (0x99)), "Ź"); // U+0179 + string_eq (herk_to_utf8 (herk_byte (0x9A)), "Ž"); // U+017D + string_eq (herk_to_utf8 (herk_byte (0x9B)), "Ż"); // U+017B + string_eq (herk_to_utf8 (herk_byte (0x9C)), "IJ"); // U+0132 + string_eq (herk_to_utf8 (herk_byte (0x9D)), "İ"); // U+0130 + string_eq (herk_to_utf8 (herk_byte (0x9E)), "đ"); // U+0111 + string_eq (herk_to_utf8 (herk_byte (0x9F)), "§"); // U+00A7 +} + +TEST_CASE ("herk_to_utf8_Ax") { + string_eq (herk_to_utf8 (herk_byte (0xA0)), "ă"); // U+0103 + string_eq (herk_to_utf8 (herk_byte (0xA1)), "ą"); // U+0105 + string_eq (herk_to_utf8 (herk_byte (0xA2)), "ć"); // U+0107 + string_eq (herk_to_utf8 (herk_byte (0xA3)), "č"); // U+010D + string_eq (herk_to_utf8 (herk_byte (0xA4)), "ď"); // U+010F + string_eq (herk_to_utf8 (herk_byte (0xA5)), "ě"); // U+011B + string_eq (herk_to_utf8 (herk_byte (0xA6)), "ę"); // U+0119 + string_eq (herk_to_utf8 (herk_byte (0xA7)), "ğ"); // U+011F + string_eq (herk_to_utf8 (herk_byte (0xA8)), "ĺ"); // U+013A + string_eq (herk_to_utf8 (herk_byte (0xA9)), "ľ"); // U+013E + string_eq (herk_to_utf8 (herk_byte (0xAA)), "ł"); // U+0142 + string_eq (herk_to_utf8 (herk_byte (0xAB)), "ń"); // U+0144 + string_eq (herk_to_utf8 (herk_byte (0xAC)), "ň"); // U+0148 + string_eq (herk_to_utf8 (herk_byte (0xAD)), "ŋ"); // U+014B + string_eq (herk_to_utf8 (herk_byte (0xAE)), "ő"); // U+0151 + string_eq (herk_to_utf8 (herk_byte (0xAF)), "ŕ"); // U+0155 +} + +TEST_CASE ("herk_to_utf8_Bx") { + string_eq (herk_to_utf8 (herk_byte (0xB0)), "ř"); // U+0159 + string_eq (herk_to_utf8 (herk_byte (0xB1)), "ś"); // U+015B + string_eq (herk_to_utf8 (herk_byte (0xB2)), "š"); // U+0161 + string_eq (herk_to_utf8 (herk_byte (0xB3)), "ş"); // U+015F + string_eq (herk_to_utf8 (herk_byte (0xB4)), "ť"); // U+0165 + string_eq (herk_to_utf8 (herk_byte (0xB5)), "ţ"); // U+0163 + string_eq (herk_to_utf8 (herk_byte (0xB6)), "ű"); // U+0171 + string_eq 
(herk_to_utf8 (herk_byte (0xB7)), "ů"); // U+016F + string_eq (herk_to_utf8 (herk_byte (0xB8)), "ÿ"); // U+00FF + string_eq (herk_to_utf8 (herk_byte (0xB9)), "ź"); // U+017A + string_eq (herk_to_utf8 (herk_byte (0xBA)), "ž"); // U+017E + string_eq (herk_to_utf8 (herk_byte (0xBB)), "ż"); // U+017C + string_eq (herk_to_utf8 (herk_byte (0xBC)), "ij"); // U+0133 + string_eq (herk_to_utf8 (herk_byte (0xBD)), "¡"); // U+00A1 + string_eq (herk_to_utf8 (herk_byte (0xBE)), "¿"); // U+00BF + string_eq (herk_to_utf8 (herk_byte (0xBF)), "£"); // U+00A3 +} + +TEST_CASE ("herk_to_utf8_Cx") { + string_eq (herk_to_utf8 (herk_byte (0xC0)), "À"); // U+00C0 + string_eq (herk_to_utf8 (herk_byte (0xC1)), "Á"); // U+00C1 + string_eq (herk_to_utf8 (herk_byte (0xC2)), "Â"); // U+00C2 + string_eq (herk_to_utf8 (herk_byte (0xC3)), "Ã"); // U+00C3 + string_eq (herk_to_utf8 (herk_byte (0xC4)), "Ä"); // U+00C4 + string_eq (herk_to_utf8 (herk_byte (0xC5)), "Å"); // U+00C5 + string_eq (herk_to_utf8 (herk_byte (0xC6)), "Æ"); // U+00C6 + string_eq (herk_to_utf8 (herk_byte (0xC7)), "Ç"); // U+00C7 + string_eq (herk_to_utf8 (herk_byte (0xC8)), "È"); // U+00C8 + string_eq (herk_to_utf8 (herk_byte (0xC9)), "É"); // U+00C9 + string_eq (herk_to_utf8 (herk_byte (0xCA)), "Ê"); // U+00CA + string_eq (herk_to_utf8 (herk_byte (0xCB)), "Ë"); // U+00CB + string_eq (herk_to_utf8 (herk_byte (0xCC)), "Ì"); // U+00CC + string_eq (herk_to_utf8 (herk_byte (0xCD)), "Í"); // U+00CD + string_eq (herk_to_utf8 (herk_byte (0xCE)), "Î"); // U+00CE + string_eq (herk_to_utf8 (herk_byte (0xCF)), "Ï"); // U+00CF +} + +TEST_CASE ("herk_to_utf8_Dx") { + string_eq (herk_to_utf8 (herk_byte (0xD0)), "Ð"); // U+00D0 + string_eq (herk_to_utf8 (herk_byte (0xD1)), "Ñ"); // U+00D1 + string_eq (herk_to_utf8 (herk_byte (0xD2)), "Ò"); // U+00D2 + string_eq (herk_to_utf8 (herk_byte (0xD3)), "Ó"); // U+00D3 + string_eq (herk_to_utf8 (herk_byte (0xD4)), "Ô"); // U+00D4 + string_eq (herk_to_utf8 (herk_byte (0xD5)), "Õ"); // U+00D5 + string_eq 
(herk_to_utf8 (herk_byte (0xD6)), "Ö"); // U+00D6 + string_eq (herk_to_utf8 (herk_byte (0xD7)), "Œ"); // U+0152 + string_eq (herk_to_utf8 (herk_byte (0xD8)), "Ø"); // U+00D8 + string_eq (herk_to_utf8 (herk_byte (0xD9)), "Ù"); // U+00D9 + string_eq (herk_to_utf8 (herk_byte (0xDA)), "Ú"); // U+00DA + string_eq (herk_to_utf8 (herk_byte (0xDB)), "Û"); // U+00DB + string_eq (herk_to_utf8 (herk_byte (0xDC)), "Ü"); // U+00DC + string_eq (herk_to_utf8 (herk_byte (0xDD)), "Ý"); // U+00DD + string_eq (herk_to_utf8 (herk_byte (0xDE)), "Þ"); // U+00DE + string_eq (herk_to_utf8 (herk_byte (0xDF)), "ẞ"); // U+1E9E +} + +TEST_CASE ("herk_to_utf8_Ex") { + string_eq (herk_to_utf8 (herk_byte (0xE0)), "à"); // U+00E0 + string_eq (herk_to_utf8 (herk_byte (0xE1)), "á"); // U+00E1 + string_eq (herk_to_utf8 (herk_byte (0xE2)), "â"); // U+00E2 + string_eq (herk_to_utf8 (herk_byte (0xE3)), "ã"); // U+00E3 + string_eq (herk_to_utf8 (herk_byte (0xE4)), "ä"); // U+00E4 + string_eq (herk_to_utf8 (herk_byte (0xE5)), "å"); // U+00E5 + string_eq (herk_to_utf8 (herk_byte (0xE6)), "æ"); // U+00E6 + string_eq (herk_to_utf8 (herk_byte (0xE7)), "ç"); // U+00E7 + string_eq (herk_to_utf8 (herk_byte (0xE8)), "è"); // U+00E8 + string_eq (herk_to_utf8 (herk_byte (0xE9)), "é"); // U+00E9 + string_eq (herk_to_utf8 (herk_byte (0xEA)), "ê"); // U+00EA + string_eq (herk_to_utf8 (herk_byte (0xEB)), "ë"); // U+00EB + string_eq (herk_to_utf8 (herk_byte (0xEC)), "ì"); // U+00EC + string_eq (herk_to_utf8 (herk_byte (0xED)), "í"); // U+00ED + string_eq (herk_to_utf8 (herk_byte (0xEE)), "î"); // U+00EE + string_eq (herk_to_utf8 (herk_byte (0xEF)), "ï"); // U+00EF +} + +TEST_CASE ("herk_to_utf8_Fx") { + string_eq (herk_to_utf8 (herk_byte (0xF0)), "ð"); // U+00F0 + string_eq (herk_to_utf8 (herk_byte (0xF1)), "ñ"); // U+00F1 + string_eq (herk_to_utf8 (herk_byte (0xF2)), "ò"); // U+00F2 + string_eq (herk_to_utf8 (herk_byte (0xF3)), "ó"); // U+00F3 + string_eq (herk_to_utf8 (herk_byte (0xF4)), "ô"); // U+00F4 + string_eq 
(herk_to_utf8 (herk_byte (0xF5)), "õ"); // U+00F5 + string_eq (herk_to_utf8 (herk_byte (0xF6)), "ö"); // U+00F6 + string_eq (herk_to_utf8 (herk_byte (0xF7)), "œ"); // U+0153 + string_eq (herk_to_utf8 (herk_byte (0xF8)), "ø"); // U+00F8 + string_eq (herk_to_utf8 (herk_byte (0xF9)), "ù"); // U+00F9 + string_eq (herk_to_utf8 (herk_byte (0xFA)), "ú"); // U+00FA + string_eq (herk_to_utf8 (herk_byte (0xFB)), "û"); // U+00FB + string_eq (herk_to_utf8 (herk_byte (0xFC)), "ü"); // U+00FC + string_eq (herk_to_utf8 (herk_byte (0xFD)), "ý"); // U+00FD + string_eq (herk_to_utf8 (herk_byte (0xFE)), "þ"); // U+00FE + string_eq (herk_to_utf8 (herk_byte (0xFF)), "ß"); // U+00DF +} + +TEST_CASE ("utf8_to_herk_0x") { + string_eq (utf8_to_herk (herk_byte (0x00)), "<#00>"); // U+0000 + string_eq (utf8_to_herk (""), "<#01>"); // U+0001 + string_eq (utf8_to_herk (""), "<#02>"); // U+0002 + string_eq (utf8_to_herk (""), "<#03>"); // U+0003 + string_eq (utf8_to_herk (""), "<#04>"); // U+0004 + string_eq (utf8_to_herk (""), "<#05>"); // U+0005 + string_eq (utf8_to_herk (""), "<#06>"); // U+0006 + string_eq (utf8_to_herk (""), "<#07>"); // U+0007 + string_eq (utf8_to_herk (""), "<#08>"); // U+0008 + string_eq (utf8_to_herk (" "), "<#09>"); // U+0009 + string_eq (utf8_to_herk ("\x0A"), "<#0A>"); // U+000A + string_eq (utf8_to_herk (" "), "<#0B>"); // U+000B + string_eq (utf8_to_herk (" "), "<#0C>"); // U+000C + string_eq (utf8_to_herk ("\x0D"), "<#0D>"); // U+000D + string_eq (utf8_to_herk (""), "<#0E>"); // U+000E + string_eq (utf8_to_herk (""), "<#0F>"); // U+000F +} + +TEST_CASE ("utf8_to_herk_1x") { + string_eq (utf8_to_herk (""), "<#10>"); // U+0010 + string_eq (utf8_to_herk (""), "<#11>"); // U+0011 + string_eq (utf8_to_herk (""), "<#12>"); // U+0012 + string_eq (utf8_to_herk (""), "<#13>"); // U+0013 + string_eq (utf8_to_herk (""), "<#14>"); // U+0014 + string_eq (utf8_to_herk (""), "<#15>"); // U+0015 + string_eq (utf8_to_herk (""), "<#16>"); // U+0016 + string_eq (utf8_to_herk (""), 
"<#17>"); // U+0017 + string_eq (utf8_to_herk (""), "<#18>"); // U+0018 + string_eq (utf8_to_herk (""), "<#19>"); // U+0019 + string_eq (utf8_to_herk (""), "<#1A>"); // U+001A + string_eq (utf8_to_herk (""), "<#1B>"); // U+001B + string_eq (utf8_to_herk (""), "<#1C>"); // U+001C + string_eq (utf8_to_herk (""), "<#1D>"); // U+001D + string_eq (utf8_to_herk (""), "<#1E>"); // U+001E + string_eq (utf8_to_herk (""), "<#1F>"); // U+001F +} + +TEST_CASE ("utf8_to_herk_2x") { + string_eq (utf8_to_herk (" "), herk_byte (0x20)); // U+0020 + string_eq (utf8_to_herk ("!"), herk_byte (0x21)); // U+0021 + string_eq (utf8_to_herk ("\""), herk_byte (0x22)); // U+0022 + string_eq (utf8_to_herk ("#"), herk_byte (0x23)); // U+0023 + string_eq (utf8_to_herk ("$"), herk_byte (0x24)); // U+0024 + string_eq (utf8_to_herk ("%"), herk_byte (0x25)); // U+0025 + string_eq (utf8_to_herk ("&"), herk_byte (0x26)); // U+0026 + string_eq (utf8_to_herk ("'"), herk_byte (0x27)); // U+0027 + string_eq (utf8_to_herk ("("), herk_byte (0x28)); // U+0028 + string_eq (utf8_to_herk (")"), herk_byte (0x29)); // U+0029 + string_eq (utf8_to_herk ("*"), herk_byte (0x2A)); // U+002A + string_eq (utf8_to_herk ("+"), herk_byte (0x2B)); // U+002B + string_eq (utf8_to_herk (","), herk_byte (0x2C)); // U+002C + string_eq (utf8_to_herk ("-"), herk_byte (0x2D)); // U+002D + string_eq (utf8_to_herk ("."), herk_byte (0x2E)); // U+002E + string_eq (utf8_to_herk ("/"), herk_byte (0x2F)); // U+002F +} + +TEST_CASE ("utf8_to_herk_3x") { + string_eq (utf8_to_herk ("0"), herk_byte (0x30)); // U+0030 + string_eq (utf8_to_herk ("1"), herk_byte (0x31)); // U+0031 + string_eq (utf8_to_herk ("2"), herk_byte (0x32)); // U+0032 + string_eq (utf8_to_herk ("3"), herk_byte (0x33)); // U+0033 + string_eq (utf8_to_herk ("4"), herk_byte (0x34)); // U+0034 + string_eq (utf8_to_herk ("5"), herk_byte (0x35)); // U+0035 + string_eq (utf8_to_herk ("6"), herk_byte (0x36)); // U+0036 + string_eq (utf8_to_herk ("7"), herk_byte (0x37)); // 
U+0037 + string_eq (utf8_to_herk ("8"), herk_byte (0x38)); // U+0038 + string_eq (utf8_to_herk ("9"), herk_byte (0x39)); // U+0039 + string_eq (utf8_to_herk (":"), herk_byte (0x3A)); // U+003A + string_eq (utf8_to_herk (";"), herk_byte (0x3B)); // U+003B + string_eq (utf8_to_herk ("<"), herk_byte (0x3C)); // U+003C + string_eq (utf8_to_herk ("="), herk_byte (0x3D)); // U+003D + string_eq (utf8_to_herk (">"), herk_byte (0x3E)); // U+003E + string_eq (utf8_to_herk ("?"), herk_byte (0x3F)); // U+003F +} + +TEST_CASE ("utf8_to_herk_4x") { + string_eq (utf8_to_herk ("@"), herk_byte (0x40)); // U+0040 + string_eq (utf8_to_herk ("A"), herk_byte (0x41)); // U+0041 + string_eq (utf8_to_herk ("B"), herk_byte (0x42)); // U+0042 + string_eq (utf8_to_herk ("C"), herk_byte (0x43)); // U+0043 + string_eq (utf8_to_herk ("D"), herk_byte (0x44)); // U+0044 + string_eq (utf8_to_herk ("E"), herk_byte (0x45)); // U+0045 + string_eq (utf8_to_herk ("F"), herk_byte (0x46)); // U+0046 + string_eq (utf8_to_herk ("G"), herk_byte (0x47)); // U+0047 + string_eq (utf8_to_herk ("H"), herk_byte (0x48)); // U+0048 + string_eq (utf8_to_herk ("I"), herk_byte (0x49)); // U+0049 + string_eq (utf8_to_herk ("J"), herk_byte (0x4A)); // U+004A + string_eq (utf8_to_herk ("K"), herk_byte (0x4B)); // U+004B + string_eq (utf8_to_herk ("L"), herk_byte (0x4C)); // U+004C + string_eq (utf8_to_herk ("M"), herk_byte (0x4D)); // U+004D + string_eq (utf8_to_herk ("N"), herk_byte (0x4E)); // U+004E + string_eq (utf8_to_herk ("O"), herk_byte (0x4F)); // U+004F +} + +TEST_CASE ("utf8_to_herk_5x") { + string_eq (utf8_to_herk ("P"), herk_byte (0x50)); // U+0050 + string_eq (utf8_to_herk ("Q"), herk_byte (0x51)); // U+0051 + string_eq (utf8_to_herk ("R"), herk_byte (0x52)); // U+0052 + string_eq (utf8_to_herk ("S"), herk_byte (0x53)); // U+0053 + string_eq (utf8_to_herk ("T"), herk_byte (0x54)); // U+0054 + string_eq (utf8_to_herk ("U"), herk_byte (0x55)); // U+0055 + string_eq (utf8_to_herk ("V"), herk_byte (0x56)); // 
U+0056 + string_eq (utf8_to_herk ("W"), herk_byte (0x57)); // U+0057 + string_eq (utf8_to_herk ("X"), herk_byte (0x58)); // U+0058 + string_eq (utf8_to_herk ("Y"), herk_byte (0x59)); // U+0059 + string_eq (utf8_to_herk ("Z"), herk_byte (0x5A)); // U+005A + string_eq (utf8_to_herk ("["), herk_byte (0x5B)); // U+005B + string_eq (utf8_to_herk ("\\"), herk_byte (0x5C)); // U+005C + string_eq (utf8_to_herk ("]"), herk_byte (0x5D)); // U+005D + string_eq (utf8_to_herk ("^"), herk_byte (0x5E)); // U+005E + string_eq (utf8_to_herk ("_"), herk_byte (0x5F)); // U+005F +} + +TEST_CASE ("utf8_to_herk_6x") { + string_eq (utf8_to_herk ("`"), herk_byte (0x00)); // U+0060 + string_eq (utf8_to_herk ("a"), herk_byte (0x61)); // U+0061 + string_eq (utf8_to_herk ("b"), herk_byte (0x62)); // U+0062 + string_eq (utf8_to_herk ("c"), herk_byte (0x63)); // U+0063 + string_eq (utf8_to_herk ("d"), herk_byte (0x64)); // U+0064 + string_eq (utf8_to_herk ("e"), herk_byte (0x65)); // U+0065 + string_eq (utf8_to_herk ("f"), herk_byte (0x66)); // U+0066 + string_eq (utf8_to_herk ("g"), herk_byte (0x67)); // U+0067 + string_eq (utf8_to_herk ("h"), herk_byte (0x68)); // U+0068 + string_eq (utf8_to_herk ("i"), herk_byte (0x69)); // U+0069 + string_eq (utf8_to_herk ("j"), herk_byte (0x6A)); // U+006A + string_eq (utf8_to_herk ("k"), herk_byte (0x6B)); // U+006B + string_eq (utf8_to_herk ("l"), herk_byte (0x6C)); // U+006C + string_eq (utf8_to_herk ("m"), herk_byte (0x6D)); // U+006D + string_eq (utf8_to_herk ("n"), herk_byte (0x6E)); // U+006E + string_eq (utf8_to_herk ("o"), herk_byte (0x6F)); // U+006F +} + +TEST_CASE ("utf8_to_herk_7x") { + string_eq (utf8_to_herk ("p"), herk_byte (0x70)); // U+0070 + string_eq (utf8_to_herk ("q"), herk_byte (0x71)); // U+0071 + string_eq (utf8_to_herk ("r"), herk_byte (0x72)); // U+0072 + string_eq (utf8_to_herk ("s"), herk_byte (0x73)); // U+0073 + string_eq (utf8_to_herk ("t"), herk_byte (0x74)); // U+0074 + string_eq (utf8_to_herk ("u"), herk_byte (0x75)); // 
U+0075 + string_eq (utf8_to_herk ("v"), herk_byte (0x76)); // U+0076 + string_eq (utf8_to_herk ("w"), herk_byte (0x77)); // U+0077 + string_eq (utf8_to_herk ("x"), herk_byte (0x78)); // U+0078 + string_eq (utf8_to_herk ("y"), herk_byte (0x79)); // U+0079 + string_eq (utf8_to_herk ("z"), herk_byte (0x7A)); // U+007A + string_eq (utf8_to_herk ("{"), herk_byte (0x7B)); // U+007B + string_eq (utf8_to_herk ("|"), herk_byte (0x7C)); // U+007C + string_eq (utf8_to_herk ("}"), herk_byte (0x7D)); // U+007D + string_eq (utf8_to_herk ("~"), herk_byte (0x7E)); // U+007E + string_eq (utf8_to_herk ("\x7F"), "\x7F"); // U+007F unmapped +} + +TEST_CASE ("utf8_to_herk_8x") { + string_eq (utf8_to_herk ("\xC2\x80"), "<#80>"); // U+0080 unmapped + string_eq (utf8_to_herk ("\xC2\x81"), "<#81>"); // U+0081 unmapped + string_eq (utf8_to_herk ("\xC2\x82"), "<#82>"); // U+0082 unmapped + string_eq (utf8_to_herk ("\xC2\x83"), "<#83>"); // U+0083 unmapped + string_eq (utf8_to_herk ("\xC2\x84"), "<#84>"); // U+0084 unmapped + string_eq (utf8_to_herk ("\xC2\x85"), "<#85>"); // U+0085 unmapped + string_eq (utf8_to_herk ("\xC2\x86"), "<#86>"); // U+0086 unmapped + string_eq (utf8_to_herk ("\xC2\x87"), "<#87>"); // U+0087 unmapped + string_eq (utf8_to_herk ("\xC2\x88"), "<#88>"); // U+0088 unmapped + string_eq (utf8_to_herk ("\xC2\x89"), "<#89>"); // U+0089 unmapped + string_eq (utf8_to_herk ("\xC2\x8A"), "<#8A>"); // U+008A unmapped + string_eq (utf8_to_herk ("\xC2\x8B"), "<#8B>"); // U+008B unmapped + string_eq (utf8_to_herk ("\xC2\x8C"), "<#8C>"); // U+008C unmapped + string_eq (utf8_to_herk ("\xC2\x8D"), "<#8D>"); // U+008D unmapped + string_eq (utf8_to_herk ("\xC2\x8E"), "<#8E>"); // U+008E unmapped + string_eq (utf8_to_herk ("\xC2\x8F"), "<#8F>"); // U+008F unmapped +} + +TEST_CASE ("utf8_to_herk_9x") { + string_eq (utf8_to_herk ("\xC2\x90"), "<#90>"); // U+0090 unmapped + string_eq (utf8_to_herk ("\xC2\x91"), "<#91>"); // U+0091 unmapped + string_eq (utf8_to_herk ("\xC2\x92"), "<#92>"); // 
U+0092 unmapped + string_eq (utf8_to_herk ("\xC2\x93"), "<#93>"); // U+0093 unmapped + string_eq (utf8_to_herk ("\xC2\x94"), "<#94>"); // U+0094 unmapped + string_eq (utf8_to_herk ("\xC2\x95"), "<#95>"); // U+0095 unmapped + string_eq (utf8_to_herk ("\xC2\x96"), "<#96>"); // U+0096 unmapped + string_eq (utf8_to_herk ("\xC2\x97"), "<#97>"); // U+0097 unmapped + string_eq (utf8_to_herk ("\xC2\x98"), "<#98>"); // U+0098 unmapped + string_eq (utf8_to_herk ("\xC2\x99"), "<#99>"); // U+0099 unmapped + string_eq (utf8_to_herk ("\xC2\x9A"), "<#9A>"); // U+009A unmapped + string_eq (utf8_to_herk ("\xC2\x9B"), "<#9B>"); // U+009B unmapped + string_eq (utf8_to_herk ("\xC2\x9C"), "<#9C>"); // U+009C unmapped + string_eq (utf8_to_herk ("\xC2\x9D"), "<#9D>"); // U+009D unmapped + string_eq (utf8_to_herk ("\xC2\x9E"), "<#9E>"); // U+009E unmapped + string_eq (utf8_to_herk ("\xC2\x9F"), "<#9F>"); // U+009F unmapped +} + +TEST_CASE ("utf8_to_herk_Ax") { + string_eq (utf8_to_herk ("\xC2\xA0"), "<#A0>"); // U+00A0 unmapped + string_eq (utf8_to_herk ("\xC2\xA1"), herk_byte (0xBD)); // U+00A1 + string_eq (utf8_to_herk ("\xC2\xA2"), "<#A2>"); // U+00A2 unmapped + string_eq (utf8_to_herk ("\xC2\xA3"), herk_byte (0xBF)); // U+00A3 + string_eq (utf8_to_herk ("\xC2\xA4"), "<#A4>"); // U+00A4 unmapped + string_eq (utf8_to_herk ("\xC2\xA5"), "<#A5>"); // U+00A5 unmapped + string_eq (utf8_to_herk ("\xC2\xA6"), "<#A6>"); // U+00A6 unmapped + string_eq (utf8_to_herk ("\xC2\xA7"), herk_byte (0x9F)); // U+00A7 + string_eq (utf8_to_herk ("\xC2\xA8"), herk_byte (0x04)); // U+00A8 + string_eq (utf8_to_herk ("\xC2\xA9"), "<#A9>"); // U+00A9 unmapped + string_eq (utf8_to_herk ("\xC2\xAA"), "<#AA>"); // U+00AA unmapped + string_eq (utf8_to_herk ("\xC2\xAB"), herk_byte (0x13)); // U+00AB + string_eq (utf8_to_herk ("\xC2\xAC"), "<#AC>"); // U+00AC unmapped + string_eq (utf8_to_herk ("\xC2\xAD"), herk_byte (0x7F)); // U+00AD + string_eq (utf8_to_herk ("\xC2\xAE"), "<#AE>"); // U+00AE unmapped + string_eq 
(utf8_to_herk ("\xC2\xAF"), herk_byte (0x09)); // U+00AF +} + +TEST_CASE ("utf8_to_herk_Bx") { + string_eq (utf8_to_herk ("\xC2\xB0"), "<#B0>"); // U+00B0 unmapped + string_eq (utf8_to_herk ("\xC2\xB1"), "<#B1>"); // U+00B1 unmapped + string_eq (utf8_to_herk ("\xC2\xB2"), "<#B2>"); // U+00B2 unmapped + string_eq (utf8_to_herk ("\xC2\xB3"), "<#B3>"); // U+00B3 unmapped + string_eq (utf8_to_herk ("\xC2\xB4"), herk_byte (0x01)); // U+00B4 + string_eq (utf8_to_herk ("\xC2\xB5"), "<#B5>"); // U+00B5 unmapped + string_eq (utf8_to_herk ("\xC2\xB6"), "<#B6>"); // U+00B6 unmapped + string_eq (utf8_to_herk ("\xC2\xB7"), "<#B7>"); // U+00B7 unmapped + string_eq (utf8_to_herk ("\xC2\xB8"), herk_byte (0x0B)); // U+00B8 + string_eq (utf8_to_herk ("\xC2\xB9"), "<#B9>"); // U+00B9 unmapped + string_eq (utf8_to_herk ("\xC2\xBA"), "<#BA>"); // U+00BA unmapped + string_eq (utf8_to_herk ("\xC2\xBB"), herk_byte (0x14)); // U+00BB + string_eq (utf8_to_herk ("\xC2\xBC"), "<#BC>"); // U+00BC unmapped + string_eq (utf8_to_herk ("\xC2\xBD"), "<#BD>"); // U+00BD unmapped + string_eq (utf8_to_herk ("\xC2\xBE"), "<#BE>"); // U+00BE unmapped + string_eq (utf8_to_herk ("\xC2\xBF"), herk_byte (0xBE)); // U+00BF +} + +TEST_CASE ("utf8_to_herk_Cx") { + string_eq (utf8_to_herk ("\xC3\x80"), herk_byte (0xC0)); // U+00C0 + string_eq (utf8_to_herk ("\xC3\x81"), herk_byte (0xC1)); // U+00C1 + string_eq (utf8_to_herk ("\xC3\x82"), herk_byte (0xC2)); // U+00C2 + string_eq (utf8_to_herk ("\xC3\x83"), herk_byte (0xC3)); // U+00C3 + string_eq (utf8_to_herk ("\xC3\x84"), herk_byte (0xC4)); // U+00C4 + string_eq (utf8_to_herk ("\xC3\x85"), herk_byte (0xC5)); // U+00C5 + string_eq (utf8_to_herk ("\xC3\x86"), herk_byte (0xC6)); // U+00C6 + string_eq (utf8_to_herk ("\xC3\x87"), herk_byte (0xC7)); // U+00C7 + string_eq (utf8_to_herk ("\xC3\x88"), herk_byte (0xC8)); // U+00C8 + string_eq (utf8_to_herk ("\xC3\x89"), herk_byte (0xC9)); // U+00C9 + string_eq (utf8_to_herk ("\xC3\x8A"), herk_byte (0xCA)); // U+00CA + 
string_eq (utf8_to_herk ("\xC3\x8B"), herk_byte (0xCB)); // U+00CB + string_eq (utf8_to_herk ("\xC3\x8C"), herk_byte (0xCC)); // U+00CC + string_eq (utf8_to_herk ("\xC3\x8D"), herk_byte (0xCD)); // U+00CD + string_eq (utf8_to_herk ("\xC3\x8E"), herk_byte (0xCE)); // U+00CE + string_eq (utf8_to_herk ("\xC3\x8F"), herk_byte (0xCF)); // U+00CF +} + +TEST_CASE ("utf8_to_herk_Dx") { + string_eq (utf8_to_herk ("\xC3\x90"), herk_byte (0xD0)); // U+00D0 + string_eq (utf8_to_herk ("\xC3\x91"), herk_byte (0xD1)); // U+00D1 + string_eq (utf8_to_herk ("\xC3\x92"), herk_byte (0xD2)); // U+00D2 + string_eq (utf8_to_herk ("\xC3\x93"), herk_byte (0xD3)); // U+00D3 + string_eq (utf8_to_herk ("\xC3\x94"), herk_byte (0xD4)); // U+00D4 + string_eq (utf8_to_herk ("\xC3\x95"), herk_byte (0xD5)); // U+00D5 + string_eq (utf8_to_herk ("\xC3\x96"), herk_byte (0xD6)); // U+00D6 + string_eq (utf8_to_herk ("\xC3\x97"), "<#D7>"); // U+00D7 unmapped + string_eq (utf8_to_herk ("\xC3\x98"), herk_byte (0xD8)); // U+00D8 + string_eq (utf8_to_herk ("\xC3\x99"), herk_byte (0xD9)); // U+00D9 + string_eq (utf8_to_herk ("\xC3\x9A"), herk_byte (0xDA)); // U+00DA + string_eq (utf8_to_herk ("\xC3\x9B"), herk_byte (0xDB)); // U+00DB + string_eq (utf8_to_herk ("\xC3\x9C"), herk_byte (0xDC)); // U+00DC + string_eq (utf8_to_herk ("\xC3\x9D"), herk_byte (0xDD)); // U+00DD + string_eq (utf8_to_herk ("\xC3\x9E"), herk_byte (0xDE)); // U+00DE + string_eq (utf8_to_herk ("\xC3\x9F"), herk_byte (0xFF)); // U+00DF +} + +TEST_CASE ("utf8_to_herk_Ex") { + string_eq (utf8_to_herk ("\xC3\xA0"), herk_byte (0xE0)); // U+00E0 + string_eq (utf8_to_herk ("\xC3\xA1"), herk_byte (0xE1)); // U+00E1 + string_eq (utf8_to_herk ("\xC3\xA2"), herk_byte (0xE2)); // U+00E2 + string_eq (utf8_to_herk ("\xC3\xA3"), herk_byte (0xE3)); // U+00E3 + string_eq (utf8_to_herk ("\xC3\xA4"), herk_byte (0xE4)); // U+00E4 + string_eq (utf8_to_herk ("\xC3\xA5"), herk_byte (0xE5)); // U+00E5 + string_eq (utf8_to_herk ("\xC3\xA6"), herk_byte (0xE6)); // 
U+00E6 + string_eq (utf8_to_herk ("\xC3\xA7"), herk_byte (0xE7)); // U+00E7 + string_eq (utf8_to_herk ("\xC3\xA8"), herk_byte (0xE8)); // U+00E8 + string_eq (utf8_to_herk ("\xC3\xA9"), herk_byte (0xE9)); // U+00E9 + string_eq (utf8_to_herk ("\xC3\xAA"), herk_byte (0xEA)); // U+00EA + string_eq (utf8_to_herk ("\xC3\xAB"), herk_byte (0xEB)); // U+00EB + string_eq (utf8_to_herk ("\xC3\xAC"), herk_byte (0xEC)); // U+00EC + string_eq (utf8_to_herk ("\xC3\xAD"), herk_byte (0xED)); // U+00ED + string_eq (utf8_to_herk ("\xC3\xAE"), herk_byte (0xEE)); // U+00EE + string_eq (utf8_to_herk ("\xC3\xAF"), herk_byte (0xEF)); // U+00EF +} + +TEST_CASE ("utf8_to_herk_Fx") { + string_eq (utf8_to_herk ("\xC3\xB0"), herk_byte (0xF0)); // U+00F0 + string_eq (utf8_to_herk ("\xC3\xB1"), herk_byte (0xF1)); // U+00F1 + string_eq (utf8_to_herk ("\xC3\xB2"), herk_byte (0xF2)); // U+00F2 + string_eq (utf8_to_herk ("\xC3\xB3"), herk_byte (0xF3)); // U+00F3 + string_eq (utf8_to_herk ("\xC3\xB4"), herk_byte (0xF4)); // U+00F4 + string_eq (utf8_to_herk ("\xC3\xB5"), herk_byte (0xF5)); // U+00F5 + string_eq (utf8_to_herk ("\xC3\xB6"), herk_byte (0xF6)); // U+00F6 + string_eq (utf8_to_herk ("\xC3\xB7"), "<#F7>"); // U+00F7 unmapped + string_eq (utf8_to_herk ("\xC3\xB8"), herk_byte (0xF8)); // U+00F8 + string_eq (utf8_to_herk ("\xC3\xB9"), herk_byte (0xF9)); // U+00F9 + string_eq (utf8_to_herk ("\xC3\xBA"), herk_byte (0xFA)); // U+00FA + string_eq (utf8_to_herk ("\xC3\xBB"), herk_byte (0xFB)); // U+00FB + string_eq (utf8_to_herk ("\xC3\xBC"), herk_byte (0xFC)); // U+00FC + string_eq (utf8_to_herk ("\xC3\xBD"), herk_byte (0xFD)); // U+00FD + string_eq (utf8_to_herk ("\xC3\xBE"), herk_byte (0xFE)); // U+00FE + string_eq (utf8_to_herk ("\xC3\xBF"), herk_byte (0xB8)); // U+00FF +} + +TEST_CASE ("utf8_to_herk_high_mapped") { + string_eq (utf8_to_herk ("\xC4\x82"), herk_byte (0x80)); // U+0102 + string_eq (utf8_to_herk ("\xC4\x83"), herk_byte (0xA0)); // U+0103 + string_eq (utf8_to_herk ("\xC4\x84"), 
herk_byte (0x81)); // U+0104 + string_eq (utf8_to_herk ("\xC4\x85"), herk_byte (0xA1)); // U+0105 + string_eq (utf8_to_herk ("\xC4\x86"), herk_byte (0x82)); // U+0106 + string_eq (utf8_to_herk ("\xC4\x87"), herk_byte (0xA2)); // U+0107 + string_eq (utf8_to_herk ("\xC4\x8C"), herk_byte (0x83)); // U+010C + string_eq (utf8_to_herk ("\xC4\x8D"), herk_byte (0xA3)); // U+010D + string_eq (utf8_to_herk ("\xC4\x8E"), herk_byte (0x84)); // U+010E + string_eq (utf8_to_herk ("\xC4\x8F"), herk_byte (0xA4)); // U+010F + string_eq (utf8_to_herk ("\xC4\x91"), herk_byte (0x9E)); // U+0111 + string_eq (utf8_to_herk ("\xC4\x98"), herk_byte (0x86)); // U+0118 + string_eq (utf8_to_herk ("\xC4\x99"), herk_byte (0xA6)); // U+0119 + string_eq (utf8_to_herk ("\xC4\x9A"), herk_byte (0x85)); // U+011A + string_eq (utf8_to_herk ("\xC4\x9B"), herk_byte (0xA5)); // U+011B + string_eq (utf8_to_herk ("\xC4\x9E"), herk_byte (0x87)); // U+011E + string_eq (utf8_to_herk ("\xC4\x9F"), herk_byte (0xA7)); // U+011F + string_eq (utf8_to_herk ("\xC4\xB0"), herk_byte (0x9D)); // U+0130 + string_eq (utf8_to_herk ("\xC4\xB1"), herk_byte (0x19)); // U+0131 + string_eq (utf8_to_herk ("\xC4\xB2"), herk_byte (0x9C)); // U+0132 + string_eq (utf8_to_herk ("\xC4\xB3"), herk_byte (0xBC)); // U+0133 + string_eq (utf8_to_herk ("\xC4\xB9"), herk_byte (0x88)); // U+0139 + string_eq (utf8_to_herk ("\xC4\xBA"), herk_byte (0xA8)); // U+013A + string_eq (utf8_to_herk ("\xC4\xBD"), herk_byte (0x89)); // U+013D + string_eq (utf8_to_herk ("\xC4\xBE"), herk_byte (0xA9)); // U+013E + string_eq (utf8_to_herk ("\xC5\x81"), herk_byte (0x8A)); // U+0141 + string_eq (utf8_to_herk ("\xC5\x82"), herk_byte (0xAA)); // U+0142 + string_eq (utf8_to_herk ("\xC5\x83"), herk_byte (0x8B)); // U+0143 + string_eq (utf8_to_herk ("\xC5\x84"), herk_byte (0xAB)); // U+0144 + string_eq (utf8_to_herk ("\xC5\x87"), herk_byte (0x8C)); // U+0147 + string_eq (utf8_to_herk ("\xC5\x88"), herk_byte (0xAC)); // U+0148 + string_eq (utf8_to_herk 
("\xC5\x8A"), herk_byte (0x8D)); // U+014A + string_eq (utf8_to_herk ("\xC5\x8B"), herk_byte (0xAD)); // U+014B + string_eq (utf8_to_herk ("\xC5\x90"), herk_byte (0x8E)); // U+0150 + string_eq (utf8_to_herk ("\xC5\x91"), herk_byte (0xAE)); // U+0151 + string_eq (utf8_to_herk ("\xC5\x92"), herk_byte (0xD7)); // U+0152 + string_eq (utf8_to_herk ("\xC5\x93"), herk_byte (0xF7)); // U+0153 + string_eq (utf8_to_herk ("\xC5\x94"), herk_byte (0x8F)); // U+0154 + string_eq (utf8_to_herk ("\xC5\x95"), herk_byte (0xAF)); // U+0155 + string_eq (utf8_to_herk ("\xC5\x98"), herk_byte (0x90)); // U+0158 + string_eq (utf8_to_herk ("\xC5\x99"), herk_byte (0xB0)); // U+0159 + string_eq (utf8_to_herk ("\xC5\x9A"), herk_byte (0x91)); // U+015A + string_eq (utf8_to_herk ("\xC5\x9B"), herk_byte (0xB1)); // U+015B + string_eq (utf8_to_herk ("\xC5\x9E"), herk_byte (0x93)); // U+015E + string_eq (utf8_to_herk ("\xC5\x9F"), herk_byte (0xB3)); // U+015F + string_eq (utf8_to_herk ("\xC5\xA0"), herk_byte (0x92)); // U+0160 + string_eq (utf8_to_herk ("\xC5\xA1"), herk_byte (0xB2)); // U+0161 + string_eq (utf8_to_herk ("\xC5\xA2"), herk_byte (0x95)); // U+0162 + string_eq (utf8_to_herk ("\xC5\xA3"), herk_byte (0xB5)); // U+0163 + string_eq (utf8_to_herk ("\xC5\xA4"), herk_byte (0x94)); // U+0164 + string_eq (utf8_to_herk ("\xC5\xA5"), herk_byte (0xB4)); // U+0165 + string_eq (utf8_to_herk ("\xC5\xAE"), herk_byte (0x97)); // U+016E + string_eq (utf8_to_herk ("\xC5\xAF"), herk_byte (0xB7)); // U+016F + string_eq (utf8_to_herk ("\xC5\xB0"), herk_byte (0x96)); // U+0170 + string_eq (utf8_to_herk ("\xC5\xB1"), herk_byte (0xB6)); // U+0171 + string_eq (utf8_to_herk ("\xC5\xB8"), herk_byte (0x98)); // U+0178 + string_eq (utf8_to_herk ("\xC5\xB9"), herk_byte (0x99)); // U+0179 + string_eq (utf8_to_herk ("\xC5\xBA"), herk_byte (0xB9)); // U+017A + string_eq (utf8_to_herk ("\xC5\xBB"), herk_byte (0x9B)); // U+017B + string_eq (utf8_to_herk ("\xC5\xBC"), herk_byte (0xBB)); // U+017C + string_eq 
(utf8_to_herk ("\xC5\xBD"), herk_byte (0x9A)); // U+017D + string_eq (utf8_to_herk ("\xC5\xBE"), herk_byte (0xBA)); // U+017E + string_eq (utf8_to_herk ("\xC8\xB7"), herk_byte (0x1A)); // U+0237 + string_eq (utf8_to_herk ("\xCB\x86"), herk_byte (0x02)); // U+02C6 + string_eq (utf8_to_herk ("\xCB\x87"), herk_byte (0x07)); // U+02C7 + string_eq (utf8_to_herk ("\xCB\x98"), herk_byte (0x08)); // U+02D8 + string_eq (utf8_to_herk ("\xCB\x99"), herk_byte (0x0A)); // U+02D9 + string_eq (utf8_to_herk ("\xCB\x9A"), herk_byte (0x06)); // U+02DA + string_eq (utf8_to_herk ("\xCB\x9B"), herk_byte (0x0C)); // U+02DB + string_eq (utf8_to_herk ("\xCB\x9C"), herk_byte (0x03)); // U+02DC + string_eq (utf8_to_herk ("\xCB\x9D"), herk_byte (0x05)); // U+02DD + string_eq (utf8_to_herk ("\xE1\xBA\x9E"), herk_byte (0xDF)); // U+1E9E + string_eq (utf8_to_herk ("\xE2\x80\x8B"), herk_byte (0x17)); // U+200B + string_eq (utf8_to_herk ("\xE2\x80\x93"), herk_byte (0x15)); // U+2013 + string_eq (utf8_to_herk ("\xE2\x80\x94"), herk_byte (0x16)); // U+2014 + string_eq (utf8_to_herk ("\xE2\x80\x98"), herk_byte (0x60)); // U+2018 + string_eq (utf8_to_herk ("\xE2\x80\x9A"), herk_byte (0x0D)); // U+201A + string_eq (utf8_to_herk ("\xE2\x80\x9C"), herk_byte (0x10)); // U+201C + string_eq (utf8_to_herk ("\xE2\x80\x9D"), herk_byte (0x11)); // U+201D + string_eq (utf8_to_herk ("\xE2\x80\x9E"), herk_byte (0x12)); // U+201E + string_eq (utf8_to_herk ("\xE2\x80\xB9"), herk_byte (0x0E)); // U+2039 + string_eq (utf8_to_herk ("\xE2\x80\xBA"), herk_byte (0x0F)); // U+203A + string_eq (utf8_to_herk ("\xE2\x82\x80"), herk_byte (0x18)); // U+2080 + string_eq (utf8_to_herk ("\xEF\xAC\x80"), herk_byte (0x1B)); // U+FB00 + string_eq (utf8_to_herk ("\xEF\xAC\x81"), herk_byte (0x1C)); // U+FB01 + string_eq (utf8_to_herk ("\xEF\xAC\x82"), herk_byte (0x1D)); // U+FB02 + string_eq (utf8_to_herk ("\xEF\xAC\x83"), herk_byte (0x1E)); // U+FB03 + string_eq (utf8_to_herk ("\xEF\xAC\x84"), herk_byte (0x1F)); // U+FB04 +} + 
+TEST_CASE ("utf8_to_herk_high_unmapped") { + string_eq (utf8_to_herk ("\xC4\x80"), "<#100>"); // U+0100 + string_eq (utf8_to_herk ("\xC4\x90"), "<#110>"); // U+0110 + string_eq (utf8_to_herk ("\xC4\xA0"), "<#120>"); // U+0120 + string_eq (utf8_to_herk ("\xC8\x80"), "<#200>"); // U+0200 + string_eq (utf8_to_herk ("\xE2\x80\x99"), "<#2019>"); // U+2019 + string_eq (utf8_to_herk ("\xE3\x80\x82"), "<#3002>"); // U+3002 + string_eq (utf8_to_herk ("\xE4\xB8\xAD"), "<#4E2D>"); // U+4E2D +} + +TEST_CASE ("herk_escapes") { + string_eq (herk_to_utf8 ("<#00>"), herk_byte (0x00)); + string_eq (herk_to_utf8 ("<#0F>"), herk_byte (0x0F)); + string_eq (herk_to_utf8 ("<#10>"), herk_byte (0x10)); + string_eq (herk_to_utf8 ("<#1F>"), herk_byte (0x1F)); + string_eq (herk_to_utf8 ("<#FF>"), "ÿ"); // U+00FF + string_eq (herk_to_utf8 ("<#0FF>"), "ÿ"); // U+00FF + string_eq (herk_to_utf8 ("<#00FF>"), "ÿ"); // U+00FF + string_eq (herk_to_utf8 ("<#4E2D>"), "中"); // U+4E2D + + string_eq (utf8_to_herk ("<#00>"), "<#00>"); + string_eq (utf8_to_herk ("<#FF>"), "<#FF>"); + string_eq (utf8_to_herk ("<#0FF>"), "<#0FF>"); + string_eq (utf8_to_herk ("<#00FF>"), "<#00FF>"); +} + +TEST_CASE ("herk_named_escapes") { + string_eq (utf8_to_herk (""), ""); + string_eq (herk_to_utf8 (""), ""); + string_eq (utf8_to_herk (""), ""); + string_eq (herk_to_utf8 (""), ""); +} + +TEST_CASE ("herk_mixed") { + string_eq ( + herk_to_utf8 (herk_byte (0x41) * string ("<#2019>") * herk_byte (0x42)), + "A" * string ("’") * "B"); + string_eq (utf8_to_herk ("A" + "中" + "B"), + herk_byte (0x41) * string ("<#4E2D>") * herk_byte (0x42)); +} diff --git a/src/Data/String/converter.cpp b/src/Data/String/converter.cpp index f0fafec80d..1c4825c078 100644 --- a/src/Data/String/converter.cpp +++ b/src/Data/String/converter.cpp @@ -334,99 +334,6 @@ convert_utf8_to_LaTeX (string input) { return output; } -static const int herk_to_utf8_code[256]= { - 0x0060, 0x00B4, 0x02C6, 0x02DC, 0x00A8, 0x02DD, 0x02DA, 0x02C7, 0x02D8, - 0x00AF, 
0x02D9, 0x00B8, 0x02DB, 0x201A, 0x2039, 0x203A, 0x201C, 0x201D, - 0x201E, 0x00AB, 0x00BB, 0x2013, 0x2014, 0x200B, 0x2080, 0x0131, 0x0237, - 0xFB00, 0xFB01, 0xFB02, 0xFB03, 0xFB04, 0x0020, 0x0021, 0x0022, 0x0023, - 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, - 0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, - 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, - 0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, - 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, - 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, - 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x2018, 0x0061, 0x0062, - 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, - 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, - 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, - 0x007E, 0x00AD, 0x0102, 0x0104, 0x0106, 0x010C, 0x010E, 0x011A, 0x0118, - 0x011E, 0x0139, 0x013D, 0x0141, 0x0143, 0x0147, 0x014A, 0x0150, 0x0154, - 0x0158, 0x015A, 0x0160, 0x015E, 0x0164, 0x0162, 0x0170, 0x016E, 0x0178, - 0x0179, 0x017D, 0x017B, 0x0132, 0x0130, 0x0111, 0x00A7, 0x0103, 0x0105, - 0x0107, 0x010D, 0x010F, 0x011B, 0x0119, 0x011F, 0x013A, 0x013E, 0x0142, - 0x0144, 0x0148, 0x014B, 0x0151, 0x0155, 0x0159, 0x015B, 0x0161, 0x015F, - 0x0165, 0x0163, 0x0171, 0x016F, 0x00FF, 0x017A, 0x017E, 0x017C, 0x0133, - 0x00A1, 0x00BF, 0x00A3, 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, - 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, - 0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0152, - 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x1E9E, 0x00E0, - 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, - 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x00F0, 0x00F1, 0x00F2, - 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0153, 0x00F8, 0x00F9, 0x00FA, 0x00FB, - 0x00FC, 
0x00FD, 0x00FE, 0x00DF, -}; - -static const int utf8_to_herk_byte[256]= { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, - 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 0, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 189, -1, 191, -1, - -1, -1, 159, 4, -1, -1, 19, -1, 127, -1, 9, -1, -1, -1, -1, - 1, -1, -1, -1, 11, -1, -1, 20, -1, -1, -1, 190, 192, 193, 194, - 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, - 210, 211, 212, 213, 214, -1, 216, 217, 218, 219, 220, 221, 222, 255, 224, - 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, -1, 248, 249, 250, 251, 252, 253, 254, - 184, -}; - -static const int utf8_to_herk_high[][2]= { - {0x0102, 0x80}, {0x0103, 0xA0}, {0x0104, 0x81}, {0x0105, 0xA1}, - {0x0106, 0x82}, {0x0107, 0xA2}, {0x010C, 0x83}, {0x010D, 0xA3}, - {0x010E, 0x84}, {0x010F, 0xA4}, {0x0111, 0x9E}, {0x0118, 0x86}, - {0x0119, 0xA6}, {0x011A, 0x85}, {0x011B, 0xA5}, {0x011E, 0x87}, - {0x011F, 0xA7}, {0x0130, 0x9D}, {0x0131, 0x19}, {0x0132, 0x9C}, - {0x0133, 0xBC}, {0x0139, 0x88}, {0x013A, 0xA8}, {0x013D, 0x89}, - {0x013E, 0xA9}, {0x0141, 0x8A}, {0x0142, 0xAA}, {0x0143, 0x8B}, - {0x0144, 0xAB}, {0x0147, 0x8C}, {0x0148, 0xAC}, {0x014A, 0x8D}, - {0x014B, 0xAD}, {0x0150, 0x8E}, {0x0151, 0xAE}, {0x0152, 0xD7}, - {0x0153, 0xF7}, {0x0154, 0x8F}, {0x0155, 0xAF}, {0x0158, 0x90}, - {0x0159, 0xB0}, {0x015A, 0x91}, {0x015B, 0xB1}, {0x015E, 0x93}, 
- {0x015F, 0xB3}, {0x0160, 0x92}, {0x0161, 0xB2}, {0x0162, 0x95}, - {0x0163, 0xB5}, {0x0164, 0x94}, {0x0165, 0xB4}, {0x016E, 0x97}, - {0x016F, 0xB7}, {0x0170, 0x96}, {0x0171, 0xB6}, {0x0178, 0x98}, - {0x0179, 0x99}, {0x017A, 0xB9}, {0x017B, 0x9B}, {0x017C, 0xBB}, - {0x017D, 0x9A}, {0x017E, 0xBA}, {0x0237, 0x1A}, {0x02C6, 0x02}, - {0x02C7, 0x07}, {0x02D8, 0x08}, {0x02D9, 0x0A}, {0x02DA, 0x06}, - {0x02DB, 0x0C}, {0x02DC, 0x03}, {0x02DD, 0x05}, {0x1E9E, 0xDF}, - {0x200B, 0x17}, {0x2013, 0x15}, {0x2014, 0x16}, {0x2018, 0x60}, - {0x201A, 0x0D}, {0x201C, 0x10}, {0x201D, 0x11}, {0x201E, 0x12}, - {0x2039, 0x0E}, {0x203A, 0x0F}, {0x2080, 0x18}, {0xFB00, 0x1B}, - {0xFB01, 0x1C}, {0xFB02, 0x1D}, {0xFB03, 0x1E}, {0xFB04, 0x1F}, -}; - -static const int utf8_to_herk_high_count= 88; - -static int -lookup_utf8_to_herk_high (uint32_t code) { - int lo= 0, hi= utf8_to_herk_high_count - 1; - while (lo <= hi) { - int mid= (lo + hi) >> 1; - if (utf8_to_herk_high[mid][0] == (int) code) - return utf8_to_herk_high[mid][1]; - if (utf8_to_herk_high[mid][0] < (int) code) lo= mid + 1; - else hi= mid - 1; - } - return -1; -} - string utf8_to_cork (string input) { converter conv= load_converter ("UTF-8", "Cork"); @@ -443,26 +350,6 @@ utf8_to_cork (string input) { return output; } -string -utf8_to_herk (string input) { - int i, n= N (input); - string output; - for (i= 0; i < n;) { - uint32_t code = decode_from_utf8 (input, i); - int mapped= (code <= 0xFF) ? 
utf8_to_herk_byte[code] : -1; - if (mapped == -1) mapped= lookup_utf8_to_herk_high (code); - if (mapped >= 0) { - output << string ((char) mapped, 1); - } - else { - if (code < 16) output << "<#0" * to_Hex (code) * ">"; - else if (code < 32 || code >= 128) output << "<#" * to_Hex (code) * ">"; - else output << string ((char) code, 1); - } - } - return output; -} - string sourcecode_to_cork (string input) { converter conv= load_converter ("SourceCode", "Cork"); @@ -497,41 +384,6 @@ cork_to_utf8 (string input) { return r; } -static inline void -append_utf8_code (string& r, int code) { - if (code < 0x80) { - r << (char) code; - } - else if (code < 0x800) { - r << (char) (0xC0 | (code >> 6)); - r << (char) (0x80 | (code & 0x3F)); - } - else { - r << (char) (0xE0 | (code >> 12)); - r << (char) (0x80 | ((code >> 6) & 0x3F)); - r << (char) (0x80 | (code & 0x3F)); - } -} - -string_u8 -herk_to_utf8 (string input) { - int start= 0, i, n= N (input); - string r; - for (i= 0; i < n; i++) - if (input[i] == '<' && i + 1 < n && input[i + 1] == '#') { - for (int j= start; j < i; j++) - append_utf8_code (r, herk_to_utf8_code[(unsigned char) input[j]]); - start= i= i + 2; - while (i < n && input[i] != '>') - i++; - append_utf8_code (r, from_hexadecimal (input (start, i))); - start= i + 1; - } - for (int j= start; j < n; j++) - append_utf8_code (r, herk_to_utf8_code[(unsigned char) input[j]]); - return r; -} - string_u8 strict_cork_to_utf8 (string input) { converter conv = load_converter ("Strict-Cork", "UTF-8"); diff --git a/src/Data/String/converter.hpp b/src/Data/String/converter.hpp index 2f3db636a6..2eed32d6d2 100644 --- a/src/Data/String/converter.hpp +++ b/src/Data/String/converter.hpp @@ -82,7 +82,6 @@ string convert_to_cork (string input, string from); string convert_from_cork (string input, string to); string_u8 cork_to_utf8 (string input); -string_u8 herk_to_utf8 (string input); string_u8 strict_cork_to_utf8 (string input); string_u8 convert_LaTeX_to_utf8 (string input); @@ 
-91,7 +90,6 @@ string sourcecode_to_cork (string input); string convert_utf8_to_LaTeX (string_u8 input); string utf8_to_cork (string_u8 input); -string utf8_to_herk (string_u8 input); string utf8_to_html (string_u8 input); string utf8_to_t2a (string_u8 input); string_u8 t2a_to_utf8 (string input); diff --git a/src/Data/Tree/tree_traverse.cpp b/src/Data/Tree/tree_traverse.cpp index ae40c7ae82..b27d918832 100644 --- a/src/Data/Tree/tree_traverse.cpp +++ b/src/Data/Tree/tree_traverse.cpp @@ -11,12 +11,12 @@ #include "tree_traverse.hpp" #include "analyze.hpp" -#include "converter.hpp" #include "cork.hpp" #include "hashset.hpp" #include "scheme.hpp" #include "tree_helper.hpp" +#include #include #include #include @@ -836,7 +836,7 @@ search_sections (tree t) { tree tree_utf8_to_herk (tree_u8 t) { if (is_atomic (t)) { - return tree (utf8_to_herk (t->label)); + return tree (lolly::data::utf8_to_herk (t->label)); } else if (!is_func (t, RAW_DATA)) { int t_N= N (t); @@ -853,7 +853,7 @@ tree_utf8_to_herk (tree_u8 t) { tree_u8 tree_herk_to_utf8 (tree t) { if (is_atomic (t)) { - return tree (herk_to_utf8 (t->label)); + return tree (lolly::data::herk_to_utf8 (t->label)); } else if (!is_func (t, RAW_DATA)) { int t_N= N (t); diff --git a/src/Graphics/Fonts/smart_font.cpp b/src/Graphics/Fonts/smart_font.cpp index 2e69e624a1..1b60f91d14 100644 --- a/src/Graphics/Fonts/smart_font.cpp +++ b/src/Graphics/Fonts/smart_font.cpp @@ -24,6 +24,7 @@ #include "translator.hpp" #include "unicode.hpp" +#include #include #include @@ -861,7 +862,8 @@ smart_font_rep::advance (string s, int& pos, string& r, int& nr) { debug_fonts << "Advance for font of [" << s << "] " << this->res_name << " math_kind: " << math_kind << LF; debug_fonts << "Physical font of [" << r << "]" - << "[" << herk_to_utf8 (r) << "][" << cork_to_utf8 (r) << "]" + << "[" << lolly::data::herk_to_utf8 (r) << "][" + << cork_to_utf8 (r) << "]" << " is " << fn[nr]->res_name << LF; } } diff --git a/src/Plugins/Tex/fromtex_post.cpp 
b/src/Plugins/Tex/fromtex_post.cpp index 151d9e0cef..774c2a70af 100644 --- a/src/Plugins/Tex/fromtex_post.cpp +++ b/src/Plugins/Tex/fromtex_post.cpp @@ -21,6 +21,7 @@ #include "tree_modify.hpp" #include "url.hpp" +#include #include #include @@ -908,7 +909,7 @@ finalize_layout (tree t) { (((i + 1) == n) || (t[i + 1] != tree (END, "verbatim")))) r << tree (FORMAT, "new line"); else if (is_atomic (t[i])) { - r << utf8_to_herk (t[i]->label); + r << lolly::data::utf8_to_herk (t[i]->label); } else { r << t[i]; diff --git a/src/Scheme/L2/glue_lolly.lua b/src/Scheme/L2/glue_lolly.lua index 454f815c48..fdeaf35bf0 100644 --- a/src/Scheme/L2/glue_lolly.lua +++ b/src/Scheme/L2/glue_lolly.lua @@ -819,6 +819,22 @@ function main() "int" } }, + { + scm_name = "utf8->herk", + cpp_name = "lolly::data::utf8_to_herk", + ret_type = "string", + arg_list = { + "string" + } + }, + { + scm_name = "herk->utf8", + cpp_name = "lolly::data::herk_to_utf8", + ret_type = "string", + arg_list = { + "string" + } + }, -- lolly/system { scm_name = "system", diff --git a/src/Scheme/L2/init_glue_l2.cpp b/src/Scheme/L2/init_glue_l2.cpp index 2efd1c8d47..15ee5e6338 100644 --- a/src/Scheme/L2/init_glue_l2.cpp +++ b/src/Scheme/L2/init_glue_l2.cpp @@ -22,6 +22,7 @@ #include "tree.hpp" #include +#include #include #include #include diff --git a/src/Scheme/L3/glue_moebius.lua b/src/Scheme/L3/glue_moebius.lua index 7933229947..4f4c76a7dd 100644 --- a/src/Scheme/L3/glue_moebius.lua +++ b/src/Scheme/L3/glue_moebius.lua @@ -54,22 +54,6 @@ function main() arg_list = { "string" } - }, - { - scm_name = "utf8->herk", - cpp_name = "utf8_to_herk", - ret_type = "string", - arg_list = { - "string" - } - }, - { - scm_name = "herk->utf8", - cpp_name = "herk_to_utf8", - ret_type = "string", - arg_list = { - "string" - } }, -- routines for strings in the TeXmacs encoding { diff --git a/src/System/Language/text_language.cpp b/src/System/Language/text_language.cpp index a573e1acb3..efc66c7ca3 100644 --- 
a/src/System/Language/text_language.cpp +++ b/src/System/Language/text_language.cpp @@ -17,6 +17,8 @@ #include "sys_utils.hpp" #include "tm_configure.hpp" +#include + /****************************************************************************** * Western text languages / 8 bit charset ******************************************************************************/ @@ -493,10 +495,10 @@ chinese_language_rep::chinese_language_rep (string lan_name) << string ("“") << string ("‘"); for (int i= 0; i < N (full_width_do_not_start); i++) { - do_not_start << utf8_to_herk (full_width_do_not_start[i]); + do_not_start << lolly::data::utf8_to_herk (full_width_do_not_start[i]); } for (int i= 0; i < N (full_width_do_not_end); i++) { - do_not_end << utf8_to_herk (full_width_do_not_end[i]); + do_not_end << lolly::data::utf8_to_herk (full_width_do_not_end[i]); } // special full width characters diff --git a/tests/Data/String/converter_test.cpp b/tests/Data/String/converter_test.cpp index 678a5dbfb9..08e509a5e6 100644 --- a/tests/Data/String/converter_test.cpp +++ b/tests/Data/String/converter_test.cpp @@ -21,112 +21,6 @@ private slots: void init () { init_lolly (); }; void test_utf8_to_cork (); void test_cork_to_utf8 (); - void test_utf8_to_herk (); - void test_herk_to_utf8 (); -}; - -// Helper: encode a Unicode code point as a UTF-8 byte string. -static string -utf8_from_code (int code) { - if (code < 0x80) return string ((char) code); - else if (code < 0x800) { - return string ((char) (0xC0 | (code >> 6))) * - string ((char) (0x80 | (code & 0x3F))); - } - else { - return string ((char) (0xE0 | (code >> 12))) * - string ((char) (0x80 | ((code >> 6) & 0x3F))) * - string ((char) (0x80 | (code & 0x3F))); - } -} - -// Helper: build a single-byte Herk string, even for the NUL byte. -static string -herk_byte (int byte) { - return string ((char) byte, 1); -} - -// Helper: build a <#XXXX> hexadecimal escape for a Unicode code point. 
-static string -hex_escape (int code) { - static const char* digits= "0123456789ABCDEF"; - if (code < 16) { - char buf[6]= {'<', '#', '0', digits[code], '>', '\0'}; - return string (buf); - } - else if (code < 256) { - char buf[7]= {'<', '#', digits[code >> 4], digits[code & 0xF], '>', '\0'}; - return string (buf); - } - else { - char buf[9]; - int n = 0; - int tmp= code; - while (tmp > 0) { - buf[n++]= digits[tmp & 0xF]; - tmp>>= 4; - } - string r= "<#"; - for (int i= n - 1; i >= 0; i--) - r << buf[i]; - r << ">"; - return r; - } -} - -// Expected Unicode code point for each Herk byte 0x00..0xFF. -static const int herk_to_utf8_code[256]= { - 0x0060, 0x00B4, 0x02C6, 0x02DC, 0x00A8, 0x02DD, 0x02DA, 0x02C7, 0x02D8, - 0x00AF, 0x02D9, 0x00B8, 0x02DB, 0x201A, 0x2039, 0x203A, 0x201C, 0x201D, - 0x201E, 0x00AB, 0x00BB, 0x2013, 0x2014, 0x200B, 0x2080, 0x0131, 0x0237, - 0xFB00, 0xFB01, 0xFB02, 0xFB03, 0xFB04, 0x0020, 0x0021, 0x0022, 0x0023, - 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, - 0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, - 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, - 0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, - 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, - 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, - 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x2018, 0x0061, 0x0062, - 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, - 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, - 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, - 0x007E, 0x00AD, 0x0102, 0x0104, 0x0106, 0x010C, 0x010E, 0x011A, 0x0118, - 0x011E, 0x0139, 0x013D, 0x0141, 0x0143, 0x0147, 0x014A, 0x0150, 0x0154, - 0x0158, 0x015A, 0x0160, 0x015E, 0x0164, 0x0162, 0x0170, 0x016E, 0x0178, - 0x0179, 0x017D, 0x017B, 0x0132, 0x0130, 0x0111, 0x00A7, 0x0103, 0x0105, - 0x0107, 0x010D, 
0x010F, 0x011B, 0x0119, 0x011F, 0x013A, 0x013E, 0x0142, - 0x0144, 0x0148, 0x014B, 0x0151, 0x0155, 0x0159, 0x015B, 0x0161, 0x015F, - 0x0165, 0x0163, 0x0171, 0x016F, 0x00FF, 0x017A, 0x017E, 0x017C, 0x0133, - 0x00A1, 0x00BF, 0x00A3, 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, - 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, - 0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0152, - 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x1E9E, 0x00E0, - 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, - 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x00F0, 0x00F1, 0x00F2, - 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0153, 0x00F8, 0x00F9, 0x00FA, 0x00FB, - 0x00FC, 0x00FD, 0x00FE, 0x00DF, -}; - -// Reverse mapping: Unicode code point 0x00..0xFF -> Herk byte, -1 if none. -static const int utf8_to_herk_byte[256]= { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, - 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 0, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 189, -1, 191, -1, - -1, -1, 159, 4, -1, -1, 19, -1, 127, -1, 9, -1, -1, -1, -1, - 1, -1, -1, -1, 11, -1, -1, 20, -1, -1, -1, 190, 192, 193, 194, - 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, - 210, 211, 212, 213, 214, -1, 216, 217, 218, 219, 220, 221, 222, 255, 224, - 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, -1, 248, 
249, 250, 251, 252, 253, 254, - 184, }; void @@ -143,77 +37,5 @@ TestConverter::test_cork_to_utf8 () { qcompare (cork_to_utf8 ("\x11"), "”"); } -void -TestConverter::test_utf8_to_herk () { - // U+0000..U+00FF are covered exhaustively. - for (int code= 0; code <= 0xFF; code++) { - int mapped= utf8_to_herk_byte[code]; - string input = utf8_from_code (code); - if (mapped > 0) { - qcompare (utf8_to_herk (input), herk_byte (mapped)); - } - else if (mapped == 0) { - // U+0060 (`) maps to Herk byte 0x00; qcompare cannot check NUL bytes. - string r= utf8_to_herk (input); - QCOMPARE (N (r), 1); - QCOMPARE ((int) (unsigned char) r[0], 0); - } - else if (code < 32 || code >= 128) { - qcompare (utf8_to_herk (input), hex_escape (code)); - } - else { - // U+007F is the only unmapped code point in 0x20..0x7F. - qcompare (utf8_to_herk (input), input); - } - } - - // Characters outside the Cork range use <#XXXX> escapes. - qcompare (utf8_to_herk ("—"), herk_byte (0x16)); // U+2014 has a mapping - qcompare (utf8_to_herk ("’"), "<#2019>"); // U+2019 has no mapping - qcompare (utf8_to_herk ("中"), "<#4E2D>"); // CJK - qcompare (utf8_to_herk ("。"), "<#3002>"); // CJK punctuation - - // <#XXXX> and named escapes pass through unchanged. - qcompare (utf8_to_herk ("<#FF>"), "<#FF>"); - qcompare (utf8_to_herk ("<#0FF>"), "<#0FF>"); - qcompare (utf8_to_herk ("<#00FF>"), "<#00FF>"); - qcompare (utf8_to_herk (""), ""); - - // Mixed strings. - qcompare (utf8_to_herk ("A中B"), - herk_byte (0x41) * string ("<#4E2D>") * herk_byte (0x42)); -} - -void -TestConverter::test_herk_to_utf8 () { - // Every Herk byte 0x00..0xFF must decode to the expected Unicode code point. - for (int byte= 0; byte < 256; byte++) - qcompare (herk_to_utf8 (herk_byte (byte)), - utf8_from_code (herk_to_utf8_code[byte])); - - // <#XXXX> escapes are parsed literally as Unicode code points. 
- qcompare (herk_to_utf8 ("<#0F>"), utf8_from_code (0x0F)); - qcompare (herk_to_utf8 ("<#10>"), utf8_from_code (0x10)); - qcompare (herk_to_utf8 ("<#1F>"), utf8_from_code (0x1F)); - qcompare (herk_to_utf8 ("<#FF>"), "ÿ"); - qcompare (herk_to_utf8 ("<#0FF>"), "ÿ"); - qcompare (herk_to_utf8 ("<#00FF>"), "ÿ"); - qcompare (herk_to_utf8 ("<#2019>"), "’"); - qcompare (herk_to_utf8 ("<#4E2D>"), "中"); - - // NUL cannot be compared with qcompare because as_charp truncates at '\0'. - string r= herk_to_utf8 ("<#00>"); - QCOMPARE (N (r), 1); - QCOMPARE ((int) (unsigned char) r[0], 0); - - // Named escapes pass through unchanged. - qcompare (herk_to_utf8 (""), ""); - - // Mixed internal bytes and escapes. - qcompare ( - herk_to_utf8 (herk_byte (0x41) * string ("<#2019>") * herk_byte (0x42)), - "A’B"); -} - QTEST_MAIN (TestConverter) #include "converter_test.moc"