From a642c48a38972d73e971d00c977dec8cc38f0ec5 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Tue, 10 Oct 2023 16:22:33 -0700 Subject: [PATCH 1/2] wip --- .github/workflows/rust.yml | 12 +- Cargo.toml | 3 + src/case/mod.rs | 56 +++++ src/case/search.rs | 66 +++++ src/case/table.rs | 236 ++++++++++++++++++ src/lib.rs | 12 +- src/uniprop.rs | 235 ++++++++++++++++++ tabgen/Cargo.toml | 10 + tabgen/README.md | 36 +++ tabgen/src/gen.rs | 496 +++++++++++++++++++++++++++++++++++++ tabgen/src/main.rs | 38 +++ 11 files changed, 1183 insertions(+), 17 deletions(-) create mode 100644 src/case/mod.rs create mode 100644 src/case/search.rs create mode 100644 src/case/table.rs create mode 100644 src/uniprop.rs create mode 100644 tabgen/Cargo.toml create mode 100644 tabgen/README.md create mode 100644 tabgen/src/gen.rs create mode 100644 tabgen/src/main.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3dab2e4..87b19ea 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -48,17 +48,13 @@ jobs: args: -- --check clippy: - name: Clippy + name: Check unicode tables runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable + toolchain: nightly override: true - components: clippy - - uses: actions-rs/clippy-check@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - args: -- -D warnings + - run: cargo run -p tabgen --release -- diff --git a/Cargo.toml b/Cargo.toml index 54b1723..b061a01 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,3 +21,6 @@ harness = false [features] default = [] nightly = [] + +[workspace] +members = [".", "tabgen"] diff --git a/src/case/mod.rs b/src/case/mod.rs new file mode 100644 index 0000000..7886838 --- /dev/null +++ b/src/case/mod.rs @@ -0,0 +1,56 @@ +mod search; +mod table; +use table::ChangesWhenTableType; +#[inline] +pub(super) fn changes_when_lowercased(c: char) -> bool { + if c.is_ascii() { + 
c.is_ascii_uppercase() + } else { + search::changes_when_casemapped_nonascii::<true>( + c, + table::CHANGES_WHEN_LOOKUP_TAB, + ) + } +} +#[inline] +pub(super) fn changes_when_uppercased(c: char) -> bool { + if c.is_ascii() { + c.is_ascii_lowercase() + } else { + search::changes_when_casemapped_nonascii::<false>( + c, + table::CHANGES_WHEN_LOOKUP_TAB, + ) + } +} + +#[cfg(test)] +mod test { + #[test] + fn test_paranoia() { + if core::char::UNICODE_VERSION != super::table::UNICODE_VERSION { + return; + } + for c in '\0'..=char::MAX { + let lower = changes_when_lowercased_refimpl(c); + let upper = changes_when_uppercased_refimpl(c); + let fancy_lower = super::changes_when_lowercased(c); + let fancy_upper = super::changes_when_uppercased(c); + assert_eq!( + (lower, upper), + (fancy_lower, fancy_upper), + "wrong for {:?} (U+{:04x})", + c, + c as u32 + ); + } + } + + fn changes_when_lowercased_refimpl(c: char) -> bool { + !core::iter::once(c).eq(c.to_lowercase()) + } + + fn changes_when_uppercased_refimpl(c: char) -> bool { + !core::iter::once(c).eq(c.to_uppercase()) + } +} diff --git a/src/case/search.rs b/src/case/search.rs new file mode 100644 index 0000000..255ea6f --- /dev/null +++ b/src/case/search.rs @@ -0,0 +1,66 @@ +pub(super) fn changes_when_casemapped_nonascii<const MAP_LOWER: bool>( + needle: char, + tab: &super::ChangesWhenTableType, +) -> bool { + let Some(enc) = find_encoded_case_range(needle, tab) else { + return false; + }; + const RK_UNIFORM_UPPER: u32 = 0; + const RK_UNIFORM_LOWER: u32 = 1; + const RK_ALT_UPPER_LOWER: u32 = 2; + const RK_ALT_LOWER_UPPER: u32 = 3; + const RK_UNIFORM_BOTH: u32 = 4; + + let range_st = enc >> 11; + let range_len = enc & 0xff; + let range_kind = (enc >> 8) & 0x7; + debug_assert!(range_kind <= 4); + let map_lower = MAP_LOWER; + let map_upper = !MAP_LOWER; + match range_kind { + RK_UNIFORM_BOTH => true, + RK_UNIFORM_UPPER => map_upper, + RK_UNIFORM_LOWER => map_lower, + RK_ALT_UPPER_LOWER | RK_ALT_LOWER_UPPER => { + let offset = needle as u32 - range_st; +
debug_assert!(offset <= range_len); + let odd = (offset & 1) != 0; + let odd_is_lower = range_kind == RK_ALT_UPPER_LOWER; + if MAP_LOWER { + odd_is_lower == odd + } else { + odd_is_lower == !odd + } + // match (range_kind == RK_ALT_UPPER_LOWER, MAP_LOWER) { + // (true, true) | (false, true) => !odd, + // (true, false) | (false, false) => odd, + // _ => false, + // } + } + rk => { + debug_assert!(false, "bad rangekind {:?}", rk); + false + } + } +} + +pub(super) fn find_encoded_case_range( + needle: char, + ranges: &super::ChangesWhenTableType, +) -> Option<u32> { + let pos = ranges.binary_search_by(|&entry| { + let range_st = entry >> 11; + let range_len = entry & 0xff; + if range_st > (needle as u32) { + core::cmp::Ordering::Greater + } else if (range_st + range_len) <= (needle as u32) { + core::cmp::Ordering::Less + } else { + core::cmp::Ordering::Equal + } + }); + match pos { + Err(_) => None, + Ok(n) => Some(ranges[n]), + } +} diff --git a/src/case/table.rs b/src/case/table.rs new file mode 100644 index 0000000..069a428 --- /dev/null +++ b/src/case/table.rs @@ -0,0 +1,236 @@ +// 225 ranges / 900 bytes.
per-rangetype stats: +// - RK_UNIFORM_UPPER, ranges=85, chars=914 +// - RK_UNIFORM_LOWER, ranges=65, chars=815 +// - RK_ALT_UPPER_LOWER, ranges=20, chars=378 +// - RK_ALT_LOWER_UPPER, ranges=45, chars=737 +// - RK_UNIFORM_BOTH, ranges=10, chars=31 +pub(crate) const CHANGES_WHEN_LOOKUP_TAB: &[u32; 225] = &[ + 0x0005a801, // RK_UNIFORM_UPPER,len=1 : U+00b5 ('µ') + 0x00060117, // RK_UNIFORM_LOWER,len=23 : U+00c0..=U+00d6U+00c0..=U+00d6 ('À'..='Ö') + 0x0006c107, // RK_UNIFORM_LOWER,len=7 : U+00d8..=U+00deU+00d8..=U+00de ('Ø'..='Þ') + 0x0006f818, // RK_UNIFORM_UPPER,len=24 : U+00df..=U+00f6U+00df..=U+00f6 ('ß'..='ö') + 0x0007c008, // RK_UNIFORM_UPPER,len=8 : U+00f8..=U+00ffU+00f8..=U+00ff ('ø'..='ÿ') + 0x00080338, // RK_ALT_LOWER_UPPER,len=56 : U+0100..=U+0137U+0100..=U+0137 ('Ā'..='ķ') + 0x0009cb10, // RK_ALT_LOWER_UPPER,len=16 : U+0139..=U+0148U+0139..=U+0148 ('Ĺ'..='ň') + 0x000a4a30, // RK_ALT_UPPER_LOWER,len=48 : U+0149..=U+0178U+0149..=U+0178 ('ʼn'..='Ÿ') + 0x000bcb06, // RK_ALT_LOWER_UPPER,len=6 : U+0179..=U+017eU+0179..=U+017e ('Ź'..='ž') + 0x000bf802, // RK_UNIFORM_UPPER,len=2 : U+017f..=U+0180U+017f..=U+0180 ('ſ'..='ƀ') + 0x000c0902, // RK_UNIFORM_LOWER,len=2 : U+0181..=U+0182U+0181..=U+0182 ('Ɓ'..='Ƃ') + 0x000c1a04, // RK_ALT_UPPER_LOWER,len=4 : U+0183..=U+0186U+0183..=U+0186 ('ƃ'..='Ɔ') + 0x000c3b03, // RK_ALT_LOWER_UPPER,len=3 : U+0187..=U+0189U+0187..=U+0189 ('Ƈ'..='Ɖ') + 0x000c5102, // RK_UNIFORM_LOWER,len=2 : U+018a..=U+018bU+018a..=U+018b ('Ɗ'..='Ƌ') + 0x000c6001, // RK_UNIFORM_UPPER,len=1 : U+018c ('ƌ') + 0x000c7104, // RK_UNIFORM_LOWER,len=4 : U+018e..=U+0191U+018e..=U+0191 ('Ǝ'..='Ƒ') + 0x000c9202, // RK_ALT_UPPER_LOWER,len=2 : U+0192..=U+0193U+0192..=U+0193 ('ƒ'..='Ɠ') + 0x000ca303, // RK_ALT_LOWER_UPPER,len=3 : U+0194..=U+0196U+0194..=U+0196 ('Ɣ'..='Ɩ') + 0x000cb902, // RK_UNIFORM_LOWER,len=2 : U+0197..=U+0198U+0197..=U+0198 ('Ɨ'..='Ƙ') + 0x000cc802, // RK_UNIFORM_UPPER,len=2 : U+0199..=U+019aU+0199..=U+019a ('ƙ'..='ƚ') + 0x000ce102, // 
RK_UNIFORM_LOWER,len=2 : U+019c..=U+019dU+019c..=U+019d ('Ɯ'..='Ɲ') + 0x000cf202, // RK_ALT_UPPER_LOWER,len=2 : U+019e..=U+019fU+019e..=U+019f ('ƞ'..='Ɵ') + 0x000d0307, // RK_ALT_LOWER_UPPER,len=7 : U+01a0..=U+01a6U+01a0..=U+01a6 ('Ơ'..='Ʀ') + 0x000d3b03, // RK_ALT_LOWER_UPPER,len=3 : U+01a7..=U+01a9U+01a7..=U+01a9 ('Ƨ'..='Ʃ') + 0x000d6303, // RK_ALT_LOWER_UPPER,len=3 : U+01ac..=U+01aeU+01ac..=U+01ae ('Ƭ'..='Ʈ') + 0x000d7b03, // RK_ALT_LOWER_UPPER,len=3 : U+01af..=U+01b1U+01af..=U+01b1 ('Ư'..='Ʊ') + 0x000d9102, // RK_UNIFORM_LOWER,len=2 : U+01b2..=U+01b3U+01b2..=U+01b3 ('Ʋ'..='Ƴ') + 0x000da204, // RK_ALT_UPPER_LOWER,len=4 : U+01b4..=U+01b7U+01b4..=U+01b7 ('ƴ'..='Ʒ') + 0x000dc302, // RK_ALT_LOWER_UPPER,len=2 : U+01b8..=U+01b9U+01b8..=U+01b9 ('Ƹ'..='ƹ') + 0x000de302, // RK_ALT_LOWER_UPPER,len=2 : U+01bc..=U+01bdU+01bc..=U+01bd ('Ƽ'..='ƽ') + 0x000df801, // RK_UNIFORM_UPPER,len=1 : U+01bf ('ƿ') + 0x000e2101, // RK_UNIFORM_LOWER,len=1 : U+01c4 ('DŽ') + 0x000e2c01, // RK_UNIFORM_BOTH,len=1 : U+01c5 ('Dž') + 0x000e3202, // RK_ALT_UPPER_LOWER,len=2 : U+01c6..=U+01c7U+01c6..=U+01c7 ('dž'..='LJ') + 0x000e4401, // RK_UNIFORM_BOTH,len=1 : U+01c8 ('Lj') + 0x000e4a02, // RK_ALT_UPPER_LOWER,len=2 : U+01c9..=U+01caU+01c9..=U+01ca ('lj'..='NJ') + 0x000e5c01, // RK_UNIFORM_BOTH,len=1 : U+01cb ('Nj') + 0x000e6211, // RK_ALT_UPPER_LOWER,len=17 : U+01cc..=U+01dcU+01cc..=U+01dc ('nj'..='ǜ') + 0x000eea13, // RK_ALT_UPPER_LOWER,len=19 : U+01dd..=U+01efU+01dd..=U+01ef ('ǝ'..='ǯ') + 0x000f8202, // RK_ALT_UPPER_LOWER,len=2 : U+01f0..=U+01f1U+01f0..=U+01f1 ('ǰ'..='DZ') + 0x000f9401, // RK_UNIFORM_BOTH,len=1 : U+01f2 ('Dz') + 0x000f9a04, // RK_ALT_UPPER_LOWER,len=4 : U+01f3..=U+01f6U+01f3..=U+01f6 ('dz'..='Ƕ') + 0x000fb902, // RK_UNIFORM_LOWER,len=2 : U+01f7..=U+01f8U+01f7..=U+01f8 ('Ƿ'..='Ǹ') + 0x000fca28, // RK_ALT_UPPER_LOWER,len=40 : U+01f9..=U+0220U+01f9..=U+0220 ('ǹ'..='Ƞ') + 0x00111312, // RK_ALT_LOWER_UPPER,len=18 : U+0222..=U+0233U+0222..=U+0233 ('Ȣ'..='ȳ') + 0x0011d102, // 
RK_UNIFORM_LOWER,len=2 : U+023a..=U+023bU+023a..=U+023b ('Ⱥ'..='Ȼ') + 0x0011e202, // RK_ALT_UPPER_LOWER,len=2 : U+023c..=U+023dU+023c..=U+023d ('ȼ'..='Ƚ') + 0x0011f302, // RK_ALT_LOWER_UPPER,len=2 : U+023e..=U+023fU+023e..=U+023f ('Ⱦ'..='ȿ') + 0x00120204, // RK_ALT_UPPER_LOWER,len=4 : U+0240..=U+0243U+0240..=U+0243 ('ɀ'..='Ƀ') + 0x00122103, // RK_UNIFORM_LOWER,len=3 : U+0244..=U+0246U+0244..=U+0246 ('Ʉ'..='Ɇ') + 0x00123a09, // RK_ALT_UPPER_LOWER,len=9 : U+0247..=U+024fU+0247..=U+024f ('ɇ'..='ɏ') + 0x00128005, // RK_UNIFORM_UPPER,len=5 : U+0250..=U+0254U+0250..=U+0254 ('ɐ'..='ɔ') + 0x0012b002, // RK_UNIFORM_UPPER,len=2 : U+0256..=U+0257U+0256..=U+0257 ('ɖ'..='ɗ') + 0x0012c801, // RK_UNIFORM_UPPER,len=1 : U+0259 ('ə') + 0x0012d802, // RK_UNIFORM_UPPER,len=2 : U+025b..=U+025cU+025b..=U+025c ('ɛ'..='ɜ') + 0x00130002, // RK_UNIFORM_UPPER,len=2 : U+0260..=U+0261U+0260..=U+0261 ('ɠ'..='ɡ') + 0x00131801, // RK_UNIFORM_UPPER,len=1 : U+0263 ('ɣ') + 0x00132802, // RK_UNIFORM_UPPER,len=2 : U+0265..=U+0266U+0265..=U+0266 ('ɥ'..='ɦ') + 0x00134005, // RK_UNIFORM_UPPER,len=5 : U+0268..=U+026cU+0268..=U+026c ('ɨ'..='ɬ') + 0x00137801, // RK_UNIFORM_UPPER,len=1 : U+026f ('ɯ') + 0x00138802, // RK_UNIFORM_UPPER,len=2 : U+0271..=U+0272U+0271..=U+0272 ('ɱ'..='ɲ') + 0x0013a801, // RK_UNIFORM_UPPER,len=1 : U+0275 ('ɵ') + 0x0013e801, // RK_UNIFORM_UPPER,len=1 : U+027d ('ɽ') + 0x00140001, // RK_UNIFORM_UPPER,len=1 : U+0280 ('ʀ') + 0x00141002, // RK_UNIFORM_UPPER,len=2 : U+0282..=U+0283U+0282..=U+0283 ('ʂ'..='ʃ') + 0x00143806, // RK_UNIFORM_UPPER,len=6 : U+0287..=U+028cU+0287..=U+028c ('ʇ'..='ʌ') + 0x00149001, // RK_UNIFORM_UPPER,len=1 : U+0292 ('ʒ') + 0x0014e802, // RK_UNIFORM_UPPER,len=2 : U+029d..=U+029eU+029d..=U+029e ('ʝ'..='ʞ') + 0x001a2801, // RK_UNIFORM_UPPER,len=1 : U+0345 ('\u{345}') + 0x001b8304, // RK_ALT_LOWER_UPPER,len=4 : U+0370..=U+0373U+0370..=U+0373 ('Ͱ'..='ͳ') + 0x001bb302, // RK_ALT_LOWER_UPPER,len=2 : U+0376..=U+0377U+0376..=U+0377 ('Ͷ'..='ͷ') + 0x001bd803, // 
RK_UNIFORM_UPPER,len=3 : U+037b..=U+037dU+037b..=U+037d ('ͻ'..='ͽ') + 0x001bf901, // RK_UNIFORM_LOWER,len=1 : U+037f ('Ϳ') + 0x001c3101, // RK_UNIFORM_LOWER,len=1 : U+0386 ('Ά') + 0x001c4103, // RK_UNIFORM_LOWER,len=3 : U+0388..=U+038aU+0388..=U+038a ('Έ'..='Ί') + 0x001c6101, // RK_UNIFORM_LOWER,len=1 : U+038c ('Ό') + 0x001c7102, // RK_UNIFORM_LOWER,len=2 : U+038e..=U+038fU+038e..=U+038f ('Ύ'..='Ώ') + 0x001c8202, // RK_ALT_UPPER_LOWER,len=2 : U+0390..=U+0391U+0390..=U+0391 ('ΐ'..='Α') + 0x001c9110, // RK_UNIFORM_LOWER,len=16 : U+0392..=U+03a1U+0392..=U+03a1 ('Β'..='Ρ') + 0x001d1909, // RK_UNIFORM_LOWER,len=9 : U+03a3..=U+03abU+03a3..=U+03ab ('Σ'..='Ϋ') + 0x001d6023, // RK_UNIFORM_UPPER,len=35 : U+03ac..=U+03ceU+03ac..=U+03ce ('ά'..='ώ') + 0x001e7b02, // RK_ALT_LOWER_UPPER,len=2 : U+03cf..=U+03d0U+03cf..=U+03d0 ('Ϗ'..='ϐ') + 0x001e8801, // RK_UNIFORM_UPPER,len=1 : U+03d1 ('ϑ') + 0x001ea803, // RK_UNIFORM_UPPER,len=3 : U+03d5..=U+03d7U+03d5..=U+03d7 ('ϕ'..='ϗ') + 0x001ec318, // RK_ALT_LOWER_UPPER,len=24 : U+03d8..=U+03efU+03d8..=U+03ef ('Ϙ'..='ϯ') + 0x001f8004, // RK_UNIFORM_UPPER,len=4 : U+03f0..=U+03f3U+03f0..=U+03f3 ('ϰ'..='ϳ') + 0x001fa302, // RK_ALT_LOWER_UPPER,len=2 : U+03f4..=U+03f5U+03f4..=U+03f5 ('ϴ'..='ϵ') + 0x001fbb03, // RK_ALT_LOWER_UPPER,len=3 : U+03f7..=U+03f9U+03f7..=U+03f9 ('Ϸ'..='Ϲ') + 0x001fd302, // RK_ALT_LOWER_UPPER,len=2 : U+03fa..=U+03fbU+03fa..=U+03fb ('Ϻ'..='ϻ') + 0x001fe933, // RK_UNIFORM_LOWER,len=51 : U+03fd..=U+042fU+03fd..=U+042f ('Ͻ'..='Я') + 0x00218030, // RK_UNIFORM_UPPER,len=48 : U+0430..=U+045fU+0430..=U+045f ('а'..='џ') + 0x00230322, // RK_ALT_LOWER_UPPER,len=34 : U+0460..=U+0481U+0460..=U+0481 ('Ѡ'..='ҁ') + 0x00245337, // RK_ALT_LOWER_UPPER,len=55 : U+048a..=U+04c0U+048a..=U+04c0 ('Ҋ'..='Ӏ') + 0x00260b0e, // RK_ALT_LOWER_UPPER,len=14 : U+04c1..=U+04ceU+04c1..=U+04ce ('Ӂ'..='ӎ') + 0x00267a61, // RK_ALT_UPPER_LOWER,len=97 : U+04cf..=U+052fU+04cf..=U+052f ('ӏ'..='ԯ') + 0x00298926, // RK_UNIFORM_LOWER,len=38 : 
U+0531..=U+0556U+0531..=U+0556 ('Ա'..='Ֆ') + 0x002b0827, // RK_UNIFORM_UPPER,len=39 : U+0561..=U+0587U+0561..=U+0587 ('ա'..='և') + 0x00850126, // RK_UNIFORM_LOWER,len=38 : U+10a0..=U+10c5U+10a0..=U+10c5 ('Ⴀ'..='Ⴥ') + 0x00863901, // RK_UNIFORM_LOWER,len=1 : U+10c7 ('Ⴧ') + 0x00866901, // RK_UNIFORM_LOWER,len=1 : U+10cd ('Ⴭ') + 0x0086802b, // RK_UNIFORM_UPPER,len=43 : U+10d0..=U+10faU+10d0..=U+10fa ('ა'..='ჺ') + 0x0087e803, // RK_UNIFORM_UPPER,len=3 : U+10fd..=U+10ffU+10fd..=U+10ff ('ჽ'..='ჿ') + 0x009d0156, // RK_UNIFORM_LOWER,len=86 : U+13a0..=U+13f5U+13a0..=U+13f5 ('Ꭰ'..='Ᏽ') + 0x009fc006, // RK_UNIFORM_UPPER,len=6 : U+13f8..=U+13fdU+13f8..=U+13fd ('ᏸ'..='ᏽ') + 0x00e40009, // RK_UNIFORM_UPPER,len=9 : U+1c80..=U+1c88U+1c80..=U+1c88 ('ᲀ'..='ᲈ') + 0x00e4812b, // RK_UNIFORM_LOWER,len=43 : U+1c90..=U+1cbaU+1c90..=U+1cba ('Ა'..='Ჺ') + 0x00e5e903, // RK_UNIFORM_LOWER,len=3 : U+1cbd..=U+1cbfU+1cbd..=U+1cbf ('Ჽ'..='Ჿ') + 0x00ebc801, // RK_UNIFORM_UPPER,len=1 : U+1d79 ('ᵹ') + 0x00ebe801, // RK_UNIFORM_UPPER,len=1 : U+1d7d ('ᵽ') + 0x00ec7001, // RK_UNIFORM_UPPER,len=1 : U+1d8e ('ᶎ') + 0x00f00396, // RK_ALT_LOWER_UPPER,len=150 : U+1e00..=U+1e95U+1e00..=U+1e95 ('Ḁ'..='ẕ') + 0x00f4b006, // RK_UNIFORM_UPPER,len=6 : U+1e96..=U+1e9bU+1e96..=U+1e9b ('ẖ'..='ẛ') + 0x00f4f101, // RK_UNIFORM_LOWER,len=1 : U+1e9e ('ẞ') + 0x00f50360, // RK_ALT_LOWER_UPPER,len=96 : U+1ea0..=U+1effU+1ea0..=U+1eff ('Ạ'..='ỿ') + 0x00f80008, // RK_UNIFORM_UPPER,len=8 : U+1f00..=U+1f07U+1f00..=U+1f07 ('ἀ'..='ἇ') + 0x00f84108, // RK_UNIFORM_LOWER,len=8 : U+1f08..=U+1f0fU+1f08..=U+1f0f ('Ἀ'..='Ἇ') + 0x00f88006, // RK_UNIFORM_UPPER,len=6 : U+1f10..=U+1f15U+1f10..=U+1f15 ('ἐ'..='ἕ') + 0x00f8c106, // RK_UNIFORM_LOWER,len=6 : U+1f18..=U+1f1dU+1f18..=U+1f1d ('Ἐ'..='Ἕ') + 0x00f90008, // RK_UNIFORM_UPPER,len=8 : U+1f20..=U+1f27U+1f20..=U+1f27 ('ἠ'..='ἧ') + 0x00f94108, // RK_UNIFORM_LOWER,len=8 : U+1f28..=U+1f2fU+1f28..=U+1f2f ('Ἠ'..='Ἧ') + 0x00f98008, // RK_UNIFORM_UPPER,len=8 : U+1f30..=U+1f37U+1f30..=U+1f37 ('ἰ'..='ἷ') 
+ 0x00f9c108, // RK_UNIFORM_LOWER,len=8 : U+1f38..=U+1f3fU+1f38..=U+1f3f ('Ἰ'..='Ἷ') + 0x00fa0006, // RK_UNIFORM_UPPER,len=6 : U+1f40..=U+1f45U+1f40..=U+1f45 ('ὀ'..='ὅ') + 0x00fa4106, // RK_UNIFORM_LOWER,len=6 : U+1f48..=U+1f4dU+1f48..=U+1f4d ('Ὀ'..='Ὅ') + 0x00fa8008, // RK_UNIFORM_UPPER,len=8 : U+1f50..=U+1f57U+1f50..=U+1f57 ('ὐ'..='ὗ') + 0x00fac901, // RK_UNIFORM_LOWER,len=1 : U+1f59 ('Ὑ') + 0x00fad901, // RK_UNIFORM_LOWER,len=1 : U+1f5b ('Ὓ') + 0x00fae901, // RK_UNIFORM_LOWER,len=1 : U+1f5d ('Ὕ') + 0x00fafb02, // RK_ALT_LOWER_UPPER,len=2 : U+1f5f..=U+1f60U+1f5f..=U+1f60 ('Ὗ'..='ὠ') + 0x00fb0807, // RK_UNIFORM_UPPER,len=7 : U+1f61..=U+1f67U+1f61..=U+1f67 ('ὡ'..='ὧ') + 0x00fb4108, // RK_UNIFORM_LOWER,len=8 : U+1f68..=U+1f6fU+1f68..=U+1f6f ('Ὠ'..='Ὧ') + 0x00fb800e, // RK_UNIFORM_UPPER,len=14 : U+1f70..=U+1f7dU+1f70..=U+1f7d ('ὰ'..='ώ') + 0x00fc0008, // RK_UNIFORM_UPPER,len=8 : U+1f80..=U+1f87U+1f80..=U+1f87 ('ᾀ'..='ᾇ') + 0x00fc4408, // RK_UNIFORM_BOTH,len=8 : U+1f88..=U+1f8fU+1f88..=U+1f8f ('ᾈ'..='ᾏ') + 0x00fc8008, // RK_UNIFORM_UPPER,len=8 : U+1f90..=U+1f97U+1f90..=U+1f97 ('ᾐ'..='ᾗ') + 0x00fcc408, // RK_UNIFORM_BOTH,len=8 : U+1f98..=U+1f9fU+1f98..=U+1f9f ('ᾘ'..='ᾟ') + 0x00fd0008, // RK_UNIFORM_UPPER,len=8 : U+1fa0..=U+1fa7U+1fa0..=U+1fa7 ('ᾠ'..='ᾧ') + 0x00fd4408, // RK_UNIFORM_BOTH,len=8 : U+1fa8..=U+1fafU+1fa8..=U+1faf ('ᾨ'..='ᾯ') + 0x00fd8005, // RK_UNIFORM_UPPER,len=5 : U+1fb0..=U+1fb4U+1fb0..=U+1fb4 ('ᾰ'..='ᾴ') + 0x00fdb002, // RK_UNIFORM_UPPER,len=2 : U+1fb6..=U+1fb7U+1fb6..=U+1fb7 ('ᾶ'..='ᾷ') + 0x00fdc104, // RK_UNIFORM_LOWER,len=4 : U+1fb8..=U+1fbbU+1fb8..=U+1fbb ('Ᾰ'..='Ά') + 0x00fde401, // RK_UNIFORM_BOTH,len=1 : U+1fbc ('ᾼ') + 0x00fdf001, // RK_UNIFORM_UPPER,len=1 : U+1fbe ('ι') + 0x00fe1003, // RK_UNIFORM_UPPER,len=3 : U+1fc2..=U+1fc4U+1fc2..=U+1fc4 ('ῂ'..='ῄ') + 0x00fe3002, // RK_UNIFORM_UPPER,len=2 : U+1fc6..=U+1fc7U+1fc6..=U+1fc7 ('ῆ'..='ῇ') + 0x00fe4104, // RK_UNIFORM_LOWER,len=4 : U+1fc8..=U+1fcbU+1fc8..=U+1fcb ('Ὲ'..='Ή') + 0x00fe6401, // 
RK_UNIFORM_BOTH,len=1 : U+1fcc ('ῌ') + 0x00fe8004, // RK_UNIFORM_UPPER,len=4 : U+1fd0..=U+1fd3U+1fd0..=U+1fd3 ('ῐ'..='ΐ') + 0x00feb002, // RK_UNIFORM_UPPER,len=2 : U+1fd6..=U+1fd7U+1fd6..=U+1fd7 ('ῖ'..='ῗ') + 0x00fec104, // RK_UNIFORM_LOWER,len=4 : U+1fd8..=U+1fdbU+1fd8..=U+1fdb ('Ῐ'..='Ί') + 0x00ff0008, // RK_UNIFORM_UPPER,len=8 : U+1fe0..=U+1fe7U+1fe0..=U+1fe7 ('ῠ'..='ῧ') + 0x00ff4105, // RK_UNIFORM_LOWER,len=5 : U+1fe8..=U+1fecU+1fe8..=U+1fec ('Ῠ'..='Ῥ') + 0x00ff9003, // RK_UNIFORM_UPPER,len=3 : U+1ff2..=U+1ff4U+1ff2..=U+1ff4 ('ῲ'..='ῴ') + 0x00ffb002, // RK_UNIFORM_UPPER,len=2 : U+1ff6..=U+1ff7U+1ff6..=U+1ff7 ('ῶ'..='ῷ') + 0x00ffc104, // RK_UNIFORM_LOWER,len=4 : U+1ff8..=U+1ffbU+1ff8..=U+1ffb ('Ὸ'..='Ώ') + 0x00ffe401, // RK_UNIFORM_BOTH,len=1 : U+1ffc ('ῼ') + 0x01093101, // RK_UNIFORM_LOWER,len=1 : U+2126 ('Ω') + 0x01095102, // RK_UNIFORM_LOWER,len=2 : U+212a..=U+212bU+212a..=U+212b ('K'..='Å') + 0x01099101, // RK_UNIFORM_LOWER,len=1 : U+2132 ('Ⅎ') + 0x010a7001, // RK_UNIFORM_UPPER,len=1 : U+214e ('ⅎ') + 0x010b0110, // RK_UNIFORM_LOWER,len=16 : U+2160..=U+216fU+2160..=U+216f ('Ⅰ'..='Ⅿ') + 0x010b8010, // RK_UNIFORM_UPPER,len=16 : U+2170..=U+217fU+2170..=U+217f ('ⅰ'..='ⅿ') + 0x010c1b02, // RK_ALT_LOWER_UPPER,len=2 : U+2183..=U+2184U+2183..=U+2184 ('Ↄ'..='ↄ') + 0x0125b11a, // RK_UNIFORM_LOWER,len=26 : U+24b6..=U+24cfU+24b6..=U+24cf ('Ⓐ'..='Ⓩ') + 0x0126801a, // RK_UNIFORM_UPPER,len=26 : U+24d0..=U+24e9U+24d0..=U+24e9 ('ⓐ'..='ⓩ') + 0x01600130, // RK_UNIFORM_LOWER,len=48 : U+2c00..=U+2c2fU+2c00..=U+2c2f ('Ⰰ'..='Ⱟ') + 0x01618030, // RK_UNIFORM_UPPER,len=48 : U+2c30..=U+2c5fU+2c30..=U+2c5f ('ⰰ'..='ⱟ') + 0x01630303, // RK_ALT_LOWER_UPPER,len=3 : U+2c60..=U+2c62U+2c60..=U+2c62 ('Ⱡ'..='Ɫ') + 0x01631902, // RK_UNIFORM_LOWER,len=2 : U+2c63..=U+2c64U+2c63..=U+2c64 ('Ᵽ'..='Ɽ') + 0x01632802, // RK_UNIFORM_UPPER,len=2 : U+2c65..=U+2c66U+2c65..=U+2c66 ('ⱥ'..='ⱦ') + 0x01633b07, // RK_ALT_LOWER_UPPER,len=7 : U+2c67..=U+2c6dU+2c67..=U+2c6d ('Ⱨ'..='Ɑ') + 0x01637103, // 
RK_UNIFORM_LOWER,len=3 : U+2c6e..=U+2c70U+2c6e..=U+2c70 ('Ɱ'..='Ɒ') + 0x01639302, // RK_ALT_LOWER_UPPER,len=2 : U+2c72..=U+2c73U+2c72..=U+2c73 ('Ⱳ'..='ⱳ') + 0x0163ab02, // RK_ALT_LOWER_UPPER,len=2 : U+2c75..=U+2c76U+2c75..=U+2c76 ('Ⱶ'..='ⱶ') + 0x0163f103, // RK_UNIFORM_LOWER,len=3 : U+2c7e..=U+2c80U+2c7e..=U+2c80 ('Ȿ'..='Ⲁ') + 0x01640a63, // RK_ALT_UPPER_LOWER,len=99 : U+2c81..=U+2ce3U+2c81..=U+2ce3 ('ⲁ'..='ⳣ') + 0x01675b04, // RK_ALT_LOWER_UPPER,len=4 : U+2ceb..=U+2ceeU+2ceb..=U+2cee ('Ⳬ'..='ⳮ') + 0x01679302, // RK_ALT_LOWER_UPPER,len=2 : U+2cf2..=U+2cf3U+2cf2..=U+2cf3 ('Ⳳ'..='ⳳ') + 0x01680026, // RK_UNIFORM_UPPER,len=38 : U+2d00..=U+2d25U+2d00..=U+2d25 ('ⴀ'..='ⴥ') + 0x01693801, // RK_UNIFORM_UPPER,len=1 : U+2d27 ('ⴧ') + 0x01696801, // RK_UNIFORM_UPPER,len=1 : U+2d2d ('ⴭ') + 0x0532032e, // RK_ALT_LOWER_UPPER,len=46 : U+a640..=U+a66dU+a640..=U+a66d ('Ꙁ'..='ꙭ') + 0x0534031c, // RK_ALT_LOWER_UPPER,len=28 : U+a680..=U+a69bU+a680..=U+a69b ('Ꚁ'..='ꚛ') + 0x0539130e, // RK_ALT_LOWER_UPPER,len=14 : U+a722..=U+a72fU+a722..=U+a72f ('Ꜣ'..='ꜯ') + 0x0539933e, // RK_ALT_LOWER_UPPER,len=62 : U+a732..=U+a76fU+a732..=U+a76f ('Ꜳ'..='ꝯ') + 0x053bcb05, // RK_ALT_LOWER_UPPER,len=5 : U+a779..=U+a77dU+a779..=U+a77d ('Ꝺ'..='Ᵹ') + 0x053bf30a, // RK_ALT_LOWER_UPPER,len=10 : U+a77e..=U+a787U+a77e..=U+a787 ('Ꝿ'..='ꞇ') + 0x053c5b03, // RK_ALT_LOWER_UPPER,len=3 : U+a78b..=U+a78dU+a78b..=U+a78d ('Ꞌ'..='Ɥ') + 0x053c8304, // RK_ALT_LOWER_UPPER,len=4 : U+a790..=U+a793U+a790..=U+a793 ('Ꞑ'..='ꞓ') + 0x053ca001, // RK_UNIFORM_UPPER,len=1 : U+a794 ('ꞔ') + 0x053cb315, // RK_ALT_LOWER_UPPER,len=21 : U+a796..=U+a7aaU+a796..=U+a7aa ('Ꞗ'..='Ɦ') + 0x053d5904, // RK_UNIFORM_LOWER,len=4 : U+a7ab..=U+a7aeU+a7ab..=U+a7ae ('Ɜ'..='Ɪ') + 0x053d8105, // RK_UNIFORM_LOWER,len=5 : U+a7b0..=U+a7b4U+a7b0..=U+a7b4 ('Ʞ'..='Ꞵ') + 0x053daa10, // RK_ALT_UPPER_LOWER,len=16 : U+a7b5..=U+a7c4U+a7b5..=U+a7c4 ('ꞵ'..='Ꞔ') + 0x053e2903, // RK_UNIFORM_LOWER,len=3 : U+a7c5..=U+a7c7U+a7c5..=U+a7c7 ('Ʂ'..='Ꟈ') + 0x053e4203, // 
RK_ALT_UPPER_LOWER,len=3 : U+a7c8..=U+a7caU+a7c8..=U+a7ca ('ꟈ'..='ꟊ') + 0x053e8302, // RK_ALT_LOWER_UPPER,len=2 : U+a7d0..=U+a7d1U+a7d0..=U+a7d1 ('Ꟑ'..='ꟑ') + 0x053eb304, // RK_ALT_LOWER_UPPER,len=4 : U+a7d6..=U+a7d9U+a7d6..=U+a7d9 ('Ꟗ'..='ꟙ') + 0x053fab02, // RK_ALT_LOWER_UPPER,len=2 : U+a7f5..=U+a7f6U+a7f5..=U+a7f6 ('Ꟶ'..='ꟶ') + 0x055a9801, // RK_UNIFORM_UPPER,len=1 : U+ab53 ('ꭓ') + 0x055b8050, // RK_UNIFORM_UPPER,len=80 : U+ab70..=U+abbfU+ab70..=U+abbf ('ꭰ'..='ꮿ') + 0x07d80007, // RK_UNIFORM_UPPER,len=7 : U+fb00..=U+fb06U+fb00..=U+fb06 ('ff'..='st') + 0x07d89805, // RK_UNIFORM_UPPER,len=5 : U+fb13..=U+fb17U+fb13..=U+fb17 ('ﬓ'..='ﬗ') + 0x07f9091a, // RK_UNIFORM_LOWER,len=26 : U+ff21..=U+ff3aU+ff21..=U+ff3a ('A'..='Z') + 0x07fa081a, // RK_UNIFORM_UPPER,len=26 : U+ff41..=U+ff5aU+ff41..=U+ff5a ('a'..='z') + 0x08200128, // RK_UNIFORM_LOWER,len=40 : U+10400..=U+10427U+10400..=U+10427 ('𐐀'..='𐐧') + 0x08214028, // RK_UNIFORM_UPPER,len=40 : U+10428..=U+1044fU+10428..=U+1044f ('𐐨'..='𐑏') + 0x08258124, // RK_UNIFORM_LOWER,len=36 : U+104b0..=U+104d3U+104b0..=U+104d3 ('𐒰'..='𐓓') + 0x0826c024, // RK_UNIFORM_UPPER,len=36 : U+104d8..=U+104fbU+104d8..=U+104fb ('𐓘'..='𐓻') + 0x082b810b, // RK_UNIFORM_LOWER,len=11 : U+10570..=U+1057aU+10570..=U+1057a ('𐕰'..='𐕺') + 0x082be10f, // RK_UNIFORM_LOWER,len=15 : U+1057c..=U+1058aU+1057c..=U+1058a ('𐕼'..='𐖊') + 0x082c6107, // RK_UNIFORM_LOWER,len=7 : U+1058c..=U+10592U+1058c..=U+10592 ('𐖌'..='𐖒') + 0x082ca102, // RK_UNIFORM_LOWER,len=2 : U+10594..=U+10595U+10594..=U+10595 ('𐖔'..='𐖕') + 0x082cb80b, // RK_UNIFORM_UPPER,len=11 : U+10597..=U+105a1U+10597..=U+105a1 ('𐖗'..='𐖡') + 0x082d180f, // RK_UNIFORM_UPPER,len=15 : U+105a3..=U+105b1U+105a3..=U+105b1 ('𐖣'..='𐖱') + 0x082d9807, // RK_UNIFORM_UPPER,len=7 : U+105b3..=U+105b9U+105b3..=U+105b9 ('𐖳'..='𐖹') + 0x082dd802, // RK_UNIFORM_UPPER,len=2 : U+105bb..=U+105bcU+105bb..=U+105bc ('𐖻'..='𐖼') + 0x08640133, // RK_UNIFORM_LOWER,len=51 : U+10c80..=U+10cb2U+10c80..=U+10cb2 ('𐲀'..='𐲲') + 0x08660033, // 
RK_UNIFORM_UPPER,len=51 : U+10cc0..=U+10cf2U+10cc0..=U+10cf2 ('𐳀'..='𐳲') + 0x08c50120, // RK_UNIFORM_LOWER,len=32 : U+118a0..=U+118bfU+118a0..=U+118bf ('𑢠'..='𑢿') + 0x08c60020, // RK_UNIFORM_UPPER,len=32 : U+118c0..=U+118dfU+118c0..=U+118df ('𑣀'..='𑣟') + 0x0b720120, // RK_UNIFORM_LOWER,len=32 : U+16e40..=U+16e5fU+16e40..=U+16e5f ('𖹀'..='𖹟') + 0x0b730020, // RK_UNIFORM_UPPER,len=32 : U+16e60..=U+16e7fU+16e60..=U+16e7f ('𖹠'..='𖹿') + 0x0f480122, // RK_UNIFORM_LOWER,len=34 : U+1e900..=U+1e921U+1e900..=U+1e921 ('𞤀'..='𞤡') + 0x0f491022, // RK_UNIFORM_UPPER,len=34 : U+1e922..=U+1e943U+1e922..=U+1e943 ('𞤢'..='𞥃') +]; +pub(super) type ChangesWhenTableType = [u32; 225]; + +pub(super) const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0); diff --git a/src/lib.rs b/src/lib.rs index 97f998b..d054b4e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -178,7 +178,7 @@ impl<'s> CowUtils<'s> for &'s str { // but it wouldn't be particularly clean, so for now just check // if the string contains any uppercase char and let // `str::to_lowercase` rescan it again. 
- if self.chars().any(changes_when_lowercased) { + if self.chars().any(case::changes_when_lowercased) { Cow::Owned(self.to_lowercase()) } else { Cow::Borrowed(self) @@ -222,7 +222,7 @@ impl<'s> CowUtils<'s> for &'s str { /// assert_matches!("ᾈ".cow_to_uppercase(), Cow::Owned(s) if s == "ἈΙ"); /// ``` fn cow_to_uppercase(self) -> Self::Output { - match self.find(changes_when_uppercased) { + match self.find(case::changes_when_uppercased) { Some(pos) => { let mut output = String::with_capacity(self.len()); // We already know position of the first lowercase char, @@ -240,10 +240,4 @@ impl<'s> CowUtils<'s> for &'s str { } } -fn changes_when_lowercased(c: char) -> bool { - !core::iter::once(c).eq(c.to_lowercase()) -} - -fn changes_when_uppercased(c: char) -> bool { - !core::iter::once(c).eq(c.to_uppercase()) -} +mod case; diff --git a/src/uniprop.rs b/src/uniprop.rs new file mode 100644 index 0000000..1bcbd67 --- /dev/null +++ b/src/uniprop.rs @@ -0,0 +1,235 @@ +// This file is automatically generated. Do not edit by hand. +// +// 225 ranges / 900 bytes. 
per-rangetype stats: +// - RK_UNIFORM_UPPER, ranges=85, chars=914 +// - RK_UNIFORM_LOWER, ranges=65, chars=815 +// - RK_ALT_UPPER_LOWER, ranges=20, chars=378 +// - RK_ALT_LOWER_UPPER, ranges=45, chars=737 +// - RK_UNIFORM_BOTH, ranges=10, chars=31 +pub(super) const CHANGES_WHEN_LOOKUP_TAB: &[u32; 225] = &[ + 0x0005a801, // RK_UNIFORM_UPPER,len=1 : U+00b5 ('µ') + 0x00060117, // RK_UNIFORM_LOWER,len=23 : U+00c0..=U+00d6U+00c0..=U+00d6 ('À'..='Ö') + 0x0006c107, // RK_UNIFORM_LOWER,len=7 : U+00d8..=U+00deU+00d8..=U+00de ('Ø'..='Þ') + 0x0006f818, // RK_UNIFORM_UPPER,len=24 : U+00df..=U+00f6U+00df..=U+00f6 ('ß'..='ö') + 0x0007c008, // RK_UNIFORM_UPPER,len=8 : U+00f8..=U+00ffU+00f8..=U+00ff ('ø'..='ÿ') + 0x00080338, // RK_ALT_LOWER_UPPER,len=56 : U+0100..=U+0137U+0100..=U+0137 ('Ā'..='ķ') + 0x0009cb10, // RK_ALT_LOWER_UPPER,len=16 : U+0139..=U+0148U+0139..=U+0148 ('Ĺ'..='ň') + 0x000a4a30, // RK_ALT_UPPER_LOWER,len=48 : U+0149..=U+0178U+0149..=U+0178 ('ʼn'..='Ÿ') + 0x000bcb06, // RK_ALT_LOWER_UPPER,len=6 : U+0179..=U+017eU+0179..=U+017e ('Ź'..='ž') + 0x000bf802, // RK_UNIFORM_UPPER,len=2 : U+017f..=U+0180U+017f..=U+0180 ('ſ'..='ƀ') + 0x000c0902, // RK_UNIFORM_LOWER,len=2 : U+0181..=U+0182U+0181..=U+0182 ('Ɓ'..='Ƃ') + 0x000c1a04, // RK_ALT_UPPER_LOWER,len=4 : U+0183..=U+0186U+0183..=U+0186 ('ƃ'..='Ɔ') + 0x000c3b03, // RK_ALT_LOWER_UPPER,len=3 : U+0187..=U+0189U+0187..=U+0189 ('Ƈ'..='Ɖ') + 0x000c5102, // RK_UNIFORM_LOWER,len=2 : U+018a..=U+018bU+018a..=U+018b ('Ɗ'..='Ƌ') + 0x000c6001, // RK_UNIFORM_UPPER,len=1 : U+018c ('ƌ') + 0x000c7104, // RK_UNIFORM_LOWER,len=4 : U+018e..=U+0191U+018e..=U+0191 ('Ǝ'..='Ƒ') + 0x000c9202, // RK_ALT_UPPER_LOWER,len=2 : U+0192..=U+0193U+0192..=U+0193 ('ƒ'..='Ɠ') + 0x000ca303, // RK_ALT_LOWER_UPPER,len=3 : U+0194..=U+0196U+0194..=U+0196 ('Ɣ'..='Ɩ') + 0x000cb902, // RK_UNIFORM_LOWER,len=2 : U+0197..=U+0198U+0197..=U+0198 ('Ɨ'..='Ƙ') + 0x000cc802, // RK_UNIFORM_UPPER,len=2 : U+0199..=U+019aU+0199..=U+019a ('ƙ'..='ƚ') + 0x000ce102, // 
RK_UNIFORM_LOWER,len=2 : U+019c..=U+019dU+019c..=U+019d ('Ɯ'..='Ɲ') + 0x000cf202, // RK_ALT_UPPER_LOWER,len=2 : U+019e..=U+019fU+019e..=U+019f ('ƞ'..='Ɵ') + 0x000d0307, // RK_ALT_LOWER_UPPER,len=7 : U+01a0..=U+01a6U+01a0..=U+01a6 ('Ơ'..='Ʀ') + 0x000d3b03, // RK_ALT_LOWER_UPPER,len=3 : U+01a7..=U+01a9U+01a7..=U+01a9 ('Ƨ'..='Ʃ') + 0x000d6303, // RK_ALT_LOWER_UPPER,len=3 : U+01ac..=U+01aeU+01ac..=U+01ae ('Ƭ'..='Ʈ') + 0x000d7b03, // RK_ALT_LOWER_UPPER,len=3 : U+01af..=U+01b1U+01af..=U+01b1 ('Ư'..='Ʊ') + 0x000d9102, // RK_UNIFORM_LOWER,len=2 : U+01b2..=U+01b3U+01b2..=U+01b3 ('Ʋ'..='Ƴ') + 0x000da204, // RK_ALT_UPPER_LOWER,len=4 : U+01b4..=U+01b7U+01b4..=U+01b7 ('ƴ'..='Ʒ') + 0x000dc302, // RK_ALT_LOWER_UPPER,len=2 : U+01b8..=U+01b9U+01b8..=U+01b9 ('Ƹ'..='ƹ') + 0x000de302, // RK_ALT_LOWER_UPPER,len=2 : U+01bc..=U+01bdU+01bc..=U+01bd ('Ƽ'..='ƽ') + 0x000df801, // RK_UNIFORM_UPPER,len=1 : U+01bf ('ƿ') + 0x000e2101, // RK_UNIFORM_LOWER,len=1 : U+01c4 ('DŽ') + 0x000e2c01, // RK_UNIFORM_BOTH,len=1 : U+01c5 ('Dž') + 0x000e3202, // RK_ALT_UPPER_LOWER,len=2 : U+01c6..=U+01c7U+01c6..=U+01c7 ('dž'..='LJ') + 0x000e4401, // RK_UNIFORM_BOTH,len=1 : U+01c8 ('Lj') + 0x000e4a02, // RK_ALT_UPPER_LOWER,len=2 : U+01c9..=U+01caU+01c9..=U+01ca ('lj'..='NJ') + 0x000e5c01, // RK_UNIFORM_BOTH,len=1 : U+01cb ('Nj') + 0x000e6211, // RK_ALT_UPPER_LOWER,len=17 : U+01cc..=U+01dcU+01cc..=U+01dc ('nj'..='ǜ') + 0x000eea13, // RK_ALT_UPPER_LOWER,len=19 : U+01dd..=U+01efU+01dd..=U+01ef ('ǝ'..='ǯ') + 0x000f8202, // RK_ALT_UPPER_LOWER,len=2 : U+01f0..=U+01f1U+01f0..=U+01f1 ('ǰ'..='DZ') + 0x000f9401, // RK_UNIFORM_BOTH,len=1 : U+01f2 ('Dz') + 0x000f9a04, // RK_ALT_UPPER_LOWER,len=4 : U+01f3..=U+01f6U+01f3..=U+01f6 ('dz'..='Ƕ') + 0x000fb902, // RK_UNIFORM_LOWER,len=2 : U+01f7..=U+01f8U+01f7..=U+01f8 ('Ƿ'..='Ǹ') + 0x000fca28, // RK_ALT_UPPER_LOWER,len=40 : U+01f9..=U+0220U+01f9..=U+0220 ('ǹ'..='Ƞ') + 0x00111312, // RK_ALT_LOWER_UPPER,len=18 : U+0222..=U+0233U+0222..=U+0233 ('Ȣ'..='ȳ') + 0x0011d102, // 
RK_UNIFORM_LOWER,len=2 : U+023a..=U+023bU+023a..=U+023b ('Ⱥ'..='Ȼ') + 0x0011e202, // RK_ALT_UPPER_LOWER,len=2 : U+023c..=U+023dU+023c..=U+023d ('ȼ'..='Ƚ') + 0x0011f302, // RK_ALT_LOWER_UPPER,len=2 : U+023e..=U+023fU+023e..=U+023f ('Ⱦ'..='ȿ') + 0x00120204, // RK_ALT_UPPER_LOWER,len=4 : U+0240..=U+0243U+0240..=U+0243 ('ɀ'..='Ƀ') + 0x00122103, // RK_UNIFORM_LOWER,len=3 : U+0244..=U+0246U+0244..=U+0246 ('Ʉ'..='Ɇ') + 0x00123a09, // RK_ALT_UPPER_LOWER,len=9 : U+0247..=U+024fU+0247..=U+024f ('ɇ'..='ɏ') + 0x00128005, // RK_UNIFORM_UPPER,len=5 : U+0250..=U+0254U+0250..=U+0254 ('ɐ'..='ɔ') + 0x0012b002, // RK_UNIFORM_UPPER,len=2 : U+0256..=U+0257U+0256..=U+0257 ('ɖ'..='ɗ') + 0x0012c801, // RK_UNIFORM_UPPER,len=1 : U+0259 ('ə') + 0x0012d802, // RK_UNIFORM_UPPER,len=2 : U+025b..=U+025cU+025b..=U+025c ('ɛ'..='ɜ') + 0x00130002, // RK_UNIFORM_UPPER,len=2 : U+0260..=U+0261U+0260..=U+0261 ('ɠ'..='ɡ') + 0x00131801, // RK_UNIFORM_UPPER,len=1 : U+0263 ('ɣ') + 0x00132802, // RK_UNIFORM_UPPER,len=2 : U+0265..=U+0266U+0265..=U+0266 ('ɥ'..='ɦ') + 0x00134005, // RK_UNIFORM_UPPER,len=5 : U+0268..=U+026cU+0268..=U+026c ('ɨ'..='ɬ') + 0x00137801, // RK_UNIFORM_UPPER,len=1 : U+026f ('ɯ') + 0x00138802, // RK_UNIFORM_UPPER,len=2 : U+0271..=U+0272U+0271..=U+0272 ('ɱ'..='ɲ') + 0x0013a801, // RK_UNIFORM_UPPER,len=1 : U+0275 ('ɵ') + 0x0013e801, // RK_UNIFORM_UPPER,len=1 : U+027d ('ɽ') + 0x00140001, // RK_UNIFORM_UPPER,len=1 : U+0280 ('ʀ') + 0x00141002, // RK_UNIFORM_UPPER,len=2 : U+0282..=U+0283U+0282..=U+0283 ('ʂ'..='ʃ') + 0x00143806, // RK_UNIFORM_UPPER,len=6 : U+0287..=U+028cU+0287..=U+028c ('ʇ'..='ʌ') + 0x00149001, // RK_UNIFORM_UPPER,len=1 : U+0292 ('ʒ') + 0x0014e802, // RK_UNIFORM_UPPER,len=2 : U+029d..=U+029eU+029d..=U+029e ('ʝ'..='ʞ') + 0x001a2801, // RK_UNIFORM_UPPER,len=1 : U+0345 ('\u{345}') + 0x001b8304, // RK_ALT_LOWER_UPPER,len=4 : U+0370..=U+0373U+0370..=U+0373 ('Ͱ'..='ͳ') + 0x001bb302, // RK_ALT_LOWER_UPPER,len=2 : U+0376..=U+0377U+0376..=U+0377 ('Ͷ'..='ͷ') + 0x001bd803, // 
RK_UNIFORM_UPPER,len=3 : U+037b..=U+037dU+037b..=U+037d ('ͻ'..='ͽ') + 0x001bf901, // RK_UNIFORM_LOWER,len=1 : U+037f ('Ϳ') + 0x001c3101, // RK_UNIFORM_LOWER,len=1 : U+0386 ('Ά') + 0x001c4103, // RK_UNIFORM_LOWER,len=3 : U+0388..=U+038aU+0388..=U+038a ('Έ'..='Ί') + 0x001c6101, // RK_UNIFORM_LOWER,len=1 : U+038c ('Ό') + 0x001c7102, // RK_UNIFORM_LOWER,len=2 : U+038e..=U+038fU+038e..=U+038f ('Ύ'..='Ώ') + 0x001c8202, // RK_ALT_UPPER_LOWER,len=2 : U+0390..=U+0391U+0390..=U+0391 ('ΐ'..='Α') + 0x001c9110, // RK_UNIFORM_LOWER,len=16 : U+0392..=U+03a1U+0392..=U+03a1 ('Β'..='Ρ') + 0x001d1909, // RK_UNIFORM_LOWER,len=9 : U+03a3..=U+03abU+03a3..=U+03ab ('Σ'..='Ϋ') + 0x001d6023, // RK_UNIFORM_UPPER,len=35 : U+03ac..=U+03ceU+03ac..=U+03ce ('ά'..='ώ') + 0x001e7b02, // RK_ALT_LOWER_UPPER,len=2 : U+03cf..=U+03d0U+03cf..=U+03d0 ('Ϗ'..='ϐ') + 0x001e8801, // RK_UNIFORM_UPPER,len=1 : U+03d1 ('ϑ') + 0x001ea803, // RK_UNIFORM_UPPER,len=3 : U+03d5..=U+03d7U+03d5..=U+03d7 ('ϕ'..='ϗ') + 0x001ec318, // RK_ALT_LOWER_UPPER,len=24 : U+03d8..=U+03efU+03d8..=U+03ef ('Ϙ'..='ϯ') + 0x001f8004, // RK_UNIFORM_UPPER,len=4 : U+03f0..=U+03f3U+03f0..=U+03f3 ('ϰ'..='ϳ') + 0x001fa302, // RK_ALT_LOWER_UPPER,len=2 : U+03f4..=U+03f5U+03f4..=U+03f5 ('ϴ'..='ϵ') + 0x001fbb03, // RK_ALT_LOWER_UPPER,len=3 : U+03f7..=U+03f9U+03f7..=U+03f9 ('Ϸ'..='Ϲ') + 0x001fd302, // RK_ALT_LOWER_UPPER,len=2 : U+03fa..=U+03fbU+03fa..=U+03fb ('Ϻ'..='ϻ') + 0x001fe933, // RK_UNIFORM_LOWER,len=51 : U+03fd..=U+042fU+03fd..=U+042f ('Ͻ'..='Я') + 0x00218030, // RK_UNIFORM_UPPER,len=48 : U+0430..=U+045fU+0430..=U+045f ('а'..='џ') + 0x00230322, // RK_ALT_LOWER_UPPER,len=34 : U+0460..=U+0481U+0460..=U+0481 ('Ѡ'..='ҁ') + 0x00245337, // RK_ALT_LOWER_UPPER,len=55 : U+048a..=U+04c0U+048a..=U+04c0 ('Ҋ'..='Ӏ') + 0x00260b0e, // RK_ALT_LOWER_UPPER,len=14 : U+04c1..=U+04ceU+04c1..=U+04ce ('Ӂ'..='ӎ') + 0x00267a61, // RK_ALT_UPPER_LOWER,len=97 : U+04cf..=U+052fU+04cf..=U+052f ('ӏ'..='ԯ') + 0x00298926, // RK_UNIFORM_LOWER,len=38 : 
U+0531..=U+0556U+0531..=U+0556 ('Ա'..='Ֆ') + 0x002b0827, // RK_UNIFORM_UPPER,len=39 : U+0561..=U+0587U+0561..=U+0587 ('ա'..='և') + 0x00850126, // RK_UNIFORM_LOWER,len=38 : U+10a0..=U+10c5U+10a0..=U+10c5 ('Ⴀ'..='Ⴥ') + 0x00863901, // RK_UNIFORM_LOWER,len=1 : U+10c7 ('Ⴧ') + 0x00866901, // RK_UNIFORM_LOWER,len=1 : U+10cd ('Ⴭ') + 0x0086802b, // RK_UNIFORM_UPPER,len=43 : U+10d0..=U+10faU+10d0..=U+10fa ('ა'..='ჺ') + 0x0087e803, // RK_UNIFORM_UPPER,len=3 : U+10fd..=U+10ffU+10fd..=U+10ff ('ჽ'..='ჿ') + 0x009d0156, // RK_UNIFORM_LOWER,len=86 : U+13a0..=U+13f5U+13a0..=U+13f5 ('Ꭰ'..='Ᏽ') + 0x009fc006, // RK_UNIFORM_UPPER,len=6 : U+13f8..=U+13fdU+13f8..=U+13fd ('ᏸ'..='ᏽ') + 0x00e40009, // RK_UNIFORM_UPPER,len=9 : U+1c80..=U+1c88U+1c80..=U+1c88 ('ᲀ'..='ᲈ') + 0x00e4812b, // RK_UNIFORM_LOWER,len=43 : U+1c90..=U+1cbaU+1c90..=U+1cba ('Ა'..='Ჺ') + 0x00e5e903, // RK_UNIFORM_LOWER,len=3 : U+1cbd..=U+1cbfU+1cbd..=U+1cbf ('Ჽ'..='Ჿ') + 0x00ebc801, // RK_UNIFORM_UPPER,len=1 : U+1d79 ('ᵹ') + 0x00ebe801, // RK_UNIFORM_UPPER,len=1 : U+1d7d ('ᵽ') + 0x00ec7001, // RK_UNIFORM_UPPER,len=1 : U+1d8e ('ᶎ') + 0x00f00396, // RK_ALT_LOWER_UPPER,len=150 : U+1e00..=U+1e95U+1e00..=U+1e95 ('Ḁ'..='ẕ') + 0x00f4b006, // RK_UNIFORM_UPPER,len=6 : U+1e96..=U+1e9bU+1e96..=U+1e9b ('ẖ'..='ẛ') + 0x00f4f101, // RK_UNIFORM_LOWER,len=1 : U+1e9e ('ẞ') + 0x00f50360, // RK_ALT_LOWER_UPPER,len=96 : U+1ea0..=U+1effU+1ea0..=U+1eff ('Ạ'..='ỿ') + 0x00f80008, // RK_UNIFORM_UPPER,len=8 : U+1f00..=U+1f07U+1f00..=U+1f07 ('ἀ'..='ἇ') + 0x00f84108, // RK_UNIFORM_LOWER,len=8 : U+1f08..=U+1f0fU+1f08..=U+1f0f ('Ἀ'..='Ἇ') + 0x00f88006, // RK_UNIFORM_UPPER,len=6 : U+1f10..=U+1f15U+1f10..=U+1f15 ('ἐ'..='ἕ') + 0x00f8c106, // RK_UNIFORM_LOWER,len=6 : U+1f18..=U+1f1dU+1f18..=U+1f1d ('Ἐ'..='Ἕ') + 0x00f90008, // RK_UNIFORM_UPPER,len=8 : U+1f20..=U+1f27U+1f20..=U+1f27 ('ἠ'..='ἧ') + 0x00f94108, // RK_UNIFORM_LOWER,len=8 : U+1f28..=U+1f2fU+1f28..=U+1f2f ('Ἠ'..='Ἧ') + 0x00f98008, // RK_UNIFORM_UPPER,len=8 : U+1f30..=U+1f37U+1f30..=U+1f37 ('ἰ'..='ἷ') 
+ 0x00f9c108, // RK_UNIFORM_LOWER,len=8 : U+1f38..=U+1f3fU+1f38..=U+1f3f ('Ἰ'..='Ἷ') + 0x00fa0006, // RK_UNIFORM_UPPER,len=6 : U+1f40..=U+1f45U+1f40..=U+1f45 ('ὀ'..='ὅ') + 0x00fa4106, // RK_UNIFORM_LOWER,len=6 : U+1f48..=U+1f4dU+1f48..=U+1f4d ('Ὀ'..='Ὅ') + 0x00fa8008, // RK_UNIFORM_UPPER,len=8 : U+1f50..=U+1f57U+1f50..=U+1f57 ('ὐ'..='ὗ') + 0x00fac901, // RK_UNIFORM_LOWER,len=1 : U+1f59 ('Ὑ') + 0x00fad901, // RK_UNIFORM_LOWER,len=1 : U+1f5b ('Ὓ') + 0x00fae901, // RK_UNIFORM_LOWER,len=1 : U+1f5d ('Ὕ') + 0x00fafb02, // RK_ALT_LOWER_UPPER,len=2 : U+1f5f..=U+1f60U+1f5f..=U+1f60 ('Ὗ'..='ὠ') + 0x00fb0807, // RK_UNIFORM_UPPER,len=7 : U+1f61..=U+1f67U+1f61..=U+1f67 ('ὡ'..='ὧ') + 0x00fb4108, // RK_UNIFORM_LOWER,len=8 : U+1f68..=U+1f6fU+1f68..=U+1f6f ('Ὠ'..='Ὧ') + 0x00fb800e, // RK_UNIFORM_UPPER,len=14 : U+1f70..=U+1f7dU+1f70..=U+1f7d ('ὰ'..='ώ') + 0x00fc0008, // RK_UNIFORM_UPPER,len=8 : U+1f80..=U+1f87U+1f80..=U+1f87 ('ᾀ'..='ᾇ') + 0x00fc4408, // RK_UNIFORM_BOTH,len=8 : U+1f88..=U+1f8fU+1f88..=U+1f8f ('ᾈ'..='ᾏ') + 0x00fc8008, // RK_UNIFORM_UPPER,len=8 : U+1f90..=U+1f97U+1f90..=U+1f97 ('ᾐ'..='ᾗ') + 0x00fcc408, // RK_UNIFORM_BOTH,len=8 : U+1f98..=U+1f9fU+1f98..=U+1f9f ('ᾘ'..='ᾟ') + 0x00fd0008, // RK_UNIFORM_UPPER,len=8 : U+1fa0..=U+1fa7U+1fa0..=U+1fa7 ('ᾠ'..='ᾧ') + 0x00fd4408, // RK_UNIFORM_BOTH,len=8 : U+1fa8..=U+1fafU+1fa8..=U+1faf ('ᾨ'..='ᾯ') + 0x00fd8005, // RK_UNIFORM_UPPER,len=5 : U+1fb0..=U+1fb4U+1fb0..=U+1fb4 ('ᾰ'..='ᾴ') + 0x00fdb002, // RK_UNIFORM_UPPER,len=2 : U+1fb6..=U+1fb7U+1fb6..=U+1fb7 ('ᾶ'..='ᾷ') + 0x00fdc104, // RK_UNIFORM_LOWER,len=4 : U+1fb8..=U+1fbbU+1fb8..=U+1fbb ('Ᾰ'..='Ά') + 0x00fde401, // RK_UNIFORM_BOTH,len=1 : U+1fbc ('ᾼ') + 0x00fdf001, // RK_UNIFORM_UPPER,len=1 : U+1fbe ('ι') + 0x00fe1003, // RK_UNIFORM_UPPER,len=3 : U+1fc2..=U+1fc4U+1fc2..=U+1fc4 ('ῂ'..='ῄ') + 0x00fe3002, // RK_UNIFORM_UPPER,len=2 : U+1fc6..=U+1fc7U+1fc6..=U+1fc7 ('ῆ'..='ῇ') + 0x00fe4104, // RK_UNIFORM_LOWER,len=4 : U+1fc8..=U+1fcbU+1fc8..=U+1fcb ('Ὲ'..='Ή') + 0x00fe6401, // 
RK_UNIFORM_BOTH,len=1 : U+1fcc ('ῌ') + 0x00fe8004, // RK_UNIFORM_UPPER,len=4 : U+1fd0..=U+1fd3U+1fd0..=U+1fd3 ('ῐ'..='ΐ') + 0x00feb002, // RK_UNIFORM_UPPER,len=2 : U+1fd6..=U+1fd7U+1fd6..=U+1fd7 ('ῖ'..='ῗ') + 0x00fec104, // RK_UNIFORM_LOWER,len=4 : U+1fd8..=U+1fdbU+1fd8..=U+1fdb ('Ῐ'..='Ί') + 0x00ff0008, // RK_UNIFORM_UPPER,len=8 : U+1fe0..=U+1fe7U+1fe0..=U+1fe7 ('ῠ'..='ῧ') + 0x00ff4105, // RK_UNIFORM_LOWER,len=5 : U+1fe8..=U+1fecU+1fe8..=U+1fec ('Ῠ'..='Ῥ') + 0x00ff9003, // RK_UNIFORM_UPPER,len=3 : U+1ff2..=U+1ff4U+1ff2..=U+1ff4 ('ῲ'..='ῴ') + 0x00ffb002, // RK_UNIFORM_UPPER,len=2 : U+1ff6..=U+1ff7U+1ff6..=U+1ff7 ('ῶ'..='ῷ') + 0x00ffc104, // RK_UNIFORM_LOWER,len=4 : U+1ff8..=U+1ffbU+1ff8..=U+1ffb ('Ὸ'..='Ώ') + 0x00ffe401, // RK_UNIFORM_BOTH,len=1 : U+1ffc ('ῼ') + 0x01093101, // RK_UNIFORM_LOWER,len=1 : U+2126 ('Ω') + 0x01095102, // RK_UNIFORM_LOWER,len=2 : U+212a..=U+212bU+212a..=U+212b ('K'..='Å') + 0x01099101, // RK_UNIFORM_LOWER,len=1 : U+2132 ('Ⅎ') + 0x010a7001, // RK_UNIFORM_UPPER,len=1 : U+214e ('ⅎ') + 0x010b0110, // RK_UNIFORM_LOWER,len=16 : U+2160..=U+216fU+2160..=U+216f ('Ⅰ'..='Ⅿ') + 0x010b8010, // RK_UNIFORM_UPPER,len=16 : U+2170..=U+217fU+2170..=U+217f ('ⅰ'..='ⅿ') + 0x010c1b02, // RK_ALT_LOWER_UPPER,len=2 : U+2183..=U+2184U+2183..=U+2184 ('Ↄ'..='ↄ') + 0x0125b11a, // RK_UNIFORM_LOWER,len=26 : U+24b6..=U+24cfU+24b6..=U+24cf ('Ⓐ'..='Ⓩ') + 0x0126801a, // RK_UNIFORM_UPPER,len=26 : U+24d0..=U+24e9U+24d0..=U+24e9 ('ⓐ'..='ⓩ') + 0x01600130, // RK_UNIFORM_LOWER,len=48 : U+2c00..=U+2c2fU+2c00..=U+2c2f ('Ⰰ'..='Ⱟ') + 0x01618030, // RK_UNIFORM_UPPER,len=48 : U+2c30..=U+2c5fU+2c30..=U+2c5f ('ⰰ'..='ⱟ') + 0x01630303, // RK_ALT_LOWER_UPPER,len=3 : U+2c60..=U+2c62U+2c60..=U+2c62 ('Ⱡ'..='Ɫ') + 0x01631902, // RK_UNIFORM_LOWER,len=2 : U+2c63..=U+2c64U+2c63..=U+2c64 ('Ᵽ'..='Ɽ') + 0x01632802, // RK_UNIFORM_UPPER,len=2 : U+2c65..=U+2c66U+2c65..=U+2c66 ('ⱥ'..='ⱦ') + 0x01633b07, // RK_ALT_LOWER_UPPER,len=7 : U+2c67..=U+2c6dU+2c67..=U+2c6d ('Ⱨ'..='Ɑ') + 0x01637103, // 
RK_UNIFORM_LOWER,len=3 : U+2c6e..=U+2c70U+2c6e..=U+2c70 ('Ɱ'..='Ɒ') + 0x01639302, // RK_ALT_LOWER_UPPER,len=2 : U+2c72..=U+2c73U+2c72..=U+2c73 ('Ⱳ'..='ⱳ') + 0x0163ab02, // RK_ALT_LOWER_UPPER,len=2 : U+2c75..=U+2c76U+2c75..=U+2c76 ('Ⱶ'..='ⱶ') + 0x0163f103, // RK_UNIFORM_LOWER,len=3 : U+2c7e..=U+2c80U+2c7e..=U+2c80 ('Ȿ'..='Ⲁ') + 0x01640a63, // RK_ALT_UPPER_LOWER,len=99 : U+2c81..=U+2ce3U+2c81..=U+2ce3 ('ⲁ'..='ⳣ') + 0x01675b04, // RK_ALT_LOWER_UPPER,len=4 : U+2ceb..=U+2ceeU+2ceb..=U+2cee ('Ⳬ'..='ⳮ') + 0x01679302, // RK_ALT_LOWER_UPPER,len=2 : U+2cf2..=U+2cf3U+2cf2..=U+2cf3 ('Ⳳ'..='ⳳ') + 0x01680026, // RK_UNIFORM_UPPER,len=38 : U+2d00..=U+2d25U+2d00..=U+2d25 ('ⴀ'..='ⴥ') + 0x01693801, // RK_UNIFORM_UPPER,len=1 : U+2d27 ('ⴧ') + 0x01696801, // RK_UNIFORM_UPPER,len=1 : U+2d2d ('ⴭ') + 0x0532032e, // RK_ALT_LOWER_UPPER,len=46 : U+a640..=U+a66dU+a640..=U+a66d ('Ꙁ'..='ꙭ') + 0x0534031c, // RK_ALT_LOWER_UPPER,len=28 : U+a680..=U+a69bU+a680..=U+a69b ('Ꚁ'..='ꚛ') + 0x0539130e, // RK_ALT_LOWER_UPPER,len=14 : U+a722..=U+a72fU+a722..=U+a72f ('Ꜣ'..='ꜯ') + 0x0539933e, // RK_ALT_LOWER_UPPER,len=62 : U+a732..=U+a76fU+a732..=U+a76f ('Ꜳ'..='ꝯ') + 0x053bcb05, // RK_ALT_LOWER_UPPER,len=5 : U+a779..=U+a77dU+a779..=U+a77d ('Ꝺ'..='Ᵹ') + 0x053bf30a, // RK_ALT_LOWER_UPPER,len=10 : U+a77e..=U+a787U+a77e..=U+a787 ('Ꝿ'..='ꞇ') + 0x053c5b03, // RK_ALT_LOWER_UPPER,len=3 : U+a78b..=U+a78dU+a78b..=U+a78d ('Ꞌ'..='Ɥ') + 0x053c8304, // RK_ALT_LOWER_UPPER,len=4 : U+a790..=U+a793U+a790..=U+a793 ('Ꞑ'..='ꞓ') + 0x053ca001, // RK_UNIFORM_UPPER,len=1 : U+a794 ('ꞔ') + 0x053cb315, // RK_ALT_LOWER_UPPER,len=21 : U+a796..=U+a7aaU+a796..=U+a7aa ('Ꞗ'..='Ɦ') + 0x053d5904, // RK_UNIFORM_LOWER,len=4 : U+a7ab..=U+a7aeU+a7ab..=U+a7ae ('Ɜ'..='Ɪ') + 0x053d8105, // RK_UNIFORM_LOWER,len=5 : U+a7b0..=U+a7b4U+a7b0..=U+a7b4 ('Ʞ'..='Ꞵ') + 0x053daa10, // RK_ALT_UPPER_LOWER,len=16 : U+a7b5..=U+a7c4U+a7b5..=U+a7c4 ('ꞵ'..='Ꞔ') + 0x053e2903, // RK_UNIFORM_LOWER,len=3 : U+a7c5..=U+a7c7U+a7c5..=U+a7c7 ('Ʂ'..='Ꟈ') + 0x053e4203, // 
RK_ALT_UPPER_LOWER,len=3 : U+a7c8..=U+a7caU+a7c8..=U+a7ca ('ꟈ'..='ꟊ') + 0x053e8302, // RK_ALT_LOWER_UPPER,len=2 : U+a7d0..=U+a7d1U+a7d0..=U+a7d1 ('Ꟑ'..='ꟑ') + 0x053eb304, // RK_ALT_LOWER_UPPER,len=4 : U+a7d6..=U+a7d9U+a7d6..=U+a7d9 ('Ꟗ'..='ꟙ') + 0x053fab02, // RK_ALT_LOWER_UPPER,len=2 : U+a7f5..=U+a7f6U+a7f5..=U+a7f6 ('Ꟶ'..='ꟶ') + 0x055a9801, // RK_UNIFORM_UPPER,len=1 : U+ab53 ('ꭓ') + 0x055b8050, // RK_UNIFORM_UPPER,len=80 : U+ab70..=U+abbfU+ab70..=U+abbf ('ꭰ'..='ꮿ') + 0x07d80007, // RK_UNIFORM_UPPER,len=7 : U+fb00..=U+fb06U+fb00..=U+fb06 ('ff'..='st') + 0x07d89805, // RK_UNIFORM_UPPER,len=5 : U+fb13..=U+fb17U+fb13..=U+fb17 ('ﬓ'..='ﬗ') + 0x07f9091a, // RK_UNIFORM_LOWER,len=26 : U+ff21..=U+ff3aU+ff21..=U+ff3a ('A'..='Z') + 0x07fa081a, // RK_UNIFORM_UPPER,len=26 : U+ff41..=U+ff5aU+ff41..=U+ff5a ('a'..='z') + 0x08200128, // RK_UNIFORM_LOWER,len=40 : U+10400..=U+10427U+10400..=U+10427 ('𐐀'..='𐐧') + 0x08214028, // RK_UNIFORM_UPPER,len=40 : U+10428..=U+1044fU+10428..=U+1044f ('𐐨'..='𐑏') + 0x08258124, // RK_UNIFORM_LOWER,len=36 : U+104b0..=U+104d3U+104b0..=U+104d3 ('𐒰'..='𐓓') + 0x0826c024, // RK_UNIFORM_UPPER,len=36 : U+104d8..=U+104fbU+104d8..=U+104fb ('𐓘'..='𐓻') + 0x082b810b, // RK_UNIFORM_LOWER,len=11 : U+10570..=U+1057aU+10570..=U+1057a ('𐕰'..='𐕺') + 0x082be10f, // RK_UNIFORM_LOWER,len=15 : U+1057c..=U+1058aU+1057c..=U+1058a ('𐕼'..='𐖊') + 0x082c6107, // RK_UNIFORM_LOWER,len=7 : U+1058c..=U+10592U+1058c..=U+10592 ('𐖌'..='𐖒') + 0x082ca102, // RK_UNIFORM_LOWER,len=2 : U+10594..=U+10595U+10594..=U+10595 ('𐖔'..='𐖕') + 0x082cb80b, // RK_UNIFORM_UPPER,len=11 : U+10597..=U+105a1U+10597..=U+105a1 ('𐖗'..='𐖡') + 0x082d180f, // RK_UNIFORM_UPPER,len=15 : U+105a3..=U+105b1U+105a3..=U+105b1 ('𐖣'..='𐖱') + 0x082d9807, // RK_UNIFORM_UPPER,len=7 : U+105b3..=U+105b9U+105b3..=U+105b9 ('𐖳'..='𐖹') + 0x082dd802, // RK_UNIFORM_UPPER,len=2 : U+105bb..=U+105bcU+105bb..=U+105bc ('𐖻'..='𐖼') + 0x08640133, // RK_UNIFORM_LOWER,len=51 : U+10c80..=U+10cb2U+10c80..=U+10cb2 ('𐲀'..='𐲲') + 0x08660033, // 
RK_UNIFORM_UPPER,len=51 : U+10cc0..=U+10cf2U+10cc0..=U+10cf2 ('𐳀'..='𐳲') + 0x08c50120, // RK_UNIFORM_LOWER,len=32 : U+118a0..=U+118bfU+118a0..=U+118bf ('𑢠'..='𑢿') + 0x08c60020, // RK_UNIFORM_UPPER,len=32 : U+118c0..=U+118dfU+118c0..=U+118df ('𑣀'..='𑣟') + 0x0b720120, // RK_UNIFORM_LOWER,len=32 : U+16e40..=U+16e5fU+16e40..=U+16e5f ('𖹀'..='𖹟') + 0x0b730020, // RK_UNIFORM_UPPER,len=32 : U+16e60..=U+16e7fU+16e60..=U+16e7f ('𖹠'..='𖹿') + 0x0f480122, // RK_UNIFORM_LOWER,len=34 : U+1e900..=U+1e921U+1e900..=U+1e921 ('𞤀'..='𞤡') + 0x0f491022, // RK_UNIFORM_UPPER,len=34 : U+1e922..=U+1e943U+1e922..=U+1e943 ('𞤢'..='𞥃') +]; diff --git a/tabgen/Cargo.toml b/tabgen/Cargo.toml new file mode 100644 index 0000000..e3bb523 --- /dev/null +++ b/tabgen/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "tabgen" +version = "0.1.0" +edition = "2021" +publish = false + + +[dependencies] + + diff --git a/tabgen/README.md b/tabgen/README.md new file mode 100644 index 0000000..5d6e57d --- /dev/null +++ b/tabgen/README.md @@ -0,0 +1,36 @@ +# Table generator for cow-utils-rs + +This generates tables to efficiently answer if a given character will change when uppercased/lowercased with minimal size overhead. + +Note that some characters change when uppercased even when they are not lowercase, and vice versa, so `c.is_uppercase()`/`c.is_lowercase()` does not answer this. + +What you actually want is to test for the Unicode properties `Changes_When_Uppercased` and `Changes_When_Lowercased`. The Rust stdlib doesn't let you query Unicode properties directly (especially not obscure ones like that), but you can still answer it using stdlib functionality by performing `c.to_lowercase()`/`c.to_uppercase()`, and seeing if they actually change the character. This is straightforward, and only slightly complicated by the fact that those functions return iterators rather than characters (because this conversion may produce multiple characters).
That's handled easily enough though, and you'd end up with: +```rs +fn changes_when_lowercased(c: char) -> bool { + !core::iter::once(c).eq(c.to_lowercase()) +} +fn changes_when_uppercased(c: char) -> bool { + !core::iter::once(c).eq(c.to_uppercase()) +} +``` +This works perfectly, but is somewhat slow, which is why this code exists. + +First off, ASCII is handled directly by the caller, without consulting any table. + + + +// Note: `c.to_uppercase()` and `c.to_lowercase()` return +// an `Iterator<Item = char>`, rather than a character. + + + + +The implementation works as follows: +- First, ASCII characters are answered directly with `is_ascii_uppercase()`/`is_ascii_lowercase()`. +- Then, we binary search a table of encoded ranges for the character to determine whether it falls in a range and, if so, what kind of range. + +The same table is used for both properties. + + + + diff --git a/tabgen/src/gen.rs b/tabgen/src/gen.rs new file mode 100644 index 0000000..49b8e8e --- /dev/null +++ b/tabgen/src/gen.rs @@ -0,0 +1,496 @@ +//! The basic idea is that we segment codepoints into one of a +//! few ranges: +//! +//! - Ascii (self explanatory, and handled by the caller). +//! - ChangesUpper (only changes on uppercase). +//! - ChangesLower (only changes on lowercase). +//! - ChangesEither (changes on either upper or lower). +//! - ChangesUpperLowerAlternating (every other character is upper/lower in this +//! block. May sound weird but is very common, many scripts are laid out with +//! the equivalent of `A`, `a`, `B`, `b`, etc). +//! +//! Note that the last of these sounds weird but is extremely common, and would +//! otherwise significantly bloat the table.
/// Size of the Unicode code space (`0..=0x10FFFF`). Surrogate code points are
/// included so that a code point can be used directly as an index.
const NUM_CHARS: usize = 0x110000;

/// Case-mapping facts recorded for a single code point.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharInfo {
    pub codepoint: u32,
    pub changes_when_upper: bool,
    pub changes_when_lower: bool,
}

impl CharInfo {
    /// Returns the code point as a `char`, or `None` when it is not a Unicode
    /// scalar value (i.e. a surrogate).
    #[allow(dead_code)] // diagnostic helper; nothing in the generator calls it
    pub fn try_ch(&self) -> Option<char> {
        char::from_u32(self.codepoint)
    }

    /// Returns the code point as a `char`.
    ///
    /// # Panics
    ///
    /// Panics when the code point is not a Unicode scalar value.
    #[allow(dead_code)] // diagnostic helper; nothing in the generator calls it
    pub fn ch(&self) -> char {
        char::from_u32(self.codepoint)
            .unwrap_or_else(|| panic!("0x{:X} is not a valid scalar", self.codepoint))
    }

    /// Folds the two booleans into a single [`CharCaseChanges`] value.
    pub fn case_enum(self) -> CharCaseChanges {
        match (self.changes_when_lower, self.changes_when_upper) {
            (false, false) => CharCaseChanges::Never,
            (false, true) => CharCaseChanges::UpperOnly,
            (true, false) => CharCaseChanges::LowerOnly,
            (true, true) => CharCaseChanges::Always,
        }
    }

    /// True if one of `self`/`o` changes only when uppercased and the other
    /// changes only when lowercased.
    pub fn alternates_with(&self, o: &Self) -> bool {
        self.case_enum().alternates_with(o.case_enum())
    }
}

/// Which of the two case mappings alter a code point.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CharCaseChanges {
    Never,
    UpperOnly,
    LowerOnly,
    Always,
}

impl CharCaseChanges {
    /// True if we change under exactly one of uppercase/lowercase.
    #[allow(dead_code)] // not called anywhere in this module
    pub fn is_simple_cased(self) -> bool {
        matches!(self, Self::UpperOnly | Self::LowerOnly)
    }

    /// True if `self` and `o` are the two opposite single-direction kinds
    /// (`UpperOnly` next to `LowerOnly`, in either order).
    pub fn alternates_with(self, o: Self) -> bool {
        matches!(
            (self, o),
            (Self::UpperOnly, Self::LowerOnly) | (Self::LowerOnly, Self::UpperOnly)
        )
    }
}

/// Per-code-point case-change information, indexed by code point.
pub struct CaseChangeDb {
    pub infos: Box<[CharInfo; NUM_CHARS]>,
}

impl CaseChangeDb {
    /// Looks up `c`; values past the end of the table are reported as never
    /// changing instead of panicking.
    pub fn info(&self, c: u32) -> CharInfo {
        self.infos.get(c as usize).copied().unwrap_or(CharInfo {
            codepoint: c,
            changes_when_lower: false,
            changes_when_upper: false,
        })
    }
}

/// Shape of a run of consecutive code points.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CharRangeType {
    // Uniform(Never) is not present in output
    Uniform(CharCaseChanges),
    AlternatingUpperLower,
    AlternatingLowerUpper,
}

impl CharRangeType {
    /// Numeric range kind stored in the table, or `None` for
    /// `Uniform(Never)`, which has no encoding.
    pub fn encode(self) -> Option<u32> {
        match self {
            Self::Uniform(CharCaseChanges::UpperOnly) => Some(0),
            Self::Uniform(CharCaseChanges::LowerOnly) => Some(1),
            Self::AlternatingUpperLower => Some(2),
            Self::AlternatingLowerUpper => Some(3),
            Self::Uniform(CharCaseChanges::Always) => Some(4),
            Self::Uniform(CharCaseChanges::Never) => None,
        }
    }

    /// Names of the encoded kinds, indexed by the value returned by `encode`.
    const ENCNAMES: &'static [&'static str; 5] = &[
        "RK_UNIFORM_UPPER",
        "RK_UNIFORM_LOWER",
        "RK_ALT_UPPER_LOWER",
        "RK_ALT_LOWER_UPPER",
        "RK_UNIFORM_BOTH",
    ];

    /// True for the two alternating kinds.
    pub fn is_alt(&self) -> bool {
        matches!(
            self,
            Self::AlternatingUpperLower | Self::AlternatingLowerUpper
        )
    }
}

// bottom 8 are len, then 3 kind, top 21 are char
pub const RANGE_ENCODING_LEN_BITS: u32 = 8;
pub const RANGE_ENCODING_KIND_BITS: u32 = 3;
pub const RANGE_ENCODING_CHAR_BITS: u32 = 21;

pub const RANGE_ENCODING_CHAR_SHIFT: u32 = RANGE_ENCODING_LEN_BITS + RANGE_ENCODING_KIND_BITS;
pub const RANGE_ENCODING_KIND_SHIFT: u32 = RANGE_ENCODING_LEN_BITS;

// bigger and we split it up
pub const RANGE_ENCODING_MAX_LEN: usize = 254;

const RANGE_ENCODING_LEN_MASK: u32 = (1 << RANGE_ENCODING_LEN_BITS) - 1;
const RANGE_ENCODING_KIND_MASK: u32 = (1 << RANGE_ENCODING_KIND_BITS) - 1;
const RANGE_ENCODING_CHAR_MASK: u32 = (1 << RANGE_ENCODING_CHAR_BITS) - 1;

// The three fields must fill a `u32` exactly, and the longest range must fit
// in the length field.
const _: () = assert!(
    RANGE_ENCODING_LEN_BITS + RANGE_ENCODING_KIND_BITS + RANGE_ENCODING_CHAR_BITS == u32::BITS
);
const _: () = assert!(RANGE_ENCODING_MAX_LEN < (1 << RANGE_ENCODING_LEN_BITS));

/// Packs `(kind, start, len)` into one table entry.
///
/// In debug builds every field is checked against its width and the result is
/// decoded again to confirm that it round-trips.
fn encode_direct(kind: u32, start: u32, len: u32) -> u32 {
    debug_assert_eq!(kind & !RANGE_ENCODING_KIND_MASK, 0);
    debug_assert_eq!(start & !RANGE_ENCODING_CHAR_MASK, 0);
    debug_assert_eq!(len & !RANGE_ENCODING_LEN_MASK, 0);
    debug_assert!(len <= RANGE_ENCODING_MAX_LEN as u32);
    debug_assert!((kind as usize) < CharRangeType::ENCNAMES.len());
    debug_assert!(char::from_u32(start).is_some());
    let result =
        len | (kind << RANGE_ENCODING_KIND_SHIFT) | (start << RANGE_ENCODING_CHAR_SHIFT);
    debug_assert_eq!(decode_direct(result), (kind, start, len));
    result
}

/// Unpacks a table entry into `(kind, start, len)`.
fn decode_direct(enc: u32) -> (u32, u32, u32) {
    let len = enc & RANGE_ENCODING_LEN_MASK;
    let ch = enc >> RANGE_ENCODING_CHAR_SHIFT;
    let kind = (enc >> RANGE_ENCODING_KIND_SHIFT) & RANGE_ENCODING_KIND_MASK;
    (kind, ch, len)
}

/// A run of `length` consecutive code points starting at `start_char` that all
/// follow the pattern described by `kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FoundCharRange {
    pub start_char: u32,
    pub length: usize,
    pub kind: CharRangeType,
}

impl std::fmt::Display for FoundCharRange {
    /// Formats the range as it appears in the comments of the generated
    /// table, e.g. `RK_UNIFORM_LOWER,len=2 : U+023a..=U+023b ('Ⱥ'..='Ȼ')`.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let rk = self
            .kind
            .encode()
            .map(|n| CharRangeType::ENCNAMES[n as usize])
            .unwrap_or("RK_NEVER");

        match self.length {
            0 => write!(f, "{rk},len=0 : (empty)"),
            1 => {
                write!(f, "{rk},len=1 : U+{:04x}", self.start_char)?;
                if let Some(c) = char::from_u32(self.start_char) {
                    write!(f, " ({c:?})")?;
                }
                Ok(())
            }
            len => {
                let last = self.end() - 1;
                // The code point span is written exactly once.
                write!(
                    f,
                    "{rk},len={len} : U+{:04x}..=U+{:04x}",
                    self.start_char, last
                )?;
                if let (Some(c), Some(e)) = (char::from_u32(self.start_char), char::from_u32(last))
                {
                    write!(f, " ({:?}..={:?})", c, e)?;
                }
                Ok(())
            }
        }
    }
}

impl FoundCharRange {
    /// One past the last code point of the range.
    pub fn end(&self) -> u32 {
        self.start_char + (self.length as u32)
    }

    /// Packs the range into a table entry.
    ///
    /// # Panics
    ///
    /// Panics for `Uniform(Never)` ranges, which have no encoding.
    pub fn encode(&self) -> u32 {
        let kind = self
            .kind
            .encode()
            .expect("Uniform(Never) ranges are filtered out before encoding");
        encode_direct(kind, self.start_char, self.length as u32)
    }

    /// Builds a uniform range of `len` code points starting at `c`, taking the
    /// kind from the first code point.
    pub fn new_uniform_range(db: &CaseChangeDb, c: u32, len: usize) -> Self {
        let kind = db.info(c).case_enum();
        debug_assert!(len != 0);
        debug_assert!(db.infos[c as usize..][..len]
            .iter()
            .all(|info| info.case_enum() == kind));
        Self {
            start_char: c,
            length: len,
            kind: CharRangeType::Uniform(kind),
        }
    }

    /// Builds an alternating range of `len` code points starting at `c`; the
    /// first code point decides which of the two alternating kinds is used.
    ///
    /// # Panics
    ///
    /// Panics when the first code point changes under both mappings or under
    /// neither.
    pub fn new_alt_range(db: &CaseChangeDb, c: u32, len: usize) -> Self {
        let first = db.info(c).case_enum();
        debug_assert!(len >= 2);

        if cfg!(debug_assertions) {
            let slice = &db.infos[c as usize..][..len];
            let pattern = if first == CharCaseChanges::LowerOnly {
                [CharCaseChanges::LowerOnly, CharCaseChanges::UpperOnly]
            } else {
                [CharCaseChanges::UpperOnly, CharCaseChanges::LowerOnly]
            };
            let expected = pattern.iter().copied().cycle().take(len);
            debug_assert!(
                slice.iter().map(|info| info.case_enum()).eq(expected),
                "{:?}",
                slice
                    .iter()
                    .map(|info| info.case_enum())
                    .collect::<Vec<_>>()
            );
        }

        Self {
            start_char: c,
            length: len,
            kind: match first {
                CharCaseChanges::LowerOnly => CharRangeType::AlternatingLowerUpper,
                CharCaseChanges::UpperOnly => CharRangeType::AlternatingUpperLower,
                CharCaseChanges::Always | CharCaseChanges::Never => unreachable!(),
            },
        }
    }

    /// Splits the range into consecutive pieces of at most `lenmax` code
    /// points.
    ///
    /// A range shorter than `lenmax` is returned unchanged. For alternating
    /// ranges the kind of every piece is re-derived from its own first code
    /// point, and a trailing piece of length one becomes a uniform range.
    ///
    /// # Panics
    ///
    /// Panics when `lenmax` is zero.
    pub fn split_into_chunks(&self, db: &CaseChangeDb, lenmax: usize) -> Vec<Self> {
        if self.length < lenmax {
            return vec![*self];
        }
        let end = self.end();
        (self.start_char..end)
            .step_by(lenmax)
            .map(|c0| {
                let new_len = (end - c0).min(lenmax as u32);
                let kind = if self.kind.is_alt() {
                    match db.info(c0).case_enum() {
                        CharCaseChanges::Always | CharCaseChanges::Never => unreachable!(),
                        only if new_len == 1 => CharRangeType::Uniform(only),
                        CharCaseChanges::LowerOnly => CharRangeType::AlternatingLowerUpper,
                        CharCaseChanges::UpperOnly => CharRangeType::AlternatingUpperLower,
                    }
                } else {
                    self.kind
                };
                Self {
                    start_char: c0,
                    length: new_len as usize,
                    kind,
                }
            })
            .collect()
    }
}

/// Length of the leading run of `slice` in which every adjacent pair satisfies
/// `test`.
///
/// Returns 0 for an empty slice and at least 1 otherwise.
// basically `slice.group_by(test).next().map(|s| s.len()).unwrap_or_default()`,
// but group_by is unstable
fn group_len<T, F: FnMut(&T, &T) -> bool>(slice: &[T], mut test: F) -> usize {
    if slice.is_empty() {
        return 0;
    }
    1 + slice
        .windows(2)
        .take_while(|pair| test(&pair[0], &pair[1]))
        .count()
}
+ .map(|cp| { + if let Some(ch) = char::from_u32(cp) { + CharInfo { + codepoint: cp, + changes_when_lower: changes_when_lowercased(ch), + changes_when_upper: changes_when_uppercased(ch), + } + } else { + CharInfo { + codepoint: cp, + changes_when_lower: false, + changes_when_upper: false, + } + } + }) + .collect(); + CaseChangeDb { + infos: info.try_into().unwrap(), + } + } + // simple greedy approach + pub fn find_ranges(&self) -> Vec { + let mut ranges = Vec::with_capacity(1000); + let mut cur = 0; + while (cur as usize) < self.infos.len() { + let found = self.find_range_from(cur); + debug_assert_eq!(found.start_char, cur); + let end = found.end(); + ranges.push(found); + cur = end; + } + // if !raw { + ranges.retain_mut(|r| self.should_keep_range(r)); + // } + ranges + } + + fn find_range_from(&self, c: u32) -> FoundCharRange { + let first = self.infos[c as usize].case_enum(); + // let len = self.infos[c as usize..].group_by(|a, b| a.case_enum() == + // b.case_enum()).next().unwrap().len(); + let uniform_len = group_len(&self.infos[c as usize..], |a, b| { + a.case_enum() == b.case_enum() + }); + if uniform_len == 1 && self.info(c).alternates_with(&self.info(c + 1)) { + debug_assert!(matches!( + first, + CharCaseChanges::UpperOnly | CharCaseChanges::LowerOnly + )); + let mut alt_len = group_len(&self.infos[c as usize..], |a, b| a.alternates_with(b)); + if c as usize + alt_len != self.infos.len() { + // Ensure the first entry in the last pair is counted. 
+ // alt_len += 1; + } + FoundCharRange::new_alt_range(self, c, alt_len) + } else { + FoundCharRange::new_uniform_range(self, c, uniform_len) + } + } + pub fn splitify_ranges_for_encoding( + &self, + rs: &[FoundCharRange], + ) -> (Vec, Vec) { + let mut split: Vec = vec![]; + for (_i, rng) in rs.iter().enumerate() { + if rng.length <= RANGE_ENCODING_MAX_LEN { + split.push(*rng) + } else { + let chunks = rng.split_into_chunks(self, RANGE_ENCODING_MAX_LEN); + assert!(chunks.len() >= 2); + if !chunks.iter().all(|c| c.length <= RANGE_ENCODING_MAX_LEN) { + panic!("{:#?} => {:#?}", rng, chunks); + } + split.extend_from_slice(&chunks[..]); + } + // splitmap.extend(core::iter::repeat(i).take(chunks.len())); + } + let enc = split.iter().map(|r| r.encode()).collect(); + (split, enc) + } + pub fn check_encoding(&self, rs: &[FoundCharRange], table: &[u32]) { + assert_eq!(rs.len(), table.len()); + let mut reported = vec![(false, false); NUM_CHARS]; + for testc in ('\0'..=char::MAX).filter(|c| !c.is_ascii()) { + let res = self.info(testc as u32); + let range = check::find_encoded_case_range(testc, table); + let mut bad = false; + let want_no_entry = res.case_enum() == CharCaseChanges::Never; + bad |= range.is_none() != want_no_entry; + if let Some(rng) = range { + let (re_kind, re_ch, re_len) = decode_direct(rng); + debug_assert_eq!(encode_direct(re_kind, re_ch, re_len), rng); + bad |= !(re_ch..re_ch + re_len).contains(&(testc as u32)); + if let Some(i) = table.iter().position(|c| *c == rng) { + let real_range = rs[i]; + bad |= real_range.start_char != re_ch; + bad |= real_range.length != re_len as usize; + bad |= real_range.kind.encode() != Some(re_kind); + } else { + bad |= true; + } + } + } + } + + // pub fn find_ranges(&self) -> Vec { + // let mut ranges = Vec::with_capacity(1000); + // let mut cur = 0; + // while (cur as usize) < self.infos.len() { + // let found = self.find_range_from(cur); + // debug_assert_eq!(found.start_char, cur); + // let end = found.end(); + // 
ranges.push(found); + // } + + // let mut cleaned_ranges = Vec::with_capacity(ranges.len()); + // for range in ranges.iter() { + // if !self.should_keep_range(range) { + // continue; + // } + // let chunks = range.split_into_chunks(self, RANGE_ENCODING_MAX_LEN); + // cleaned_ranges.extend_from_slice(&chunks); + // } + // cleaned_ranges + // } + + pub fn should_keep_range(&self, range: &FoundCharRange) -> bool { + if range.kind == CharRangeType::Uniform(CharCaseChanges::Never) { + // empty + false + } else if range.start_char < 128 && range.end() < 128 { + // ASCII + false + } else { + true + } + } +} + +fn changes_when_lowercased(c: char) -> bool { + !core::iter::once(c).eq(c.to_lowercase()) +} + +fn changes_when_uppercased(c: char) -> bool { + !core::iter::once(c).eq(c.to_uppercase()) +} + +pub fn emit_tab(name: &str, t: &[u32], r: &[FoundCharRange]) -> String { + use std::fmt::Write; + let mut s = String::new(); + let mut range_type_stats = [(0usize, 0usize); 5]; + let _ = writeln!(s, "pub(super) const {}: &[u32; {}] = &[\n", name, t.len()); + for (&enc, &dec) in core::iter::zip(t, r) { + debug_assert!(enc == dec.encode()); + let mut comment = String::new(); + let _ = writeln!(s, " {enc:#010x}, // {dec}"); + let st = &mut range_type_stats[dec.kind.encode().unwrap() as usize]; + st.0 += 1; + st.1 += dec.length; + } + writeln!(s, "];"); + let mut comm = String::new(); + writeln!( + comm, + "// {} ranges / {} bytes. per-rangetype stats:", + t.len(), + 4 * t.len() + ); + for (i, (ranges, chars)) in range_type_stats.iter().enumerate() { + writeln!( + comm, + "// - {}, ranges={}, chars={}", + CharRangeType::ENCNAMES[i], + ranges, + chars + ); + } + format!("{comm}{s}") +} + +type ChangesWhenTableType = [u32]; + +mod check { + include!("../../src/case/table.rs"); +} diff --git a/tabgen/src/main.rs b/tabgen/src/main.rs new file mode 100644 index 0000000..ed1a4d9 --- /dev/null +++ b/tabgen/src/main.rs @@ -0,0 +1,38 @@ +//! Generates the tables we use +//! +//! 
Determines property values based on `char`'s case conversion. This avoids +//! needing to download/parse UCD, but means that it can't support a newer +//! version of Unicode than what Rust itself supports. W + +mod gen; + +fn main() { + use gen::*; + let db = CaseChangeDb::new(); + let ranges = db.find_ranges(); + let (ranges, encoded) = db.splitify_ranges_for_encoding(&ranges); + let emit = emit_tab("CHANGES_WHEN_LOOKUP_TAB", &encoded, &ranges); + eprintln!("###begin output\n{emit}\n###end output"); + use core::fmt::Write; + let mut file = String::new(); + writeln!( + file, + "// This file is automatically generated by `cow-utils-rs/tabgen`. Do not edit by hand." + ); + writeln!(file); + writeln!(file, "{}", emit); + + writeln!( + file, + "pub(super) type ChangesWhenTableType = [u32; {}];\n", + encoded.len(), + ); + let genver = core::char::UNICODE_VERSION; + writeln!( + file, + "pub(super) const UNICODE_VERSION: (u8, u8, u8) = ({}, {}, {});\n", + genver.0, genver.1, genver.2, + ); + + db.check_encoding(&ranges, &encoded); +} From 3250603ea0c44844a30aaae99b2bd4d610b55269 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Tue, 10 Oct 2023 16:27:33 -0700 Subject: [PATCH 2/2] code's a nightmare but works perfectly and much faster --- src/case/search.rs | 5 ---- src/case/table.rs | 3 +- tabgen/src/gen.rs | 5 ++-- tabgen/src/main.rs | 13 +++++---- tabgen/src/search.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 tabgen/src/search.rs diff --git a/src/case/search.rs b/src/case/search.rs index 255ea6f..2ffa887 100644 --- a/src/case/search.rs +++ b/src/case/search.rs @@ -31,11 +31,6 @@ pub(super) fn changes_when_casemapped_nonascii( } else { odd_is_lower == !odd } - // match (range_kind == RK_ALT_UPPER_LOWER, MAP_LOWER) { - // (true, true) | (false, true) => !odd, - // (true, false) | (false, false) => odd, - // _ => false, - // } } rk => { debug_assert!(false, "bad rangekind {:?}", rk); diff 
--git a/src/case/table.rs b/src/case/table.rs index 069a428..8c84389 100644 --- a/src/case/table.rs +++ b/src/case/table.rs @@ -6,7 +6,7 @@ // - RK_UNIFORM_BOTH, ranges=10, chars=31 pub(crate) const CHANGES_WHEN_LOOKUP_TAB: &[u32; 225] = &[ 0x0005a801, // RK_UNIFORM_UPPER,len=1 : U+00b5 ('µ') - 0x00060117, // RK_UNIFORM_LOWER,len=23 : U+00c0..=U+00d6U+00c0..=U+00d6 ('À'..='Ö') + 0x00060117, // RK_UNIFORM_LOWER,len=23 : U+00c0..=U+00d 6U+00c0..=U+00d6 ('À'..='Ö') 0x0006c107, // RK_UNIFORM_LOWER,len=7 : U+00d8..=U+00deU+00d8..=U+00de ('Ø'..='Þ') 0x0006f818, // RK_UNIFORM_UPPER,len=24 : U+00df..=U+00f6U+00df..=U+00f6 ('ß'..='ö') 0x0007c008, // RK_UNIFORM_UPPER,len=8 : U+00f8..=U+00ffU+00f8..=U+00ff ('ø'..='ÿ') @@ -233,4 +233,5 @@ pub(crate) const CHANGES_WHEN_LOOKUP_TAB: &[u32; 225] = &[ ]; pub(super) type ChangesWhenTableType = [u32; 225]; +#[cfg(test)] pub(super) const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0); diff --git a/tabgen/src/gen.rs b/tabgen/src/gen.rs index 49b8e8e..c516a12 100644 --- a/tabgen/src/gen.rs +++ b/tabgen/src/gen.rs @@ -491,6 +491,5 @@ pub fn emit_tab(name: &str, t: &[u32], r: &[FoundCharRange]) -> String { type ChangesWhenTableType = [u32]; -mod check { - include!("../../src/case/table.rs"); -} +#[path = "./search.rs"] +mod check; diff --git a/tabgen/src/main.rs b/tabgen/src/main.rs index ed1a4d9..9acff81 100644 --- a/tabgen/src/main.rs +++ b/tabgen/src/main.rs @@ -15,24 +15,25 @@ fn main() { eprintln!("###begin output\n{emit}\n###end output"); use core::fmt::Write; let mut file = String::new(); - writeln!( + let _ = writeln!( file, "// This file is automatically generated by `cow-utils-rs/tabgen`. Do not edit by hand." 
// FIXME: duplicate of `src/case/search.rs`; keep the two copies in sync.

// Bit layout of a table entry: length in the low 8 bits, then 3 bits of range
// kind, then the first code point of the range in the remaining high bits.
const ENTRY_LEN_MASK: u32 = 0xff;
const ENTRY_KIND_SHIFT: u32 = 8;
const ENTRY_KIND_MASK: u32 = 0x7;
const ENTRY_START_SHIFT: u32 = 11;

/// Reports whether `needle` changes under a case mapping, according to `tab`.
///
/// `MAP_LOWER = true` asks about lowercasing, `MAP_LOWER = false` about
/// uppercasing. Characters not covered by any table entry never change.
/// `tab` must be sorted by range start, as required by the binary search in
/// [`find_encoded_case_range`].
pub(crate) fn changes_when_casemapped_nonascii<const MAP_LOWER: bool>(
    needle: char,
    tab: &[u32],
) -> bool {
    const RK_UNIFORM_UPPER: u32 = 0;
    const RK_UNIFORM_LOWER: u32 = 1;
    const RK_ALT_UPPER_LOWER: u32 = 2;
    const RK_ALT_LOWER_UPPER: u32 = 3;
    const RK_UNIFORM_BOTH: u32 = 4;

    let Some(enc) = find_encoded_case_range(needle, tab) else {
        return false;
    };
    let range_st = enc >> ENTRY_START_SHIFT;
    let range_len = enc & ENTRY_LEN_MASK;
    let range_kind = (enc >> ENTRY_KIND_SHIFT) & ENTRY_KIND_MASK;
    debug_assert!(range_kind <= RK_UNIFORM_BOTH);

    match range_kind {
        RK_UNIFORM_BOTH => true,
        RK_UNIFORM_UPPER => !MAP_LOWER,
        RK_UNIFORM_LOWER => MAP_LOWER,
        RK_ALT_UPPER_LOWER | RK_ALT_LOWER_UPPER => {
            let offset = needle as u32 - range_st;
            // The entry was found by containment, so the offset is in range.
            debug_assert!(offset < range_len);
            let odd = (offset & 1) != 0;
            // In an upper/lower range the odd offsets are the ones that
            // change when lowercased; in a lower/upper range the even ones.
            let odd_is_lower = range_kind == RK_ALT_UPPER_LOWER;
            (odd == odd_is_lower) == MAP_LOWER
        }
        rk => {
            debug_assert!(false, "bad rangekind {:?}", rk);
            false
        }
    }
}

/// Binary searches `ranges` for the entry whose range contains `needle` and
/// returns that entry, or `None` when no range contains it.
///
/// `ranges` must be sorted by range start and the ranges must not overlap.
pub(crate) fn find_encoded_case_range(needle: char, ranges: &[u32]) -> Option<u32> {
    use core::cmp::Ordering;

    let needle = needle as u32;
    ranges
        .binary_search_by(|&entry| {
            let range_st = entry >> ENTRY_START_SHIFT;
            let range_len = entry & ENTRY_LEN_MASK;
            if range_st > needle {
                Ordering::Greater
            } else if range_st + range_len <= needle {
                Ordering::Less
            } else {
                Ordering::Equal
            }
        })
        .ok()
        .map(|n| ranges[n])
}