From 977a58ede272b3e09719f8f7ffefc5f48a1ec47a Mon Sep 17 00:00:00 2001
From: kkew3 <kps6326@hotmail.com>
Date: Sun, 29 Dec 2024 10:36:06 +0800
Subject: [PATCH 1/6] Add support for spacing modifier letters in tokenization

Related to issue https://github.com/kkew3/jieba.vim/issues/7.
---
 pythonx/jieba_vim_rs_core/src/token.rs | 36 +++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/pythonx/jieba_vim_rs_core/src/token.rs b/pythonx/jieba_vim_rs_core/src/token.rs
index ecf2b21..ecce6a3 100644
--- a/pythonx/jieba_vim_rs_core/src/token.rs
+++ b/pythonx/jieba_vim_rs_core/src/token.rs
@@ -122,12 +122,6 @@ fn categorize_char(c: char) -> CharType {
         | '\u{2e80}'..='\u{2ef3}'
         => CharType::Word(WordCharType::Hanzi),
 
-        // Default value of 'iskeyword' in Vim (ASCII range).
-        'a'..='z' | 'A'..='Z' | '0'..='9' | '_'
-        // Default value of 'iskeyword' in Vim (extended ASCII range).
-        | '\u{c0}'..='\u{ff}'
-        => CharType::Word(WordCharType::Other),
-
         // Fullwidth ASCII variants.
         '\u{ff04}' | '\u{ff08}' | '\u{ff3b}' | '\u{ff5b}' | '\u{ff5f}'
         // Halfwidth CJK punctuation.
@@ -197,6 +191,19 @@ fn categorize_char(c: char) -> CharType {
         | '\u{00b7}'
         => CharType::NonWord(NonWordCharType::IsolatedPunc),
 
+        // Default value of 'iskeyword' in Vim (ASCII range: '@,48-57,_').
+        'a'..='z' | 'A'..='Z' | '0'..='9' | '_'
+        // Default value of 'iskeyword' in Vim (extended ASCII range:
+        // '192-255').
+        | '\u{c0}'..='\u{ff}'
+        => CharType::Word(WordCharType::Other),
+        // Default value of 'iskeyword' in Vim (Unicode alphabetic: '@')
+        c if c.is_alphabetic() => CharType::Word(WordCharType::Other),
+        // Although not `is_alphabetic`, apparently spacing modifier letters
+        // (https://en.wikipedia.org/wiki/Spacing_Modifier_Letters) are word
+        // characters in Vim (both compatible and nocompatible).
+        '\u{02b0}'..='\u{02ff}' => CharType::Word(WordCharType::Other),
+
         _ => CharType::NonWord(NonWordCharType::Other),
     }
 }
@@ -1012,4 +1019,21 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_parse_spacing_modifiers_1_word() {
+        let tokens = parse_str_test("abc  ʰdef g˦hi jkl", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 2, 3, Word), // "abc"
+                test_macros::token!(3, 4, 5, Space),
+                test_macros::token!(5, 9, 10, Word), // "ʰdef"
+                test_macros::token!(10, 10, 11, Space),
+                test_macros::token!(11, 15, 16, Word), // "g˦hi"
+                test_macros::token!(16, 16, 17, Space),
+                test_macros::token!(17, 19, 20, Word), // "jkl"
+            ]
+        );
+    }
 }

From 11d0cf552e0c84904902fc47d22cbc85e5277b1e Mon Sep 17 00:00:00 2001
From: kkew3 <kps6326@hotmail.com>
Date: Sun, 29 Dec 2024 18:31:51 +0800
Subject: [PATCH 2/6] Add support for combining diacritical marks in
 tokenization

Related to issue https://github.com/kkew3/jieba.vim/issues/7.
---
 pythonx/jieba_vim_rs_core/src/token.rs | 542 +++++++++++++++++++++++--
 1 file changed, 513 insertions(+), 29 deletions(-)

diff --git a/pythonx/jieba_vim_rs_core/src/token.rs b/pythonx/jieba_vim_rs_core/src/token.rs
index ecce6a3..08e789e 100644
--- a/pythonx/jieba_vim_rs_core/src/token.rs
+++ b/pythonx/jieba_vim_rs_core/src/token.rs
@@ -28,6 +28,12 @@ enum CharType {
     Word(WordCharType),
     /// Non-word characters.
     NonWord(NonWordCharType),
+    /// Unicode combining characters. See
+    /// https://en.wikipedia.org/wiki/Combining_character. Note that this
+    /// intentionally does not include combining diacritical marks extended,
+    /// which might be included in the future in case of frequent need in
+    /// practice.
+    CombiningDiacriticalMark,
 }
 
 /// Word character types.
@@ -56,6 +62,13 @@ enum NonWordCharType {
     Other,
 }
 
+fn is_combining_diacritical_mark(c: char) -> bool {
+    match c {
+        '\u{0300}'..='\u{036f}' => true,
+        _ => false,
+    }
+}
+
 // The unicodes of CJK characters and punctuations are quoted from Github
 // repository: https://github.com/tsroten/zhon.
 // File: https://github.com/tsroten/zhon/blob/main/src/zhon/hanzi.py.
@@ -100,6 +113,8 @@ fn categorize_char(c: char) -> CharType {
         | '\u{303f}'
         => CharType::Space,
 
+        c if is_combining_diacritical_mark(c) => CharType::CombiningDiacriticalMark,
+
         // Ideographic number zero.
         | '\u{3007}'
         // CJK unified ideographs.
@@ -260,6 +275,11 @@ enum CharGroupType {
     Word(WordCharGroupType),
     /// A sequence of [`CharType::NonWord`] characters.
     NonWord(NonWordCharGroupType),
+    /// A sequence of [`CharType::CombiningDiacriticalMark`]. We have to make
+    /// room dedicated for this type (abbr. CDM), since in terms of major
+    /// class, CDM is compatible with non-汉字 word, non-汉字 word is compatible
+    /// with 汉字 word, but CDM is *not* compatible with 汉字 word.
+    CombiningDiacriticalMark,
 }
 
 /// Word character group types.
@@ -317,6 +337,9 @@ impl From<Char> for CharGroup {
                 CharType::Word(WordCharType::Other) => {
                     CharGroupType::Word(WordCharGroupType::Other)
                 }
+                CharType::CombiningDiacriticalMark => {
+                    CharGroupType::CombiningDiacriticalMark
+                }
                 CharType::NonWord(NonWordCharType::LeftPunc) => {
                     CharGroupType::NonWord(
                         NonWordCharGroupType::LeftPuncLeading,
@@ -354,11 +377,11 @@ impl CharGroup {
         }
     }
 
-    /// Push a [`Char`]. Given back `c` if their types are not compatible
-    /// in major class (space, word, nonword). `self`'s type may be modified
-    /// accordingly, but it's guaranteed that the majar class of `self` will
-    /// not be changed after push. Panics if there's gap between `self` and
-    /// `c`.
+    /// Push a [`Char`]. Giving back `c` if their types are not compatible in
+    /// major class (space, word, nonword). Combining characters are compatible
+    /// with any other major class. `self`'s type may be modified accordingly,
+    /// but it's guaranteed that the majar class of `self` will not be changed
+    /// after push. Panics if there's gap between `self` and `c`.
     fn push(&mut self, c: Char) -> Result<(), Char> {
         assert_eq!(self.col.excl_end_byte_index, c.col.start_byte_index);
 
@@ -369,17 +392,25 @@ impl CharGroup {
         use WordCharGroupType as WG;
         use WordCharType as W;
         match (&self.ty, &c.ty) {
-            (G::Space, Space) => (),
+            (G::CombiningDiacriticalMark, CombiningDiacriticalMark) => (),
+            (G::CombiningDiacriticalMark, Word(W::Other)) => {
+                self.ty = G::Word(WG::Other);
+            }
 
-            (G::Word(WG::Hanzi), Word(_)) => (),
+            (G::Space, Space) | (G::Space, CombiningDiacriticalMark) => (),
+
+            (G::Word(WG::Hanzi), Word(_))
+            | (G::Word(WG::Hanzi), CombiningDiacriticalMark) => (),
 
             (G::Word(WG::Other), Word(W::Hanzi)) => {
                 self.ty = G::Word(WG::Hanzi);
             }
-            (G::Word(WG::Other), Word(W::Other)) => (),
+            (G::Word(WG::Other), Word(W::Other))
+            | (G::Word(WG::Other), CombiningDiacriticalMark) => (),
 
             (G::NonWord(NG::LeftPuncLeading), NonWord(N::LeftPunc))
-            | (G::NonWord(NG::LeftPuncLeading), NonWord(N::Other)) => (),
+            | (G::NonWord(NG::LeftPuncLeading), NonWord(N::Other))
+            | (G::NonWord(NG::LeftPuncLeading), CombiningDiacriticalMark) => (),
             (G::NonWord(NG::LeftPuncLeading), NonWord(N::RightPunc))
             | (G::NonWord(NG::LeftPuncLeading), NonWord(N::IsolatedPunc)) => {
                 self.ty = G::NonWord(NG::LeftPuncLeadingRightPuncEnding);
@@ -390,7 +421,8 @@ impl CharGroup {
                 self.ty = G::NonWord(NG::Other);
             }
             (G::NonWord(NG::RightPuncEnding), NonWord(N::RightPunc))
-            | (G::NonWord(NG::RightPuncEnding), NonWord(N::IsolatedPunc)) => (),
+            | (G::NonWord(NG::RightPuncEnding), NonWord(N::IsolatedPunc))
+            | (G::NonWord(NG::RightPuncEnding), CombiningDiacriticalMark) => (),
 
             (
                 G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
@@ -407,10 +439,15 @@ impl CharGroup {
             | (
                 G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
                 NonWord(N::IsolatedPunc),
+            )
+            | (
+                G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
+                CombiningDiacriticalMark,
             ) => (),
 
             (G::NonWord(NG::Other), NonWord(N::LeftPunc))
-            | (G::NonWord(NG::Other), NonWord(N::Other)) => (),
+            | (G::NonWord(NG::Other), NonWord(N::Other))
+            | (G::NonWord(NG::Other), CombiningDiacriticalMark) => (),
             (G::NonWord(NG::Other), NonWord(N::RightPunc))
             | (G::NonWord(NG::Other), NonWord(N::IsolatedPunc)) => {
                 self.ty = G::NonWord(NG::RightPuncEnding);
@@ -419,7 +456,12 @@ impl CharGroup {
             _ => return Err(c),
         }
         self.chars.push(c.ch);
-        self.col.incl_end_byte_index = c.col.incl_end_byte_index;
+        // Combining diacritical marks modify previous character only, and does
+        // not take space.
+        match &c.ty {
+            CombiningDiacriticalMark => (),
+            _ => self.col.incl_end_byte_index = c.col.incl_end_byte_index,
+        }
         self.col.excl_end_byte_index = c.col.excl_end_byte_index;
         Ok(())
     }
@@ -434,14 +476,6 @@ impl CharGroup {
     }
 }
 
-// `CharGroup` is not meant to be displayed. Therefore, I'm not implementing
-// `std::fmt::Display`.
-impl ToString for CharGroup {
-    fn to_string(&self) -> String {
-        self.chars.iter().collect()
-    }
-}
-
 /// Group contiguous [`Char`]s of compatible major class into [`CharGroup`]s,
 /// and insert implicit whitespaces in between as needed.
 fn group_chars_rule(
@@ -454,14 +488,21 @@ fn group_chars_rule(
         Some(mut group) => match group.push(c) {
             Err(c) => {
                 let c = CharGroup::from(c);
-                // `group` and `c` are compatible in major type. We may need to
-                // insert implicit whitespace in between. Since it's cheap, we
-                // prepare one beforehand.
+                // `group` and `c` are not compatible in major type. We may
+                // need to insert implicit whitespace in between. Since it's
+                // cheap, we prepare one beforehand.
                 let ispace =
                     CharGroup::new_implicit_whitespace(c.col.start_byte_index);
                 use CharGroupType::*;
                 use NonWordCharGroupType as N;
+                use WordCharGroupType as W;
                 match (&group.ty, &c.ty) {
+                    (CombiningDiacriticalMark, Word(W::Hanzi))
+                    | (CombiningDiacriticalMark, NonWord(_))
+                    | (CombiningDiacriticalMark, Space) => {
+                        vec![group, c]
+                    }
+
                     // We never need to insert implicit space after a space.
                     (Space, Word(_)) | (Space, NonWord(_)) => vec![group, c],
 
@@ -488,6 +529,26 @@ fn group_chars_rule(
     }
 }
 
+/// If the first [`CharGroup`] is of type
+/// [`CharGroupType::CombiningDiacriticalMark`], convert it to
+/// [`WordCharGroupType::Other`].
+fn convert_first_cdm_group_rule(
+    prev_group: Option<CharGroup>,
+    mut group: CharGroup,
+    _args: &(),
+) -> Vec<CharGroup> {
+    match prev_group {
+        None => match group.ty {
+            CharGroupType::CombiningDiacriticalMark => {
+                group.ty = CharGroupType::Word(WordCharGroupType::Other);
+                vec![group]
+            }
+            _ => vec![group],
+        },
+        Some(prev_group) => vec![prev_group, group],
+    }
+}
+
 impl CharGroup {
     /// Split `self` into subgroups, whose types will be recategorized. Panics
     /// if `self.chars.len() != sizes.sum()`.
@@ -548,6 +609,105 @@ fn insert_implicit_whitespace_in_cut_result_rule(
     }
 }
 
+/// Assuming `group.ty` is [`WordCharGroupType::Hanzi`], this function goes
+/// through the following steps:
+///
+/// 1. Temporarily remove all combining diacritical marks from the group.
+/// 2. Cut words using `jieba`.
+/// 3. Revert removal of the combining marks and append combining marks to each
+///    cut group.
+/// 4. Count the number of chars in each cut group and return.
+fn cut_hanzi_group_and_count_chars<C: JiebaPlaceholder>(
+    group: &CharGroup,
+    jieba: &C,
+) -> Vec<usize> {
+    let mut marks = Vec::with_capacity(group.chars.len());
+    let group_string_no_marks: String = group
+        .chars
+        .iter()
+        .copied()
+        .filter_map(|c| {
+            let is_mark = is_combining_diacritical_mark(c);
+            marks.push(is_mark);
+            if is_mark {
+                None
+            } else {
+                Some(c)
+            }
+        })
+        .collect();
+    let cut_char_counts0 = utils::chain_into_vec(
+        [0],
+        jieba
+            .cut_hmm(&group_string_no_marks)
+            .into_iter()
+            .map(|part| part.chars().count()),
+    );
+    let refined_cut_char_counts =
+        append_mark_to_cuts(&marks, &cut_char_counts0);
+    refined_cut_char_counts
+}
+
+/// Step 3 of [`cut_hanzi_group_and_count_chars`].
+///
+/// For example, given a [`CharGroup`] of type [`WordCharGroupType::Hanzi`],
+/// denote 汉字 by `H`, combining marks by `m`, other non-space characters
+/// by `A`, the string representation of the group might be: `m H m H A m`.
+/// Clearly, `marks` will be `true false true false false true`. Suppose the
+/// first `H`s make up a word, then `cut_char_counts0` will be `0 2 1`, where
+/// `0` is fixed, `2` signifies the two `H`s, and `1` for the `A`. The output
+/// will be `1 3 2`, corresponding to `[m] [H m H] [A m]`.
+///
+/// Properties:
+///
+/// - Neither `marks` nor `cut_char_counts0` is empty.
+/// - The first element of `cut_char_counts0` is zero.
+/// - The `cut_char_counts0` and the output elements are guaranteed positive,
+///   except for the first element.
+/// - Number of false's in `marks` equals the sum of input `cut_char_counts0`.
+/// - The sum of the output equals the length of `marks`.
+fn append_mark_to_cuts(
+    marks: &[bool],
+    cut_char_counts0: &[usize],
+) -> Vec<usize> {
+    let mut out = vec![0; cut_char_counts0.len()];
+    let mut x = 0; // The accumulator of `marks`.
+    let mut y = 0; // The accumulator of `cut_char_counts0`.
+    let mut cum_marks = marks
+        .iter()
+        .map(|m| {
+            if !*m {
+                x += 1;
+            }
+            x
+        })
+        .peekable();
+    let mut cum_char_counts = cut_char_counts0
+        .iter()
+        .map(|c| {
+            y += c;
+            y
+        })
+        .peekable();
+    let mut out_iter = out.iter_mut().peekable();
+    while cum_marks.peek().is_some()
+        && cum_char_counts.peek().is_some()
+        && out_iter.peek().is_some()
+    {
+        let x = cum_marks.peek().unwrap();
+        let y = cum_char_counts.peek().unwrap();
+        if x <= y {
+            **out_iter.peek_mut().unwrap() += 1;
+            cum_marks.next().unwrap();
+        } else {
+            cum_char_counts.next().unwrap();
+            out_iter.next().unwrap();
+        }
+    }
+
+    out
+}
+
 /// Cut [`CharGroup`]s of type [`WordCharGroupType::Hanzi`] into sub groups,
 /// and insert implicit whitespaces in between. Since this merging rule
 /// ought to be used after [`group_chars_rule`], we won't need to care about
@@ -567,12 +727,9 @@ fn cut_hanzi_rule<C: JiebaPlaceholder>(
     use WordCharGroupType as W;
     match group.ty {
         Word(W::Hanzi) => {
-            let s = group.to_string();
-            let n_chars: Vec<_> = jieba
-                .cut_hmm(&s)
-                .into_iter()
-                .map(|part| part.chars().count())
-                .collect();
+            let n_chars = cut_hanzi_group_and_count_chars(&group, jieba);
+            // We have assumed that each subgroup contains chars of the same
+            // major class, which is subject to the property of `jieba`.
             let sub_groups = group.split_into_subgroups(n_chars);
             utils::chain_into_vec(
                 prev_group,
@@ -605,6 +762,7 @@ pub(crate) enum TokenType {
     /// contrast, is a sequence of either word or non-word non-whitespace
     /// characters.
     Word,
+    /// Tokens that contain space and/or unicode combining characters only.
     Space,
 }
 
@@ -646,6 +804,7 @@ fn parse_chars_into_words<C: JiebaPlaceholder>(
     jieba: &C,
 ) -> Vec<Token> {
     let groups = utils::stack_merge(chars, &(), group_chars_rule);
+    let groups = utils::stack_merge(groups, &(), convert_first_cdm_group_rule);
     let groups = utils::stack_merge(groups, jieba, cut_hanzi_rule);
     let groups =
         utils::stack_merge(groups, &(), remove_implicit_whitespace_rule);
@@ -681,6 +840,7 @@ fn parse_chars_into_WORDs<C: JiebaPlaceholder>(
     jieba: &C,
 ) -> Vec<Token> {
     let groups = utils::stack_merge(chars, &(), group_chars_rule);
+    let groups = utils::stack_merge(groups, &(), convert_first_cdm_group_rule);
     let groups = utils::stack_merge(groups, jieba, cut_hanzi_rule);
     let groups = utils::stack_merge(groups, &(), concat_nonspace_groups_rule);
     let groups =
@@ -770,6 +930,41 @@ mod tests {
     use once_cell::sync::OnceCell;
     use proptest::prelude::*;
 
+    #[test]
+    fn test_append_mark_to_cuts_1() {
+        let counts = vec![0usize, 2, 1];
+        let marks = vec![true, false, true, false, false, true];
+        assert_eq!(append_mark_to_cuts(&marks, &counts), vec![1, 3, 2]);
+    }
+
+    #[test]
+    fn test_append_mark_to_cuts_2() {
+        let counts = vec![0usize, 2, 1];
+        let marks = vec![true, true, false, true, false, false, true];
+        assert_eq!(append_mark_to_cuts(&marks, &counts), vec![2, 3, 2]);
+    }
+
+    #[test]
+    fn test_append_mark_to_cuts_3() {
+        let counts = vec![0usize, 1, 1, 1];
+        let marks = vec![true, false, true, false, false, true];
+        assert_eq!(append_mark_to_cuts(&marks, &counts), vec![1, 2, 1, 2]);
+    }
+
+    #[test]
+    fn test_append_mark_to_cuts_4() {
+        let counts = vec![0usize, 2, 2];
+        let marks = vec![false, true, false, true, false, false];
+        assert_eq!(append_mark_to_cuts(&marks, &counts), vec![0, 4, 2]);
+    }
+
+    #[test]
+    fn test_append_mark_to_cuts_5() {
+        let counts = vec![0usize, 2];
+        let marks = vec![false, false];
+        assert_eq!(append_mark_to_cuts(&marks, &counts), vec![0, 2]);
+    }
+
     impl JiebaPlaceholder for Jieba {
         fn cut_hmm<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
             self.cut(sentence, true)
@@ -1036,4 +1231,293 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_parse_combining_chars_modifying_space_word() {
+        // i.e. "xx ̆cab  ̂de".
+        let tokens = parse_str_test("xx \u{0306}cab  \u{0302}de", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 1, 2, Word),
+                test_macros::token!(2, 2, 5, Space),
+                test_macros::token!(5, 7, 8, Word),
+                test_macros::token!(8, 9, 12, Space),
+                test_macros::token!(12, 13, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_modifying_space_WORD() {
+        // i.e. "xx ̆cab  ̂de".
+        let tokens = parse_str_test("xx \u{0306}cab  \u{0302}de", false);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 1, 2, Word),
+                test_macros::token!(2, 2, 5, Space),
+                test_macros::token!(5, 7, 8, Word),
+                test_macros::token!(8, 9, 12, Space),
+                test_macros::token!(12, 13, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_combining_chars_modifying_letter_1_word() {
+        // i.e. "xy a͡bc d̂ef".
+        let tokens = parse_str_test("xy a\u{0361}bc d\u{0302}ef", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 1, 2, Word),
+                test_macros::token!(2, 2, 3, Space),
+                test_macros::token!(3, 7, 8, Word),
+                test_macros::token!(8, 8, 9, Space),
+                test_macros::token!(9, 13, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_modifying_letter_1_WORD() {
+        // i.e. "xy a͡bc d̂ef".
+        let tokens = parse_str_test("xy a\u{0361}bc d\u{0302}ef", false);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 1, 2, Word),
+                test_macros::token!(2, 2, 3, Space),
+                test_macros::token!(3, 7, 8, Word),
+                test_macros::token!(8, 8, 9, Space),
+                test_macros::token!(9, 13, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_combining_chars_modifying_letter_2_word() {
+        // Example from http://demo.danielmclaren.com/2015/diacriticism/.
+        // i.e. "f̸̰̻̯̙̳́̍͗̕o͕̟̫ͮ͆̉̾̍̉̏o̵͖̪͇̪̥͗̈ͭ̕ b̶̬̣̜̱̜͉̾ͩ͌a͚̯̮͒ͬ̆̊̍͂̕r̹̥̟̘̱͙͊͗̀̓".
+        let tokens = parse_str_test(
+            "f\u{0330}\u{0338}\u{0315}\u{033b}\u{0301}\u{032f}\u{0319}\
+            \u{030d}\u{0357}\u{0333}o\u{036e}\u{0355}\u{0346}\u{031f}\u{0309}\
+            \u{033e}\u{032b}\u{030d}\u{0309}\u{030f}o\u{0357}\u{0356}\u{032a}\
+            \u{0308}\u{0347}\u{032a}\u{0315}\u{036d}\u{0325}\u{0335} b\u{032c}\
+            \u{0323}\u{0336}\u{031c}\u{033e}\u{0331}\u{0369}\u{031c}\u{0349}\
+            \u{034c}a\u{035a}\u{0352}\u{036c}\u{0306}\u{0315}\u{030a}\u{030d}\
+            \u{032f}\u{032e}\u{0342}r\u{0339}\u{034a}\u{0357}\u{0325}\u{031f}\
+            \u{0318}\u{0331}\u{0340}\u{0359}\u{0343}",
+            true,
+        );
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 42, 63, Word),
+                test_macros::token!(63, 63, 64, Space),
+                test_macros::token!(64, 106, 127, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_modifying_letter_2_WORD() {
+        // Example from http://demo.danielmclaren.com/2015/diacriticism/.
+        // i.e. "f̸̰̻̯̙̳́̍͗̕o͕̟̫ͮ͆̉̾̍̉̏o̵͖̪͇̪̥͗̈ͭ̕ b̶̬̣̜̱̜͉̾ͩ͌a͚̯̮͒ͬ̆̊̍͂̕r̹̥̟̘̱͙͊͗̀̓".
+        let tokens = parse_str_test(
+            "f\u{0330}\u{0338}\u{0315}\u{033b}\u{0301}\u{032f}\u{0319}\
+            \u{030d}\u{0357}\u{0333}o\u{036e}\u{0355}\u{0346}\u{031f}\u{0309}\
+            \u{033e}\u{032b}\u{030d}\u{0309}\u{030f}o\u{0357}\u{0356}\u{032a}\
+            \u{0308}\u{0347}\u{032a}\u{0315}\u{036d}\u{0325}\u{0335} b\u{032c}\
+            \u{0323}\u{0336}\u{031c}\u{033e}\u{0331}\u{0369}\u{031c}\u{0349}\
+            \u{034c}a\u{035a}\u{0352}\u{036c}\u{0306}\u{0315}\u{030a}\u{030d}\
+            \u{032f}\u{032e}\u{0342}r\u{0339}\u{034a}\u{0357}\u{0325}\u{031f}\
+            \u{0318}\u{0331}\u{0340}\u{0359}\u{0343}",
+            false,
+        );
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 42, 63, Word),
+                test_macros::token!(63, 63, 64, Space),
+                test_macros::token!(64, 106, 127, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_combining_chars_modifying_hanzi_1_word() {
+        let tokens = parse_str_test("你好\u{0302}世界", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 3, 8, Word),
+                test_macros::token!(8, 11, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_modifying_hanzi_1_WORD() {
+        let tokens = parse_str_test("你好\u{0302}世界", false);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 3, 8, Word),
+                test_macros::token!(8, 11, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_combining_chars_modifying_hanzi_2_word() {
+        let tokens = parse_str_test("你\u{0302}好世界", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 5, 8, Word),
+                test_macros::token!(8, 11, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_modifying_hanzi_2_WORD() {
+        let tokens = parse_str_test("你\u{0302}好世界", false);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 5, 8, Word),
+                test_macros::token!(8, 11, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_combining_chars_modifying_hanzi_3_word() {
+        let tokens = parse_str_test("你好世界\u{0302}", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 3, 6, Word),
+                test_macros::token!(6, 9, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_modifying_hanzi_3_WORD() {
+        let tokens = parse_str_test("你好世界\u{0302}", false);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 3, 6, Word),
+                test_macros::token!(6, 9, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_starts_with_combining_chars_1_word() {
+        // i.e. "̂̂̂̂̂ abc".
+        let tokens = parse_str_test(
+            "\u{0302}\u{0302}\u{0302}\u{0302}\u{0302} abc",
+            true,
+        );
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 0, 10, Word),
+                test_macros::token!(10, 10, 11, Space),
+                test_macros::token!(11, 13, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_starts_with_combining_chars_1_WORD() {
+        // i.e. "̂̂̂̂̂ abc".
+        let tokens = parse_str_test(
+            "\u{0302}\u{0302}\u{0302}\u{0302}\u{0302} abc",
+            false,
+        );
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 0, 10, Word),
+                test_macros::token!(10, 10, 11, Space),
+                test_macros::token!(11, 13, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_starts_with_combining_chars_2_word() {
+        // i.e. ̂̂̂̂̂abc".
+        let tokens =
+            parse_str_test("\u{0302}\u{0302}\u{0302}\u{0302}\u{0302}abc", true);
+        assert_eq!(tokens, vec![test_macros::token!(0, 12, 13, Word)]);
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_starts_with_combining_chars_2_WORD() {
+        // i.e. ̂̂̂̂̂abc".
+        let tokens = parse_str_test(
+            "\u{0302}\u{0302}\u{0302}\u{0302}\u{0302}abc",
+            false,
+        );
+        assert_eq!(tokens, vec![test_macros::token!(0, 12, 13, Word)]);
+    }
+
+    #[test]
+    fn test_parse_starts_with_combining_chars_3_word() {
+        let tokens = parse_str_test("\u{0302}你好世界", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 0, 2, Word),
+                test_macros::token!(2, 5, 8, Word),
+                test_macros::token!(8, 11, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_starts_with_combining_chars_3_WORD() {
+        let tokens = parse_str_test("\u{0302}你好世界", false);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 5, 8, Word),
+                test_macros::token!(8, 11, 14, Word),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_combining_chars_only_word() {
+        // i.e. "̂̂̂̂̂"
+        let tokens =
+            parse_str_test("\u{0302}\u{0302}\u{0302}\u{0302}\u{0302}", true);
+        assert_eq!(tokens, vec![test_macros::token!(0, 0, 10, Word)]);
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_parse_combining_chars_only_WORD() {
+        // i.e. "̂̂̂̂̂"
+        let tokens =
+            parse_str_test("\u{0302}\u{0302}\u{0302}\u{0302}\u{0302}", false);
+        assert_eq!(tokens, vec![test_macros::token!(0, 0, 10, Word)]);
+    }
 }

From 19b761107e25801e81c13f6f9d11443cc8c487a6 Mon Sep 17 00:00:00 2001
From: kkew3 <kps6326@hotmail.com>
Date: Sun, 29 Dec 2024 19:22:08 +0800
Subject: [PATCH 3/6] Refactor char grouping code to leverage rust's pattern
 matching exhaustive check

---
 pythonx/jieba_vim_rs_core/src/token.rs | 148 ++++++++++++++-----------
 1 file changed, 85 insertions(+), 63 deletions(-)

diff --git a/pythonx/jieba_vim_rs_core/src/token.rs b/pythonx/jieba_vim_rs_core/src/token.rs
index 08e789e..0513000 100644
--- a/pythonx/jieba_vim_rs_core/src/token.rs
+++ b/pythonx/jieba_vim_rs_core/src/token.rs
@@ -377,14 +377,33 @@ impl CharGroup {
         }
     }
 
-    /// Push a [`Char`]. Giving back `c` if their types are not compatible in
-    /// major class (space, word, nonword). Combining characters are compatible
-    /// with any other major class. `self`'s type may be modified accordingly,
-    /// but it's guaranteed that the majar class of `self` will not be changed
-    /// after push. Panics if there's gap between `self` and `c`.
-    fn push(&mut self, c: Char) -> Result<(), Char> {
+    /// Try to push a [`Char`]. If `self` and `c` are compatible in major class
+    /// (space, word, nonword), `c` is appended; the type of `self` may be
+    /// modified accordingly, but it's guaranteed that the major class will not
+    /// be changed. If `c` is of type [`CharType::CombiningDiacriticalMark`],
+    /// it's guaranteed to be compatible with `self`. Otherwise, return a vec of
+    /// [`CharGroup`]s of either length 1 or 2, where the last element is the
+    /// singleton char group comprised of `c`, with implicit whitespace
+    /// optionally inserted before. In this case, `self` will not be modified.
+    /// Panics if there's a gap between `self` and `c`.
+    fn push(&mut self, c: Char) -> Result<(), Vec<CharGroup>> {
         assert_eq!(self.col.excl_end_byte_index, c.col.start_byte_index);
 
+        fn do_push(
+            group: &mut CharGroup,
+            c: Char,
+        ) -> Result<(), Vec<CharGroup>> {
+            group.chars.push(c.ch);
+            // Combining diacritical marks modify previous character only, and does
+            // not take space.
+            match &c.ty {
+                CombiningDiacriticalMark => (),
+                _ => group.col.incl_end_byte_index = c.col.incl_end_byte_index,
+            }
+            group.col.excl_end_byte_index = c.col.excl_end_byte_index;
+            Ok(())
+        }
+
         use CharGroupType as G;
         use CharType::*;
         use NonWordCharGroupType as NG;
@@ -392,37 +411,54 @@ impl CharGroup {
         use WordCharGroupType as WG;
         use WordCharType as W;
         match (&self.ty, &c.ty) {
-            (G::CombiningDiacriticalMark, CombiningDiacriticalMark) => (),
+            // === Compatible cases ===
+            (G::CombiningDiacriticalMark, CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
             (G::CombiningDiacriticalMark, Word(W::Other)) => {
                 self.ty = G::Word(WG::Other);
+                do_push(self, c)
             }
 
-            (G::Space, Space) | (G::Space, CombiningDiacriticalMark) => (),
+            (G::Space, Space) | (G::Space, CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
 
             (G::Word(WG::Hanzi), Word(_))
-            | (G::Word(WG::Hanzi), CombiningDiacriticalMark) => (),
+            | (G::Word(WG::Hanzi), CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
 
             (G::Word(WG::Other), Word(W::Hanzi)) => {
                 self.ty = G::Word(WG::Hanzi);
+                do_push(self, c)
             }
             (G::Word(WG::Other), Word(W::Other))
-            | (G::Word(WG::Other), CombiningDiacriticalMark) => (),
+            | (G::Word(WG::Other), CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
 
             (G::NonWord(NG::LeftPuncLeading), NonWord(N::LeftPunc))
             | (G::NonWord(NG::LeftPuncLeading), NonWord(N::Other))
-            | (G::NonWord(NG::LeftPuncLeading), CombiningDiacriticalMark) => (),
+            | (G::NonWord(NG::LeftPuncLeading), CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
             (G::NonWord(NG::LeftPuncLeading), NonWord(N::RightPunc))
             | (G::NonWord(NG::LeftPuncLeading), NonWord(N::IsolatedPunc)) => {
                 self.ty = G::NonWord(NG::LeftPuncLeadingRightPuncEnding);
+                do_push(self, c)
             }
 
             (G::NonWord(NG::RightPuncEnding), NonWord(N::LeftPunc))
             | (G::NonWord(NG::RightPuncEnding), NonWord(N::Other)) => {
                 self.ty = G::NonWord(NG::Other);
+                do_push(self, c)
             }
             (G::NonWord(NG::RightPuncEnding), NonWord(N::RightPunc))
             | (G::NonWord(NG::RightPuncEnding), NonWord(N::IsolatedPunc))
-            | (G::NonWord(NG::RightPuncEnding), CombiningDiacriticalMark) => (),
+            | (G::NonWord(NG::RightPuncEnding), CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
 
             (
                 G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
@@ -431,7 +467,10 @@ impl CharGroup {
             | (
                 G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
                 NonWord(N::Other),
-            ) => self.ty = G::NonWord(NG::LeftPuncLeading),
+            ) => {
+                self.ty = G::NonWord(NG::LeftPuncLeading);
+                do_push(self, c)
+            }
             (
                 G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
                 NonWord(N::RightPunc),
@@ -443,27 +482,47 @@ impl CharGroup {
             | (
                 G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
                 CombiningDiacriticalMark,
-            ) => (),
+            ) => do_push(self, c),
 
             (G::NonWord(NG::Other), NonWord(N::LeftPunc))
             | (G::NonWord(NG::Other), NonWord(N::Other))
-            | (G::NonWord(NG::Other), CombiningDiacriticalMark) => (),
+            | (G::NonWord(NG::Other), CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
             (G::NonWord(NG::Other), NonWord(N::RightPunc))
             | (G::NonWord(NG::Other), NonWord(N::IsolatedPunc)) => {
                 self.ty = G::NonWord(NG::RightPuncEnding);
+                do_push(self, c)
             }
 
-            _ => return Err(c),
-        }
-        self.chars.push(c.ch);
-        // Combining diacritical marks modify previous character only, and does
-        // not take space.
-        match &c.ty {
-            CombiningDiacriticalMark => (),
-            _ => self.col.incl_end_byte_index = c.col.incl_end_byte_index,
+            // === Not compatible cases ===
+            (G::CombiningDiacriticalMark, Word(W::Hanzi))
+            | (G::CombiningDiacriticalMark, NonWord(_))
+            | (G::CombiningDiacriticalMark, Space) => Err(vec![c.into()]),
+
+            // We never need to insert implicit space after a space.
+            (G::Space, Word(_)) | (G::Space, NonWord(_)) => Err(vec![c.into()]),
+
+            (G::Word(_), Space) => Err(vec![c.into()]),
+            (G::Word(_), NonWord(N::LeftPunc))
+            | (G::Word(_), NonWord(N::IsolatedPunc)) => {
+                let c = CharGroup::from(c);
+                let ispace =
+                    CharGroup::new_implicit_whitespace(c.col.start_byte_index);
+                Err(vec![ispace, c])
+            }
+            (G::Word(_), NonWord(_)) => Err(vec![c.into()]),
+
+            (G::NonWord(_), Space) => Err(vec![c.into()]),
+            (G::NonWord(NG::RightPuncEnding), Word(_))
+            | (G::NonWord(NG::LeftPuncLeadingRightPuncEnding), Word(_)) => {
+                let c = CharGroup::from(c);
+                let ispace =
+                    CharGroup::new_implicit_whitespace(c.col.start_byte_index);
+                Err(vec![ispace, c])
+            }
+            (G::NonWord(_), Word(_)) => Err(vec![c.into()]),
         }
-        self.col.excl_end_byte_index = c.col.excl_end_byte_index;
-        Ok(())
     }
 
     /// Append `group` after `self`. The type of `self` won't be changed.
@@ -486,44 +545,7 @@ fn group_chars_rule(
     match group {
         None => vec![CharGroup::from(c)],
         Some(mut group) => match group.push(c) {
-            Err(c) => {
-                let c = CharGroup::from(c);
-                // `group` and `c` are not compatible in major type. We may
-                // need to insert implicit whitespace in between. Since it's
-                // cheap, we prepare one beforehand.
-                let ispace =
-                    CharGroup::new_implicit_whitespace(c.col.start_byte_index);
-                use CharGroupType::*;
-                use NonWordCharGroupType as N;
-                use WordCharGroupType as W;
-                match (&group.ty, &c.ty) {
-                    (CombiningDiacriticalMark, Word(W::Hanzi))
-                    | (CombiningDiacriticalMark, NonWord(_))
-                    | (CombiningDiacriticalMark, Space) => {
-                        vec![group, c]
-                    }
-
-                    // We never need to insert implicit space after a space.
-                    (Space, Word(_)) | (Space, NonWord(_)) => vec![group, c],
-
-                    (Word(_), Space) => vec![group, c],
-                    (Word(_), NonWord(N::LeftPuncLeading))
-                    | (Word(_), NonWord(N::LeftPuncLeadingRightPuncEnding)) => {
-                        vec![group, ispace, c]
-                    }
-                    (Word(_), NonWord(_)) => vec![group, c],
-
-                    (NonWord(_), Space) => vec![group, c],
-                    (NonWord(N::RightPuncEnding), Word(_))
-                    | (NonWord(N::LeftPuncLeadingRightPuncEnding), Word(_)) => {
-                        vec![group, ispace, c]
-                    }
-                    (NonWord(_), Word(_)) => vec![group, c],
-
-                    // Should not happen.
-                    _ => panic!(),
-                }
-            }
+            Err(joined) => utils::chain_into_vec([group], joined),
             Ok(()) => vec![group],
         },
     }

From e23c0b7b2ae60d0eeee27b5c6f24e3c75df3be4e Mon Sep 17 00:00:00 2001
From: kkew3 <kps6326@hotmail.com>
Date: Mon, 30 Dec 2024 06:54:27 +0800
Subject: [PATCH 4/6] Simplify Chinese punctuation association rule

Keep only some of the right punctuation marks, e.g. the full-width comma and full stop. All other punctuation marks, e.g. full-width parentheses and dashes, are regarded as the "other" type. This way, emoji will then be treated similarly to non-word characters of the "other" type, in a consistent manner.
---
 pythonx/jieba_vim_rs_core/src/token.rs | 163 ++++++-------------------
 1 file changed, 37 insertions(+), 126 deletions(-)

diff --git a/pythonx/jieba_vim_rs_core/src/token.rs b/pythonx/jieba_vim_rs_core/src/token.rs
index 0513000..10dbb69 100644
--- a/pythonx/jieba_vim_rs_core/src/token.rs
+++ b/pythonx/jieba_vim_rs_core/src/token.rs
@@ -48,16 +48,9 @@ enum WordCharType {
 /// Non-word character types.
 #[derive(Debug)]
 enum NonWordCharType {
-    /// Left-associated CJK punctuations. When a word character is followed by
-    /// a [`NonWordCharType::LeftPunc`], an implicit space is added in between.
-    LeftPunc,
     /// Right-associated CJK punctuations. When a word character follows a
     /// [`NonWordCharType::RightPunc`], an implicit space is added in between.
     RightPunc,
-    /// Isolated CJK punctuations. When a word character is followed by or
-    /// follows a [`NonWordCharType::IsolatedPunc`], an implicit space is added
-    /// in between.
-    IsolatedPunc,
     /// Other non-word characters.
     Other,
 }
@@ -97,7 +90,7 @@ fn is_combining_diacritical_mark(c: char) -> bool {
 // IN THE SOFTWARE.
 // ---
 //
-// The partition of CJK punctuations into left/right/isolated types are decided
+// The partition of CJK punctuations into right/other types are decided
 // by myself, with help from https://www.compart.com/en/unicode. For CJK
 // punctuations that I don't know how to categorize, I've marked them with `??`
 // on the right.
@@ -139,29 +132,25 @@ fn categorize_char(c: char) -> CharType {
 
         // Fullwidth ASCII variants.
         '\u{ff04}' | '\u{ff08}' | '\u{ff3b}' | '\u{ff5b}' | '\u{ff5f}'
+        | '\u{ff09}' | '\u{ff3d}' | '\u{ff5d}' | '\u{ff60}' | '\u{ff05}'
         // Halfwidth CJK punctuation.
-        | '\u{ff62}'
+        | '\u{ff62}' | '\u{ff63}'
         // CJK angle and corner brackets.
         | '\u{3008}' | '\u{300a}' | '\u{300c}' | '\u{300e}' | '\u{3010}'
+        | '\u{3009}' | '\u{300b}' | '\u{300d}' | '\u{300f}' | '\u{3011}'
         // CJK brackets and symbols/punctuation.
         | '\u{3014}' | '\u{3016}' | '\u{3018}' | '\u{301a}' | '\u{301d}'
+        | '\u{3015}' | '\u{3017}' | '\u{3019}' | '\u{301b}' | '\u{301e}'
         // Quotation marks and apostrophe.
-        | '\u{2018}' | '\u{201c}'
-        => CharType::NonWord(NonWordCharType::LeftPunc),
+        | '\u{2018}' | '\u{201c}' | '\u{2019}' | '\u{201d}'
+        => CharType::NonWord(NonWordCharType::Other),
 
         // Fullwidth ASCII variants.
-        '\u{ff09}' | '\u{ff0c}' | '\u{ff1a}' | '\u{ff1b}' | '\u{ff3d}'
-        | '\u{ff5d}' | '\u{ff60}' | '\u{ff05}'
+        '\u{ff0c}' | '\u{ff1a}' | '\u{ff1b}'
         // Halfwidth CJK punctuation.
-        | '\u{ff63}' | '\u{ff64}'
+        | '\u{ff64}'
         // CJK symbols and punctuation.
         | '\u{3001}'
-        // CJK angle and corner brackets.
-        | '\u{3009}' | '\u{300b}' | '\u{300d}' | '\u{300f}' | '\u{3011}'
-        // CJK brackets and symbols/punctuation.
-        | '\u{3015}' | '\u{3017}' | '\u{3019}' | '\u{301b}' | '\u{301e}'
-        // Quotation marks and apostrophe.
-        | '\u{2019}' | '\u{201d}'
         // Small form variants.
         | '\u{fe51}' | '\u{fe54}'
         // Fullwidth full stop.
@@ -204,7 +193,7 @@ fn categorize_char(c: char) -> CharType {
         | '\u{fe4f}'
         // Latin punctuation.
         | '\u{00b7}'
-        => CharType::NonWord(NonWordCharType::IsolatedPunc),
+        => CharType::NonWord(NonWordCharType::Other),
 
         // Default value of 'iskeyword' in Vim (ASCII range: '@,48-57,_').
         'a'..='z' | 'A'..='Z' | '0'..='9' | '_'
@@ -295,25 +284,11 @@ enum WordCharGroupType {
 /// Non-word character group types.
 #[derive(Debug, PartialEq, Eq)]
 enum NonWordCharGroupType {
-    /// A sequence of [`CharType::NonWord`] that starts with a
-    /// [`NonWordCharType::LeftPunc`] or [`NonWordCharType::IsolatedPunc`],
-    /// but does not end with a [`NonWordCharType::RightPunc`] or
-    /// [`NonWordCharType::IsolatedPunc`].
-    LeftPuncLeading,
     /// A sequence of [`CharType::NonWord`] that ends with a
-    /// [`NonWordCharType::RightPunc`] or [`NonWordCharType::IsolatedPunc`]
-    /// but does not start with a [`NonWordCharType::LeftPunc`] or
-    /// [`NonWordCharType::IsolatedPunc`].
+    /// [`NonWordCharType::RightPunc`].
     RightPuncEnding,
-    /// A sequence of [`CharType::NonWord`] that starts with a
-    /// [`NonWordCharType::LeftPunc`] or [`NonWordCharType::IsolatedPunc`],
-    /// and ends with a [`NonWordCharType::RightPunc`] or
-    /// [`NonWordCharType::IsolatedPunc`].
-    LeftPuncLeadingRightPuncEnding,
-    /// A sequence of [`CharType::NonWord`] that neither starts with a
-    /// [`NonWordCharType::LeftPunc`] or [`NonWordCharType::IsolatedPunc`],
-    /// nor ends with a [`NonWordCharType::RightPunc`] or
-    /// [`NonWordCharType::IsolatedPunc`].
+    /// A sequence of [`CharType::NonWord`] that does not end with a
+    /// [`NonWordCharType::RightPunc`].
     Other,
 }
 
@@ -340,21 +315,11 @@ impl From<Char> for CharGroup {
                 CharType::CombiningDiacriticalMark => {
                     CharGroupType::CombiningDiacriticalMark
                 }
-                CharType::NonWord(NonWordCharType::LeftPunc) => {
-                    CharGroupType::NonWord(
-                        NonWordCharGroupType::LeftPuncLeading,
-                    )
-                }
                 CharType::NonWord(NonWordCharType::RightPunc) => {
                     CharGroupType::NonWord(
                         NonWordCharGroupType::RightPuncEnding,
                     )
                 }
-                CharType::NonWord(NonWordCharType::IsolatedPunc) => {
-                    CharGroupType::NonWord(
-                        NonWordCharGroupType::LeftPuncLeadingRightPuncEnding,
-                    )
-                }
                 CharType::NonWord(NonWordCharType::Other) => {
                     CharGroupType::NonWord(NonWordCharGroupType::Other)
                 }
@@ -438,59 +403,20 @@ impl CharGroup {
                 do_push(self, c)
             }
 
-            (G::NonWord(NG::LeftPuncLeading), NonWord(N::LeftPunc))
-            | (G::NonWord(NG::LeftPuncLeading), NonWord(N::Other))
-            | (G::NonWord(NG::LeftPuncLeading), CombiningDiacriticalMark) => {
-                do_push(self, c)
-            }
-            (G::NonWord(NG::LeftPuncLeading), NonWord(N::RightPunc))
-            | (G::NonWord(NG::LeftPuncLeading), NonWord(N::IsolatedPunc)) => {
-                self.ty = G::NonWord(NG::LeftPuncLeadingRightPuncEnding);
-                do_push(self, c)
-            }
-
-            (G::NonWord(NG::RightPuncEnding), NonWord(N::LeftPunc))
-            | (G::NonWord(NG::RightPuncEnding), NonWord(N::Other)) => {
+            (G::NonWord(NG::RightPuncEnding), NonWord(N::Other)) => {
                 self.ty = G::NonWord(NG::Other);
                 do_push(self, c)
             }
             (G::NonWord(NG::RightPuncEnding), NonWord(N::RightPunc))
-            | (G::NonWord(NG::RightPuncEnding), NonWord(N::IsolatedPunc))
             | (G::NonWord(NG::RightPuncEnding), CombiningDiacriticalMark) => {
                 do_push(self, c)
             }
 
-            (
-                G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
-                NonWord(N::LeftPunc),
-            )
-            | (
-                G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
-                NonWord(N::Other),
-            ) => {
-                self.ty = G::NonWord(NG::LeftPuncLeading);
-                do_push(self, c)
-            }
-            (
-                G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
-                NonWord(N::RightPunc),
-            )
-            | (
-                G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
-                NonWord(N::IsolatedPunc),
-            )
-            | (
-                G::NonWord(NG::LeftPuncLeadingRightPuncEnding),
-                CombiningDiacriticalMark,
-            ) => do_push(self, c),
-
-            (G::NonWord(NG::Other), NonWord(N::LeftPunc))
-            | (G::NonWord(NG::Other), NonWord(N::Other))
+            (G::NonWord(NG::Other), NonWord(N::Other))
             | (G::NonWord(NG::Other), CombiningDiacriticalMark) => {
                 do_push(self, c)
             }
-            (G::NonWord(NG::Other), NonWord(N::RightPunc))
-            | (G::NonWord(NG::Other), NonWord(N::IsolatedPunc)) => {
+            (G::NonWord(NG::Other), NonWord(N::RightPunc)) => {
                 self.ty = G::NonWord(NG::RightPuncEnding);
                 do_push(self, c)
             }
@@ -504,18 +430,10 @@ impl CharGroup {
             (G::Space, Word(_)) | (G::Space, NonWord(_)) => Err(vec![c.into()]),
 
             (G::Word(_), Space) => Err(vec![c.into()]),
-            (G::Word(_), NonWord(N::LeftPunc))
-            | (G::Word(_), NonWord(N::IsolatedPunc)) => {
-                let c = CharGroup::from(c);
-                let ispace =
-                    CharGroup::new_implicit_whitespace(c.col.start_byte_index);
-                Err(vec![ispace, c])
-            }
             (G::Word(_), NonWord(_)) => Err(vec![c.into()]),
 
             (G::NonWord(_), Space) => Err(vec![c.into()]),
-            (G::NonWord(NG::RightPuncEnding), Word(_))
-            | (G::NonWord(NG::LeftPuncLeadingRightPuncEnding), Word(_)) => {
+            (G::NonWord(NG::RightPuncEnding), Word(_)) => {
                 let c = CharGroup::from(c);
                 let ispace =
                     CharGroup::new_implicit_whitespace(c.col.start_byte_index);
@@ -1017,11 +935,11 @@ mod tests {
         ));
         assert!(matches!(
             categorize_char('（'),
-            CharType::NonWord(NonWordCharType::LeftPunc)
+            CharType::NonWord(NonWordCharType::Other)
         ));
         assert!(matches!(
             categorize_char('—'),
-            CharType::NonWord(NonWordCharType::IsolatedPunc)
+            CharType::NonWord(NonWordCharType::Other)
         ));
         assert!(matches!(categorize_char('\u{3000}'), CharType::Space));
     }
@@ -1134,10 +1052,10 @@ mod tests {
         assert_eq!(
             tokens,
             vec![
-                test_macros::token!(0, 4, 5, Word),
-                test_macros::token!(5, 5, 6, Word),
-                test_macros::token!(6, 6, 7, Space),
-                test_macros::token!(7, 11, 12, Word),
+                test_macros::token!(0, 4, 5, Word),  // "hello"
+                test_macros::token!(5, 5, 6, Word),  // ","
+                test_macros::token!(6, 6, 7, Space), // " "
+                test_macros::token!(7, 11, 12, Word), // "world"
             ]
         );
     }
@@ -1149,9 +1067,9 @@ mod tests {
         assert_eq!(
             tokens,
             vec![
-                test_macros::token!(0, 5, 6, Word),
-                test_macros::token!(6, 6, 7, Space),
-                test_macros::token!(7, 11, 12, Word),
+                test_macros::token!(0, 5, 6, Word), // "hello,"
+                test_macros::token!(6, 6, 7, Space), // " "
+                test_macros::token!(7, 11, 12, Word), // "world"
             ]
         );
     }
@@ -1162,8 +1080,8 @@ mod tests {
         assert_eq!(
             tokens,
             vec![
-                test_macros::token!(0, 1, 4, Word),
-                test_macros::token!(4, 10, 11, Word),
+                test_macros::token!(0, 1, 4, Word),   // "B超"
+                test_macros::token!(4, 10, 11, Word), // "foo_bar"
             ]
         );
     }
@@ -1175,8 +1093,8 @@ mod tests {
         assert_eq!(
             tokens,
             vec![
-                test_macros::token!(0, 1, 4, Word),
-                test_macros::token!(4, 10, 11, Word),
+                test_macros::token!(0, 1, 4, Word),   // "B超"
+                test_macros::token!(4, 10, 11, Word), // "foo_bar"
             ]
         );
     }
@@ -1187,10 +1105,10 @@ mod tests {
         assert_eq!(
             tokens,
             vec![
-                test_macros::token!(0, 1, 4, Word),
-                test_macros::token!(4, 4, 7, Word),
-                test_macros::token!(7, 9, 10, Word),
-                test_macros::token!(10, 16, 19, Word),
+                test_macros::token!(0, 1, 4, Word),    // "B超"
+                test_macros::token!(4, 4, 7, Word),    // "，"
+                test_macros::token!(7, 9, 10, Word),   // "foo"
+                test_macros::token!(10, 16, 19, Word), // "。。。"
             ]
         );
     }
@@ -1202,8 +1120,8 @@ mod tests {
         assert_eq!(
             tokens,
             vec![
-                test_macros::token!(0, 4, 7, Word),
-                test_macros::token!(7, 16, 19, Word),
+                test_macros::token!(0, 4, 7, Word), // "B超，"
+                test_macros::token!(7, 16, 19, Word), // "foo。。。"
             ]
         );
     }
@@ -1227,14 +1145,7 @@ mod tests {
     #[allow(non_snake_case)]
     fn test_parse_hanzi_1_WORD() {
         let tokens = parse_str_test("（你好——世界）。", false);
-        assert_eq!(
-            tokens,
-            vec![
-                test_macros::token!(0, 6, 9, Word), // "（你好"
-                test_macros::token!(9, 12, 15, Word), // "——"
-                test_macros::token!(15, 24, 27, Word), // "世界）。"
-            ]
-        );
+        assert_eq!(tokens, vec![test_macros::token!(0, 24, 27, Word)]);
     }
 
     #[test]

From 8b8933959360350d6ca7023048cbe60456f4204a Mon Sep 17 00:00:00 2001
From: kkew3 <kps6326@hotmail.com>
Date: Mon, 30 Dec 2024 07:20:22 +0800
Subject: [PATCH 5/6] Add support for emoji in tokenization

Related to issue https://github.com/kkew3/jieba.vim/issues/7.
---
 pythonx/jieba_vim_rs_core/Cargo.toml   |   1 +
 pythonx/jieba_vim_rs_core/src/token.rs | 121 +++++++++++++++++++++++--
 2 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/pythonx/jieba_vim_rs_core/Cargo.toml b/pythonx/jieba_vim_rs_core/Cargo.toml
index 408e9c0..0286980 100644
--- a/pythonx/jieba_vim_rs_core/Cargo.toml
+++ b/pythonx/jieba_vim_rs_core/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.1.2"
 edition = "2021"
 
 [dependencies]
+unic-emoji-char = "0.9"
 
 [dev-dependencies]
 jieba-rs = "0.7"
diff --git a/pythonx/jieba_vim_rs_core/src/token.rs b/pythonx/jieba_vim_rs_core/src/token.rs
index 10dbb69..9c2d74a 100644
--- a/pythonx/jieba_vim_rs_core/src/token.rs
+++ b/pythonx/jieba_vim_rs_core/src/token.rs
@@ -34,6 +34,12 @@ enum CharType {
     /// which might be included in the future in case of frequent need in
     /// practice.
     CombiningDiacriticalMark,
+    /// Emojis are essentially non-word characters. However, Vim treats emojis
+    /// differently from other non-word characters such as punctuation. For
+    /// example, `🖖🖖🖖🖖,,,abc` contains three tokens (vulcan salutes,
+    /// commas, and "abc"), instead of two tokens (vulcan salutes and commas,
+    /// and "abc").
+    Emoji,
 }
 
 /// Word character types.
@@ -51,7 +57,7 @@ enum NonWordCharType {
     /// Right-associated CJK punctuations. When a word character follows a
     /// [`NonWordCharType::RightPunc`], an implicit space is added in between.
     RightPunc,
-    /// Other non-word characters.
+    /// Other non-word characters. This includes the zero-width joiner (ZWJ).
     Other,
 }
 
@@ -208,6 +214,8 @@ fn categorize_char(c: char) -> CharType {
         // characters in Vim (both compatible and nocompatible).
         '\u{02b0}'..='\u{02ff}' => CharType::Word(WordCharType::Other),
 
+        c if unic_emoji_char::is_emoji(c) => CharType::Emoji,
+
         _ => CharType::NonWord(NonWordCharType::Other),
     }
 }
@@ -269,6 +277,8 @@ enum CharGroupType {
     /// class, CDM is compatible with non-汉字 word, non-汉字 word is compatible
     /// with 汉字 word, but CDM is *not* compatible with 汉字 word.
     CombiningDiacriticalMark,
+    /// A sequence of [`CharType::Emoji`].
+    Emoji,
 }
 
 /// Word character group types.
@@ -323,6 +333,7 @@ impl From<Char> for CharGroup {
                 CharType::NonWord(NonWordCharType::Other) => {
                     CharGroupType::NonWord(NonWordCharGroupType::Other)
                 }
+                CharType::Emoji => CharGroupType::Emoji,
             },
         }
     }
@@ -421,25 +432,38 @@ impl CharGroup {
                 do_push(self, c)
             }
 
+            (G::Emoji, Emoji) | (G::Emoji, CombiningDiacriticalMark) => {
+                do_push(self, c)
+            }
+
             // === Not compatible cases ===
             (G::CombiningDiacriticalMark, Word(W::Hanzi))
             | (G::CombiningDiacriticalMark, NonWord(_))
-            | (G::CombiningDiacriticalMark, Space) => Err(vec![c.into()]),
+            | (G::CombiningDiacriticalMark, Space)
+            | (G::CombiningDiacriticalMark, Emoji) => Err(vec![c.into()]),
 
             // We never need to insert implicit space after a space.
-            (G::Space, Word(_)) | (G::Space, NonWord(_)) => Err(vec![c.into()]),
+            (G::Space, Word(_))
+            | (G::Space, NonWord(_))
+            | (G::Space, Emoji) => Err(vec![c.into()]),
 
-            (G::Word(_), Space) => Err(vec![c.into()]),
-            (G::Word(_), NonWord(_)) => Err(vec![c.into()]),
+            (G::Word(_), Space)
+            | (G::Word(_), NonWord(_))
+            | (G::Word(_), Emoji) => Err(vec![c.into()]),
 
-            (G::NonWord(_), Space) => Err(vec![c.into()]),
             (G::NonWord(NG::RightPuncEnding), Word(_)) => {
                 let c = CharGroup::from(c);
                 let ispace =
                     CharGroup::new_implicit_whitespace(c.col.start_byte_index);
                 Err(vec![ispace, c])
             }
-            (G::NonWord(_), Word(_)) => Err(vec![c.into()]),
+            (G::NonWord(_), Space)
+            | (G::NonWord(_), Word(_))
+            | (G::NonWord(_), Emoji) => Err(vec![c.into()]),
+
+            (G::Emoji, Space)
+            | (G::Emoji, Word(_))
+            | (G::Emoji, NonWord(_)) => Err(vec![c.into()]),
         }
     }
 
@@ -942,6 +966,12 @@ mod tests {
             CharType::NonWord(NonWordCharType::Other)
         ));
         assert!(matches!(categorize_char('\u{3000}'), CharType::Space));
+        assert!(matches!(categorize_char('\u{1f596}'), CharType::Emoji));
+        assert!(matches!(categorize_char('\u{1f3ff}'), CharType::Emoji));
+        assert!(matches!(
+            categorize_char('\u{200d}'),
+            CharType::NonWord(NonWordCharType::Other)
+        ));
     }
 
     #[test]
@@ -1453,4 +1483,81 @@ mod tests {
             parse_str_test("\u{0302}\u{0302}\u{0302}\u{0302}\u{0302}", false);
         assert_eq!(tokens, vec![test_macros::token!(0, 0, 10, Word)]);
     }
+
+    #[test]
+    fn test_emoji_zwj_word() {
+        // i.e. "👨‍👩‍👧‍👦".
+        let tokens = parse_str_test(
+            "\u{1f468}\u{200d}\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}",
+            true,
+        );
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 0, 4, Word),
+                test_macros::token!(4, 4, 7, Word),
+                test_macros::token!(7, 7, 11, Word),
+                test_macros::token!(11, 11, 14, Word),
+                test_macros::token!(14, 14, 18, Word),
+                test_macros::token!(18, 18, 21, Word),
+                test_macros::token!(21, 21, 25, Word),
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_emoji_zwj_WORD() {
+        // i.e. "👨‍👩‍👧‍👦".
+        let tokens = parse_str_test(
+            "\u{1f468}\u{200d}\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}",
+            false,
+        );
+        assert_eq!(tokens, vec![test_macros::token!(0, 21, 25, Word)]);
+    }
+
+    #[test]
+    fn test_emoji_surrounded_by_nonword_word() {
+        // i.e. ".🖖,,abc"
+        let tokens = parse_str_test(".\u{1f596},,abc", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 0, 1, Word),  // "."
+                test_macros::token!(1, 1, 5, Word),  // "🖖"
+                test_macros::token!(5, 6, 7, Word),  // ",,"
+                test_macros::token!(7, 9, 10, Word), // "abc"
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_emoji_surrounded_by_nonword_WORD() {
+        // i.e. ".🖖,,abc"
+        let tokens = parse_str_test(".\u{1f596},,abc", false);
+        assert_eq!(tokens, vec![test_macros::token!(0, 9, 10, Word)]);
+    }
+
+    #[test]
+    fn test_emoji_surrounded_by_hanzi_word() {
+        // i.e. "你好🖖世界".
+        let tokens = parse_str_test("你好\u{1f596}世界", true);
+        assert_eq!(
+            tokens,
+            vec![
+                test_macros::token!(0, 3, 6, Word),  // "你好"
+                test_macros::token!(6, 6, 10, Word), // "🖖"
+                test_macros::token!(10, 13, 16, Word), // "世界"
+            ]
+        );
+    }
+
+    #[test]
+    #[allow(non_snake_case)]
+    fn test_emoji_surrounded_by_hanzi_WORD() {
+        // i.e. "你好🖖世界".
+        let tokens = parse_str_test("你好\u{1f596}世界", false);
+        assert_eq!(tokens, vec![test_macros::token!(0, 13, 16, Word)]);
+    }
 }

From 699d9ba881f0695b8bbad6d42bdf5bb1c66e9e17 Mon Sep 17 00:00:00 2001
From: kkew3 <kps6326@hotmail.com>
Date: Mon, 30 Dec 2024 15:24:45 +0800
Subject: [PATCH 6/6] Bump the version of `jieba_vim_rs_core`

---
 pythonx/jieba_vim_rs_core/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythonx/jieba_vim_rs_core/Cargo.toml b/pythonx/jieba_vim_rs_core/Cargo.toml
index 0286980..a55dd11 100644
--- a/pythonx/jieba_vim_rs_core/Cargo.toml
+++ b/pythonx/jieba_vim_rs_core/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "jieba_vim_rs_core"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2021"
 
 [dependencies]