@@ -8,7 +8,69 @@ use tauri::{Emitter, Manager};
88use tantivy:: collector:: TopDocs ;
99use tantivy:: query:: QueryParser ;
1010use tantivy:: schema:: { self , Value as TantivyValue , * } ;
11+ use tantivy:: tokenizer:: { LowerCaser , TextAnalyzer , Token , TokenStream , Tokenizer } ;
1112use tantivy:: { doc, Index , IndexWriter , ReloadPolicy } ;
13+ use jieba_rs:: Jieba ;
14+ use std:: sync:: LazyLock ;
15+
16+ // Global jieba instance for Chinese tokenization
17+ static JIEBA : LazyLock < Jieba > = LazyLock :: new ( || Jieba :: new ( ) ) ;
18+
/// Tantivy tokenizer for mixed Chinese + English content.
///
/// Stateless: segmentation is delegated to the global `JIEBA` instance, so
/// cloning this type is free.
#[derive(Clone)]
struct JiebaTokenizer;
22+
23+ impl Tokenizer for JiebaTokenizer {
24+ type TokenStream < ' a > = JiebaTokenStream ;
25+
26+ fn token_stream < ' a > ( & ' a mut self , text : & ' a str ) -> Self :: TokenStream < ' a > {
27+ let words = JIEBA . cut ( text, true ) ;
28+ let mut tokens = Vec :: new ( ) ;
29+ let mut offset = 0 ;
30+
31+ for word in words {
32+ let word_str = word. trim ( ) ;
33+ if !word_str. is_empty ( ) {
34+ let start = text[ offset..] . find ( word) . map ( |i| offset + i) . unwrap_or ( offset) ;
35+ let end = start + word. len ( ) ;
36+ tokens. push ( Token {
37+ offset_from : start,
38+ offset_to : end,
39+ position : tokens. len ( ) ,
40+ text : word_str. to_string ( ) ,
41+ position_length : 1 ,
42+ } ) ;
43+ offset = end;
44+ }
45+ }
46+
47+ JiebaTokenStream { tokens, index : 0 }
48+ }
49+ }
50+
51+ struct JiebaTokenStream {
52+ tokens : Vec < Token > ,
53+ index : usize ,
54+ }
55+
56+ impl TokenStream for JiebaTokenStream {
57+ fn advance ( & mut self ) -> bool {
58+ if self . index < self . tokens . len ( ) {
59+ self . index += 1 ;
60+ true
61+ } else {
62+ false
63+ }
64+ }
65+
66+ fn token ( & self ) -> & Token {
67+ & self . tokens [ self . index - 1 ]
68+ }
69+
70+ fn token_mut ( & mut self ) -> & mut Token {
71+ & mut self . tokens [ self . index - 1 ]
72+ }
73+ }
1274
1375// Global search index state
1476static SEARCH_INDEX : Mutex < Option < SearchIndex > > = Mutex :: new ( None ) ;
@@ -25,19 +87,38 @@ fn get_index_dir() -> PathBuf {
2587 . join ( "search-index" )
2688}
2789
/// Name under which the jieba analyzer is registered on an index and by
/// which the schema's text fields refer to it.
const JIEBA_TOKENIZER_NAME: &str = "jieba";
91+
2892fn create_schema ( ) -> Schema {
2993 let mut schema_builder = Schema :: builder ( ) ;
94+
95+ // Use custom jieba tokenizer for content fields to support Chinese
96+ let text_options = TextOptions :: default ( )
97+ . set_indexing_options (
98+ TextFieldIndexing :: default ( )
99+ . set_tokenizer ( JIEBA_TOKENIZER_NAME )
100+ . set_index_option ( schema:: IndexRecordOption :: WithFreqsAndPositions )
101+ )
102+ . set_stored ( ) ;
103+
30104 schema_builder. add_text_field ( "uuid" , STRING | STORED ) ;
31- schema_builder. add_text_field ( "content" , TEXT | STORED ) ;
105+ schema_builder. add_text_field ( "content" , text_options . clone ( ) ) ;
32106 schema_builder. add_text_field ( "role" , STRING | STORED ) ;
33107 schema_builder. add_text_field ( "project_id" , STRING | STORED ) ;
34108 schema_builder. add_text_field ( "project_path" , STRING | STORED ) ;
35109 schema_builder. add_text_field ( "session_id" , STRING | STORED ) ;
36- schema_builder. add_text_field ( "session_summary" , TEXT | STORED ) ;
110+ schema_builder. add_text_field ( "session_summary" , text_options ) ;
37111 schema_builder. add_text_field ( "timestamp" , STRING | STORED ) ;
38112 schema_builder. build ( )
39113}
40114
115+ fn register_jieba_tokenizer ( index : & Index ) {
116+ let tokenizer = TextAnalyzer :: builder ( JiebaTokenizer )
117+ . filter ( LowerCaser )
118+ . build ( ) ;
119+ index. tokenizers ( ) . register ( JIEBA_TOKENIZER_NAME , tokenizer) ;
120+ }
121+
41122#[ derive( Debug , Serialize , Deserialize ) ]
42123pub struct Project {
43124 pub id : String ,
@@ -551,6 +632,9 @@ fn build_search_index() -> Result<usize, String> {
551632 let schema = create_schema ( ) ;
552633 let index = Index :: create_in_dir ( & index_dir, schema. clone ( ) ) . map_err ( |e| e. to_string ( ) ) ?;
553634
635+ // Register jieba tokenizer for Chinese support
636+ register_jieba_tokenizer ( & index) ;
637+
554638 let mut index_writer: IndexWriter = index
555639 . writer ( 50_000_000 ) // 50MB heap
556640 . map_err ( |e| e. to_string ( ) ) ?;
@@ -660,6 +744,8 @@ fn search_chats(query: String, limit: Option<usize>) -> Result<Vec<SearchResult>
660744
661745 let schema = create_schema ( ) ;
662746 let index = Index :: open_in_dir ( & index_dir) . map_err ( |e| e. to_string ( ) ) ?;
747+ // Register jieba tokenizer for Chinese support
748+ register_jieba_tokenizer ( & index) ;
663749 * guard = Some ( SearchIndex { index, schema } ) ;
664750 }
665751
0 commit comments