Skip to content

Commit 2d0afdf

Browse files
MarkShawn2020 and claude committed
fix(search): 修复中文搜索无法匹配的问题
- 实现 JiebaTokenizer 自定义分词器支持中文分词
- 为 content 和 session_summary 字段配置 jieba 分词器
- 在索引构建和搜索时注册分词器

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 6ac666f commit 2d0afdf

1 file changed

Lines changed: 88 additions & 2 deletions

File tree

src-tauri/src/lib.rs

Lines changed: 88 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,69 @@ use tauri::{Emitter, Manager};
88
use tantivy::collector::TopDocs;
99
use tantivy::query::QueryParser;
1010
use tantivy::schema::{self, Value as TantivyValue, *};
11+
use tantivy::tokenizer::{LowerCaser, TextAnalyzer, Token, TokenStream, Tokenizer};
1112
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
13+
use jieba_rs::Jieba;
14+
use std::sync::LazyLock;
15+
16+
// Global jieba instance for Chinese tokenization
17+
static JIEBA: LazyLock<Jieba> = LazyLock::new(|| Jieba::new());
18+
19+
// Custom tokenizer for Chinese + English mixed content
20+
#[derive(Clone)]
21+
struct JiebaTokenizer;
22+
23+
impl Tokenizer for JiebaTokenizer {
24+
type TokenStream<'a> = JiebaTokenStream;
25+
26+
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
27+
let words = JIEBA.cut(text, true);
28+
let mut tokens = Vec::new();
29+
let mut offset = 0;
30+
31+
for word in words {
32+
let word_str = word.trim();
33+
if !word_str.is_empty() {
34+
let start = text[offset..].find(word).map(|i| offset + i).unwrap_or(offset);
35+
let end = start + word.len();
36+
tokens.push(Token {
37+
offset_from: start,
38+
offset_to: end,
39+
position: tokens.len(),
40+
text: word_str.to_string(),
41+
position_length: 1,
42+
});
43+
offset = end;
44+
}
45+
}
46+
47+
JiebaTokenStream { tokens, index: 0 }
48+
}
49+
}
50+
51+
struct JiebaTokenStream {
52+
tokens: Vec<Token>,
53+
index: usize,
54+
}
55+
56+
impl TokenStream for JiebaTokenStream {
57+
fn advance(&mut self) -> bool {
58+
if self.index < self.tokens.len() {
59+
self.index += 1;
60+
true
61+
} else {
62+
false
63+
}
64+
}
65+
66+
fn token(&self) -> &Token {
67+
&self.tokens[self.index - 1]
68+
}
69+
70+
fn token_mut(&mut self) -> &mut Token {
71+
&mut self.tokens[self.index - 1]
72+
}
73+
}
1274

1375
// Global search index state
1476
// Starts out as `None`. NOTE(review): presumably populated with the opened
// index and its schema the first time a search needs it — confirm against callers.
static SEARCH_INDEX: Mutex<Option<SearchIndex>> = Mutex::new(None);
@@ -25,19 +87,38 @@ fn get_index_dir() -> PathBuf {
2587
.join("search-index")
2688
}
2789

90+
/// Name under which the jieba analyzer is registered on an index and by which
/// the schema's text fields refer to it.
const JIEBA_TOKENIZER_NAME: &str = "jieba";
91+
2892
fn create_schema() -> Schema {
2993
let mut schema_builder = Schema::builder();
94+
95+
// Use custom jieba tokenizer for content fields to support Chinese
96+
let text_options = TextOptions::default()
97+
.set_indexing_options(
98+
TextFieldIndexing::default()
99+
.set_tokenizer(JIEBA_TOKENIZER_NAME)
100+
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions)
101+
)
102+
.set_stored();
103+
30104
schema_builder.add_text_field("uuid", STRING | STORED);
31-
schema_builder.add_text_field("content", TEXT | STORED);
105+
schema_builder.add_text_field("content", text_options.clone());
32106
schema_builder.add_text_field("role", STRING | STORED);
33107
schema_builder.add_text_field("project_id", STRING | STORED);
34108
schema_builder.add_text_field("project_path", STRING | STORED);
35109
schema_builder.add_text_field("session_id", STRING | STORED);
36-
schema_builder.add_text_field("session_summary", TEXT | STORED);
110+
schema_builder.add_text_field("session_summary", text_options);
37111
schema_builder.add_text_field("timestamp", STRING | STORED);
38112
schema_builder.build()
39113
}
40114

115+
fn register_jieba_tokenizer(index: &Index) {
116+
let tokenizer = TextAnalyzer::builder(JiebaTokenizer)
117+
.filter(LowerCaser)
118+
.build();
119+
index.tokenizers().register(JIEBA_TOKENIZER_NAME, tokenizer);
120+
}
121+
41122
#[derive(Debug, Serialize, Deserialize)]
42123
pub struct Project {
43124
pub id: String,
@@ -551,6 +632,9 @@ fn build_search_index() -> Result<usize, String> {
551632
let schema = create_schema();
552633
let index = Index::create_in_dir(&index_dir, schema.clone()).map_err(|e| e.to_string())?;
553634

635+
// Register jieba tokenizer for Chinese support
636+
register_jieba_tokenizer(&index);
637+
554638
let mut index_writer: IndexWriter = index
555639
.writer(50_000_000) // 50MB heap
556640
.map_err(|e| e.to_string())?;
@@ -660,6 +744,8 @@ fn search_chats(query: String, limit: Option<usize>) -> Result<Vec<SearchResult>
660744

661745
let schema = create_schema();
662746
let index = Index::open_in_dir(&index_dir).map_err(|e| e.to_string())?;
747+
// Register jieba tokenizer for Chinese support
748+
register_jieba_tokenizer(&index);
663749
*guard = Some(SearchIndex { index, schema });
664750
}
665751

0 commit comments

Comments
 (0)