@@ -97,11 +97,11 @@ def _ensure_metadata_index_exists(self, metadata_index: str) -> None:
9797 if not self ._client .indices .exists (index = metadata_index ):
9898 body = {
9999 "settings" : {
100- "number_of_shards" : 1 ,
101- "number_of_replicas" : 0 ,
100+ "number_of_shards" : self . _settings . elasticsearch . number_of_shards ,
101+ "number_of_replicas" : self . _settings . elasticsearch . number_of_replicas ,
102102 "index" : {
103- "max_result_window" : 10000 ,
104- "refresh_interval" : "1s" ,
103+ "max_result_window" : self . _settings . elasticsearch . index_max_result_window ,
104+ "refresh_interval" : self . _settings . elasticsearch . index_refresh_interval ,
105105 },
106106 },
107107 "mappings" : {
@@ -138,11 +138,11 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
138138 if not self ._client .indices .exists (index = chunk_index ):
139139 body = {
140140 "settings" : {
141- "number_of_shards" : 1 ,
142- "number_of_replicas" : 0 ,
141+ "number_of_shards" : self . _settings . elasticsearch . number_of_shards ,
142+ "number_of_replicas" : self . _settings . elasticsearch . number_of_replicas ,
143143 "index" : {
144- "max_result_window" : 10000 ,
145- "refresh_interval" : "1s" ,
144+ "max_result_window" : self . _settings . elasticsearch . index_max_result_window ,
145+ "refresh_interval" : self . _settings . elasticsearch . index_refresh_interval ,
146146 },
147147 },
148148 "mappings" : {
@@ -159,9 +159,9 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
159159 "similarity" : self ._embedder .similarity_metric ,
160160 "index" : True ,
161161 "index_options" : {
162- "type" : self ._settings .embedder . index_type ,
163- "m" : 32 ,
164- "ef_construction" : 100 ,
162+ "type" : self ._settings .elasticsearch . index_option_type ,
163+ "m" : self . _settings . elasticsearch . index_option_m ,
164+ "ef_construction" : self . _settings . elasticsearch . index_option_ef_construction ,
165165 },
166166 },
167167 "chunk_index" : {"type" : "integer" },
@@ -207,7 +207,9 @@ def store_for_vector_hybrid_search(self, document: Document) -> str:
207207 metadata_index , chunk_index = self ._ensure_indexes_exist (
208208 document .index_prefix
209209 )
210-
210+ logger .info (
211+ f"向量混合搜索: 元数据索引名={ metadata_index } 分片索引名={ chunk_index } "
212+ )
211213 metadata_id = self ._create_metadata (metadata_index , document )
212214 document .id = metadata_id # 确保 document 对象持有 ID
213215 logger .info (f"元数据占位符创建成功,ID: { metadata_id } " )
@@ -359,10 +361,11 @@ def search(self, parameters: SearchParameters) -> SearchResult:
359361 )
360362
361363 # 执行ES搜索
364+ logger .info (f"在 { parameters .index_name } 上执行查询: { search_body } " )
362365 response = self ._client .search (
363366 index = parameters .index_name , body = search_body
364367 )
365-
368+ logger . info ( f"查询结果: { response } " )
366369 # 计算搜索耗时
367370 search_time_ms = int ((time .time () - start_time ) * 1000 )
368371
@@ -416,57 +419,57 @@ def _build_hybrid_search_body(
416419 ES查询体
417420 """
418421 # 获取文本查询进行向量化
419- text_query : str | None = None
420- for condition in search_conditions ["vector" ]:
421- if isinstance (condition .value , str ):
422- text_query = condition .value
423- if not text_query :
424- raise ValueError ("向量混合搜索需要文本查询内容" )
422+ text_query = cast ("str" , search_conditions ["vector" ][0 ].value )
425423
426424 # 生成查询向量
427425 query_vector = self ._embedder .embed_documents ([text_query ])[0 ]
428426
429427 # 计算召回数量(用于后续重排序)
430- retrieval_size = parameters .limit * self ._settings .retrieval .multiplier
428+ k = parameters .limit * self ._settings .retrieval .multiplier
429+ vector_similarity = self ._settings .retrieval .vector_similarity
431430
432431 # 获取权重配置
433432 vector_weight = self ._settings .retrieval .vector_weight
434433 text_weight = self ._settings .retrieval .text_weight
435434
435+ # # 确保 num_candidates 至少为 k 的 2 倍或 100,取较大值
436+ num_candidates = max (k * 2 , 100 )
437+
436438 # 构建混合搜索查询体
437439 search_body : dict [str , Any ] = {
438- "size" : retrieval_size ,
440+ "size" : parameters . limit ,
439441 "_source" : ["content" , "file_metadata_id" ], # 只返回需要的字段
440442 "knn" : {
441443 "field" : "content_vector" , # 固定向量字段
442444 "query_vector" : query_vector ,
443- "k" : retrieval_size ,
444- "num_candidates" : 100 ,
445+ "k" : k ,
446+ "num_candidates" : num_candidates ,
445447 "boost" : vector_weight ,
448+ "similarity" : vector_similarity ,
446449 },
447450 "query" : {
448451 "bool" : {
449- "should" : [
450- # 普通匹配
452+ "must" : [
451453 {
452454 "match" : {
453455 "content" : {
454456 "query" : text_query ,
455- "boost" : text_weight * 0.5 ,
457+ "boost" : text_weight * 0.7 , # 基础匹配权重
456458 }
457459 }
458- },
459- # 短语匹配
460+ }
461+ ],
462+ "should" : [
460463 {
461464 "match_phrase" : {
462465 "content" : {
463466 "query" : text_query ,
464- "boost" : text_weight * 0.3 ,
467+ "boost" : text_weight * 0.3 , # 短语匹配加分
465468 }
466469 }
467- },
470+ }
468471 ],
469- "minimum_should_match" : 0 ,
472+ "minimum_should_match" : 0 , # should是纯加分项
470473 }
471474 },
472475 }
@@ -554,7 +557,7 @@ def _convert_to_search_result(
554557 # 根据搜索类型处理结果
555558 if is_hybrid_search :
556559 documents = self ._process_hybrid_search_results (
557- cast ("str" , search_conditions ["vector" ][0 ].value ), hits , limit
560+ cast ("str" , search_conditions ["vector" ][0 ].value ), hits
558561 )
559562 else :
560563 documents = self ._process_structured_search_results (hits )
@@ -569,7 +572,6 @@ def _process_hybrid_search_results(
569572 self ,
570573 text_query : str ,
571574 hits : list [dict [str , Any ]],
572- limit : int ,
573575 ) -> list [DocumentResult ]:
574576 """
575577 处理混合搜索结果:去重 + 重排序
@@ -603,7 +605,7 @@ def _process_hybrid_search_results(
603605 unique_chunks .append (chunk )
604606
605607 # 重排
606- return self ._reranker .rerank (text_query , unique_chunks )[: limit ]
608+ return self ._reranker .rerank (text_query , unique_chunks )
607609
608610 @staticmethod
609611 def _process_structured_search_results (
0 commit comments