Skip to content

Commit 9348b72

Browse files
authored
refactor: 重构search接口支持通过upload接口上传的“向量混合”查询并完成集成测试 (#3)
1 parent f1f024a commit 9348b72

File tree

11 files changed

+577
-117
lines changed

11 files changed

+577
-117
lines changed

app/config/settings.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ class ElasticsearchSettings(BaseModel):
3333
"""Elasticsearch 相关配置"""
3434

3535
url: str
36+
number_of_shards: int
37+
number_of_replicas: int
38+
index_max_result_window: int
39+
index_refresh_interval: str
40+
index_option_type: str
41+
index_option_m: int
42+
index_option_ef_construction: int
3643
metadata_index_suffix: str
3744
chunk_index_suffix: str
3845
request_timeout: int = 15
@@ -44,7 +51,6 @@ class EmbedderSettings(BaseModel):
4451
model_name: str
4552
dimensions: int
4653
similarity_metric: str
47-
index_type: str
4854

4955

5056
class RerankerSettings(BaseModel):
@@ -82,9 +88,16 @@ class RetrievalSettings(BaseModel):
8288

8389
multiplier: int = Field(5, description="召回倍数配置")
8490
vector_weight: float = Field(2.0, description="向量搜索权重")
91+
vector_similarity: float = Field(0.7, description="相似度")
8592
text_weight: float = Field(1.0, description="文本搜索权重")
8693

8794

95+
class SearchSettings(BaseModel):
96+
"""搜索相关配置"""
97+
98+
max_top_k: int = Field(50, description="最大top_k值限制")
99+
100+
88101
class TencentOssSettings(BaseModel):
89102
"""
90103
腾讯云对象存储相关配置。
@@ -114,6 +127,7 @@ class Settings(BaseSettings):
114127
storage: StorageSettings
115128
upload: UploadSettings
116129
retrieval: RetrievalSettings
130+
search: SearchSettings
117131

118132
@property
119133
def cos_config(self) -> CosConfig:

app/service/elasticsearch.py

Lines changed: 36 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,11 @@ def _ensure_metadata_index_exists(self, metadata_index: str) -> None:
9797
if not self._client.indices.exists(index=metadata_index):
9898
body = {
9999
"settings": {
100-
"number_of_shards": 1,
101-
"number_of_replicas": 0,
100+
"number_of_shards": self._settings.elasticsearch.number_of_shards,
101+
"number_of_replicas": self._settings.elasticsearch.number_of_replicas,
102102
"index": {
103-
"max_result_window": 10000,
104-
"refresh_interval": "1s",
103+
"max_result_window": self._settings.elasticsearch.index_max_result_window,
104+
"refresh_interval": self._settings.elasticsearch.index_refresh_interval,
105105
},
106106
},
107107
"mappings": {
@@ -138,11 +138,11 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
138138
if not self._client.indices.exists(index=chunk_index):
139139
body = {
140140
"settings": {
141-
"number_of_shards": 1,
142-
"number_of_replicas": 0,
141+
"number_of_shards": self._settings.elasticsearch.number_of_shards,
142+
"number_of_replicas": self._settings.elasticsearch.number_of_replicas,
143143
"index": {
144-
"max_result_window": 10000,
145-
"refresh_interval": "1s",
144+
"max_result_window": self._settings.elasticsearch.index_max_result_window,
145+
"refresh_interval": self._settings.elasticsearch.index_refresh_interval,
146146
},
147147
},
148148
"mappings": {
@@ -159,9 +159,9 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
159159
"similarity": self._embedder.similarity_metric,
160160
"index": True,
161161
"index_options": {
162-
"type": self._settings.embedder.index_type,
163-
"m": 32,
164-
"ef_construction": 100,
162+
"type": self._settings.elasticsearch.index_option_type,
163+
"m": self._settings.elasticsearch.index_option_m,
164+
"ef_construction": self._settings.elasticsearch.index_option_ef_construction,
165165
},
166166
},
167167
"chunk_index": {"type": "integer"},
@@ -207,7 +207,9 @@ def store_for_vector_hybrid_search(self, document: Document) -> str:
207207
metadata_index, chunk_index = self._ensure_indexes_exist(
208208
document.index_prefix
209209
)
210-
210+
logger.info(
211+
f"向量混合搜索: 元数据索引名={metadata_index} 分片索引名={chunk_index}"
212+
)
211213
metadata_id = self._create_metadata(metadata_index, document)
212214
document.id = metadata_id # 确保 document 对象持有 ID
213215
logger.info(f"元数据占位符创建成功,ID: {metadata_id}")
@@ -359,10 +361,11 @@ def search(self, parameters: SearchParameters) -> SearchResult:
359361
)
360362

361363
# 执行ES搜索
364+
logger.info(f"在 {parameters.index_name} 上执行查询: {search_body}")
362365
response = self._client.search(
363366
index=parameters.index_name, body=search_body
364367
)
365-
368+
logger.info(f"查询结果: {response}")
366369
# 计算搜索耗时
367370
search_time_ms = int((time.time() - start_time) * 1000)
368371

@@ -416,57 +419,57 @@ def _build_hybrid_search_body(
416419
ES查询体
417420
"""
418421
# 获取文本查询进行向量化
419-
text_query: str | None = None
420-
for condition in search_conditions["vector"]:
421-
if isinstance(condition.value, str):
422-
text_query = condition.value
423-
if not text_query:
424-
raise ValueError("向量混合搜索需要文本查询内容")
422+
text_query = cast("str", search_conditions["vector"][0].value)
425423

426424
# 生成查询向量
427425
query_vector = self._embedder.embed_documents([text_query])[0]
428426

429427
# 计算召回数量(用于后续重排序)
430-
retrieval_size = parameters.limit * self._settings.retrieval.multiplier
428+
k = parameters.limit * self._settings.retrieval.multiplier
429+
vector_similarity = self._settings.retrieval.vector_similarity
431430

432431
# 获取权重配置
433432
vector_weight = self._settings.retrieval.vector_weight
434433
text_weight = self._settings.retrieval.text_weight
435434

435+
# 确保 num_candidates 至少为 k 的 2 倍或 100,取较大值
436+
num_candidates = max(k * 2, 100)
437+
436438
# 构建混合搜索查询体
437439
search_body: dict[str, Any] = {
438-
"size": retrieval_size,
440+
"size": parameters.limit,
439441
"_source": ["content", "file_metadata_id"], # 只返回需要的字段
440442
"knn": {
441443
"field": "content_vector", # 固定向量字段
442444
"query_vector": query_vector,
443-
"k": retrieval_size,
444-
"num_candidates": 100,
445+
"k": k,
446+
"num_candidates": num_candidates,
445447
"boost": vector_weight,
448+
"similarity": vector_similarity,
446449
},
447450
"query": {
448451
"bool": {
449-
"should": [
450-
# 普通匹配
452+
"must": [
451453
{
452454
"match": {
453455
"content": {
454456
"query": text_query,
455-
"boost": text_weight * 0.5,
457+
"boost": text_weight * 0.7, # 基础匹配权重
456458
}
457459
}
458-
},
459-
# 短语匹配
460+
}
461+
],
462+
"should": [
460463
{
461464
"match_phrase": {
462465
"content": {
463466
"query": text_query,
464-
"boost": text_weight * 0.3,
467+
"boost": text_weight * 0.3, # 短语匹配加分
465468
}
466469
}
467-
},
470+
}
468471
],
469-
"minimum_should_match": 0,
472+
"minimum_should_match": 0, # should是纯加分项
470473
}
471474
},
472475
}
@@ -554,7 +557,7 @@ def _convert_to_search_result(
554557
# 根据搜索类型处理结果
555558
if is_hybrid_search:
556559
documents = self._process_hybrid_search_results(
557-
cast("str", search_conditions["vector"][0].value), hits, limit
560+
cast("str", search_conditions["vector"][0].value), hits
558561
)
559562
else:
560563
documents = self._process_structured_search_results(hits)
@@ -569,7 +572,6 @@ def _process_hybrid_search_results(
569572
self,
570573
text_query: str,
571574
hits: list[dict[str, Any]],
572-
limit: int,
573575
) -> list[DocumentResult]:
574576
"""
575577
处理混合搜索结果:去重 + 重排序
@@ -603,7 +605,7 @@ def _process_hybrid_search_results(
603605
unique_chunks.append(chunk)
604606

605607
# 重排
606-
return self._reranker.rerank(text_query, unique_chunks)[:limit]
608+
return self._reranker.rerank(text_query, unique_chunks)
607609

608610
@staticmethod
609611
def _process_structured_search_results(

app/utils/converters/search.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,11 @@ def request_vo_to_domain(request: SearchRequest) -> SearchParameters:
5858
conditions = [
5959
SearchCondition(
6060
field_name=cond.field,
61-
mode=SearchMode.TERM
62-
if cond.op == ConditionOperator.TERM
63-
else SearchMode.MATCH,
61+
mode=(
62+
SearchMode.TERM
63+
if cond.op == ConditionOperator.TERM
64+
else SearchMode.MATCH
65+
),
6466
value=cond.value,
6567
)
6668
for cond in request.query.conditions
@@ -83,7 +85,7 @@ def result_domain_to_vo(
8385
if search_type == SearchType.VECTOR_HYBRID:
8486
results = [
8587
VectorHybridSearchResult(
86-
text=doc.content.get("text", ""),
88+
text=doc.content.get("content", ""),
8789
file_metadata_id=doc.content.get("file_metadata_id", ""),
8890
score=doc.score,
8991
)
@@ -100,4 +102,4 @@ def result_domain_to_vo(
100102
if doc.id
101103
]
102104

103-
return SearchResponse(type=search_type, results=results)
105+
return SearchResponse(results=results)

app/web/document.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pathlib import Path
2020
from urllib.parse import urlparse
2121

22+
from elasticsearch import NotFoundError
2223
from fastapi import (
2324
APIRouter,
2425
BackgroundTasks,
@@ -426,6 +427,10 @@ async def search(self, request: SearchRequest) -> SearchResponse:
426427
f"✅ 搜索完成, 返回{len(domain_response.documents)}条结果"
427428
)
428429
return resp
430+
except NotFoundError as e:
431+
raise HTTPException(
432+
status_code=404, detail=f"索引 {request.query.index} 不存在"
433+
) from e
429434
except Exception as e:
430435
logger.error(f"❌ 搜索失败: {e}", exc_info=True)
431436
raise HTTPException(status_code=500, detail="搜索处理失败") from e

app/web/vo.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
from pydantic import BaseModel, Field, HttpUrl, Json, field_validator
2121
from pydantic_core.core_schema import ValidationInfo
2222

23+
from app.config.settings import settings
24+
2325

2426
class FileUploadResponse(BaseModel):
2527
"""文件上传后的标准响应模型"""
@@ -75,6 +77,16 @@ class Condition(BaseModel):
7577
..., description="字段值,支持多种类型"
7678
)
7779

80+
@field_validator("value")
81+
@classmethod
82+
def validate_value_not_empty_string(
83+
cls, v: str | int | float | bool
84+
) -> str | int | float | bool:
85+
"""验证字符串值不能为空"""
86+
if isinstance(v, str) and v.strip() == "":
87+
raise ValueError("字符串类型的查询值不能为空")
88+
return v
89+
7890

7991
class Query(BaseModel):
8092
"""查询对象"""
@@ -93,13 +105,16 @@ class SearchRequest(BaseModel):
93105

94106
type: SearchType = Field(..., description="搜索类型")
95107
query: Query = Field(..., description="查询条件")
96-
top_k: int = Field(..., ge=1, description="返回结果数量,至少为1")
108+
top_k: int = Field(
109+
...,
110+
ge=1,
111+
le=settings.search.max_top_k,
112+
description="返回结果数量 1 <= top_k <= 配置文件中的max_top_k",
113+
)
97114

98115
@field_validator("query")
99116
@classmethod
100-
def validate_query_for_search_type(
101-
cls, v: Query, info: ValidationInfo
102-
) -> Query:
117+
def validate_query(cls, v: Query, info: ValidationInfo) -> Query:
103118
"""根据搜索类型验证查询条件"""
104119
search_type = info.data.get("type")
105120

@@ -134,7 +149,6 @@ class StructuredSearchResult(BaseModel):
134149
class SearchResponse(BaseModel):
135150
"""搜索响应"""
136151

137-
type: SearchType = Field(..., description="搜索类型") # 保持一致性
138152
results: list[VectorHybridSearchResult | StructuredSearchResult] = Field(
139153
default_factory=list, description="搜索结果"
140154
)

config.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
elasticsearch:
22
url: "http://localhost:9200"
3+
number_of_shards: 1
4+
number_of_replicas: 0
5+
index_max_result_window: 10000
6+
index_refresh_interval: 1s
7+
index_option_type: "int8_hnsw"
8+
index_option_m: 32 # 控制HNSW图中每个节点可以连接的最大邻居节点数量
9+
index_option_ef_construction: 100 # 索引构建时每个节点考虑的候选邻居数量,影响索引质量。
310
metadata_index_suffix: "_metadatas"
411
chunk_index_suffix: "_chunks"
512
request_timeout: 60
@@ -26,7 +33,12 @@ upload:
2633
- ".pdf"
2734
- ".md"
2835
- ".txt"
36+
2937
retrieval:
3038
multiplier: 5 # 召回倍数配置
3139
vector_weight: 2.0 # 向量搜索权重
32-
text_weight: 1.0 # 文本搜索权重
40+
vector_similarity: 0.1 # 向量搜索相似度阈值
41+
text_weight: 1.0 # 文本搜索权重
42+
43+
search:
44+
max_top_k: 50 # 最大top_k值限制

0 commit comments

Comments
 (0)