diff --git a/dbgpt/storage/knowledge_graph/community/base.py b/dbgpt/storage/knowledge_graph/community/base.py
index 63bc7a93c..9da8c9a14 100644
--- a/dbgpt/storage/knowledge_graph/community/base.py
+++ b/dbgpt/storage/knowledge_graph/community/base.py
@@ -189,7 +189,8 @@ def explore_trigraph(
         """Explore the graph from given subjects up to a depth.
 
         Args:
-            subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+            subs (Union[List[str], List[List[float]]]): The list of the subjects
+                (keywords or embedding vectors).
             topk (Optional[int]): The number of the top similar entities.
             score_threshold (Optional[float]): The threshold of the similarity score.
             direct (Direction): The direction of the graph that will be explored.
@@ -224,9 +225,9 @@ def explore_docgraph_with_entities(
             limit (Optional[int]): The limit number of the queried chunks.
 
         Returns:
-            MemoryGraph: The document graph that includes the leaf chunks that connect to the
-                entities, the chains from documents to the leaf chunks, and the chain
-                from documents to chunks.
+            MemoryGraph: The document graph that includes the leaf chunks that connect
+                to the entities, the chains from documents to the leaf chunks, and the
+                chain from documents to chunks.
         """
 
     @abstractmethod
@@ -243,7 +244,8 @@ def explore_docgraph_without_entities(
         """Explore the graph from given subjects up to a depth.
 
         Args:
-            subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+            subs (Union[List[str], List[List[float]]]): The list of the subjects
+                (keywords or embedding vectors).
             topk (Optional[int]): The number of the top similar chunks.
             score_threshold (Optional[float]): The threshold of the similarity score.
             direct (Direction): The direction of the graph that will be explored.
@@ -252,8 +254,9 @@
             limit (Optional[int]): The limit number of the queried chunks.
 
         Returns:
-            MemoryGraph: The document graph that includes the chains from documents to chunks
-                that contain the subs (keywords) or similar chunks (embedding vectors).
+            MemoryGraph: The document graph that includes the chains from documents
+                to chunks that contain the subs (keywords) or similar chunks
+                (embedding vectors).
""" @abstractmethod diff --git a/dbgpt/storage/knowledge_graph/community/community_store.py b/dbgpt/storage/knowledge_graph/community/community_store.py index 34a415c41..f403996f3 100644 --- a/dbgpt/storage/knowledge_graph/community/community_store.py +++ b/dbgpt/storage/knowledge_graph/community/community_store.py @@ -56,7 +56,9 @@ async def _summary_community(self, community_id: str) -> Optional[Community]: return None graph = community.data.format() - community.summary = await self._community_summarizer.summarize(graph=graph) + community.summary = ( + await self._community_summarizer.summarize(graph=graph) or "" + ) logger.info(f"Summarize community {community_id}: {community.summary[:50]}...") return community diff --git a/dbgpt/storage/knowledge_graph/community/memgraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/memgraph_store_adapter.py index 4be81a6b2..901f0bd72 100644 --- a/dbgpt/storage/knowledge_graph/community/memgraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/memgraph_store_adapter.py @@ -29,7 +29,7 @@ class MemGraphStoreAdapter(GraphStoreAdapter): def __init__(self, enable_summary: bool = False): """Initialize MemGraph Community Store Adapter.""" - self._graph_store = MemoryGraphStore(MemoryGraphStoreConfig()) + self._graph_store: MemoryGraphStore = MemoryGraphStore(MemoryGraphStoreConfig()) super().__init__(self._graph_store) @@ -38,7 +38,7 @@ def __init__(self, enable_summary: bool = False): async def discover_communities(self, **kwargs) -> List[str]: """Run community discovery with leiden.""" - [] + return [] async def get_community(self, community_id: str) -> Community: """Get community.""" @@ -196,7 +196,7 @@ def check_label(self, graph_elem_type: GraphElemType) -> bool: True if the label exists in the specified graph element type, otherwise False. 
""" - pass + raise NotImplementedError("Memory graph store does not have label") def explore( self, @@ -214,8 +214,8 @@ def explore( def query(self, query: str, **kwargs) -> MemoryGraph: """Execute a query on graph.""" - pass + raise NotImplementedError("Memory graph store does not support query") async def stream_query(self, query: str, **kwargs) -> AsyncGenerator[Graph, None]: """Execute a stream query.""" - pass + raise NotImplementedError("Memory graph store does not support stream query") diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index 6034f4c06..04e04454a 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -2,16 +2,7 @@ import json import logging -from typing import ( - Any, - AsyncGenerator, - Dict, - Iterator, - List, - Optional, - Tuple, - Union, -) +from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union from dbgpt.storage.graph_store.graph import ( Direction, @@ -172,7 +163,7 @@ def upsert_entities(self, entities: Iterator[Vertex]) -> None: # If not exist, then create vector index if self.query(check_entity_vector_query).vertex_count == 0: # Get the dimension - dimension = len(entity_list[0].get("_embedding")) + dimension = len(entity_list[0].get("_embedding", [])) # Then create index create_vector_index_query = ( "CALL db.addVertexVectorIndex(" @@ -246,12 +237,13 @@ def upsert_chunks(self, chunks: Iterator[Union[Vertex, ParagraphChunk]]) -> None # If not exist, then create vector index if self.query(check_chunk_vector_query).vertex_count == 0: # Get the dimension - dimension = len(chunk_list[0].get("_embedding")) + embedding = chunk_list[0].get("_embedding", []) + assert isinstance(embedding, list) # Then create index create_vector_index_query = ( "CALL db.addVertexVectorIndex(" f'"{GraphElemType.CHUNK.value}", "_embedding", ' - f"{{dimension: {dimension}}})" + f"{{dimension: {len(embedding)}}})" ) self.graph_store.conn.run(query=create_vector_index_query) @@ -524,12 +516,14 @@ def create_graph_label( (vertices) and edges in the graph. """ if graph_elem_type.is_vertex(): # vertex - vertex_meta = json.dumps({ - "label": graph_elem_type.value, - "type": "VERTEX", - "primary": "id", - "properties": graph_properties, - }) + vertex_meta = json.dumps( + { + "label": graph_elem_type.value, + "type": "VERTEX", + "primary": "id", + "properties": graph_properties, + } + ) gql = f"""CALL db.createVertexLabelByJson('{vertex_meta}')""" else: # edge @@ -555,12 +549,14 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]: else: raise ValueError("Invalid graph element type.") - edge_meta = json.dumps({ - "label": graph_elem_type.value, - "type": "EDGE", - "constraints": edge_direction(graph_elem_type), - "properties": graph_properties, - }) + edge_meta = json.dumps( + { + "label": graph_elem_type.value, + "type": "EDGE", + "constraints": edge_direction(graph_elem_type), + "properties": graph_properties, + } + ) gql = f"""CALL db.createEdgeLabelByJson('{edge_meta}')""" self.graph_store.conn.run(gql) @@ -597,7 +593,8 @@ def explore_trigraph( """Explore the graph from given subjects up to a depth. Args: - subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors). + subs (Union[List[str], List[List[float]]): The list of the subjects + (keywords or embedding vectors). 
             topk (Optional[int]): The number of the top similar entities.
             score_threshold (Optional[float]): The threshold of the similarity score.
             direct (Direction): The direction of the graph that will be explored.
@@ -651,7 +648,7 @@ def explore_trigraph(
             conditional_statement = f"WHERE n.id IN {ids} "
         else:
             conditional_statement = (
-                f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
+                f"WHERE n.id IN {[self._escape_quotes(str(sub)) for sub in subs]} "
             )
 
         # Multi-hop search
@@ -686,9 +683,9 @@ def explore_docgraph_with_entities(
             limit (Optional[int]): The limit number of the queried chunks.
 
         Returns:
-            MemoryGraph: The document graph that includes the leaf chunks that connect to the
-                entities, the chains from documents to the leaf chunks, and the chain
-                from documents to chunks.
+            MemoryGraph: The document graph that includes the leaf chunks that
+                connect to the entities, the chains from documents to the leaf chunks,
+                and the chain from documents to chunks.
         """
         if len(subs) == 0:
             return MemoryGraph()
@@ -771,7 +768,8 @@ def explore_docgraph_without_entities(
         """Explore the graph from given subjects up to a depth.
 
         Args:
-            subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+            subs (Union[List[str], List[List[float]]]): The list of the subjects
+                (keywords or embedding vectors).
             topk (Optional[int]): The number of the top similar chunks.
             score_threshold (Optional[float]): The threshold of the similarity score.
             direct (Direction): The direction of the graph that will be explored.
@@ -780,8 +778,9 @@
             limit (Optional[int]): The limit number of the queried chunks.
 
         Returns:
-            MemoryGraph: The document graph that includes the chains from documents to chunks
-                that contain the subs (keywords) or similar chunks (embedding vectors).
+            MemoryGraph: The document graph that includes the chains from documents
+                to chunks that contain the subs (keywords) or similar chunks
+                (embedding vectors).
         """
         if len(subs) == 0:
             return MemoryGraph()
@@ -825,13 +824,16 @@
                 self.graph_store.conn.run(query=similarity_retrieval_query)
             )
             names = [(record["name"]) for record in similar_chunks]
-            _subs_condition = " OR ".join([
-                f"m.content CONTAINS '{name}'" for name in names
-            ])
+            _subs_condition = " OR ".join(
+                [f"m.content CONTAINS '{name}'" for name in names]
+            )
         else:
-            _subs_condition = " OR ".join([
-                f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs
-            ])
+            _subs_condition = " OR ".join(
+                [
+                    f"m.content CONTAINS '{self._escape_quotes(str(sub))}'"
+                    for sub in subs
+                ]
+            )
 
         # Query the chain from documents to chunks,
         # document -> chunk -> chunk -> chunk -> ... -> chunk
@@ -921,15 +923,19 @@ async def stream_query(  # type: ignore[override]
                 rels = list(record["p"].relationships)
                 formatted_path = []
                 for i in range(len(nodes)):
-                    formatted_path.append({
-                        "id": nodes[i]._properties["id"],
-                        "description": nodes[i]._properties["description"],
-                    })
+                    formatted_path.append(
+                        {
+                            "id": nodes[i]._properties["id"],
+                            "description": nodes[i]._properties["description"],
+                        }
+                    )
                     if i < len(rels):
-                        formatted_path.append({
-                            "id": rels[i]._properties["id"],
-                            "description": rels[i]._properties["description"],
-                        })
+                        formatted_path.append(
+                            {
+                                "id": rels[i]._properties["id"],
+                                "description": rels[i]._properties["description"],
+                            }
+                        )
                 for i in range(0, len(formatted_path), 2):
                     mg.upsert_vertex(
                         Vertex(
@@ -1100,9 +1106,9 @@ def upsert_doc_include_chunk(
         chunk: ParagraphChunk,
     ) -> None:
         """Convert chunk to document include chunk."""
-        assert chunk.chunk_parent_id and chunk.chunk_parent_name, (
-            "Chunk parent ID and name are required (document_include_chunk)"
-        )
+        assert (
+            chunk.chunk_parent_id and chunk.chunk_parent_name
+        ), "Chunk parent ID and name are required (document_include_chunk)"
 
         edge = Edge(
             sid=chunk.chunk_parent_id,
@@ -1123,9 +1129,9 @@ def upsert_chunk_include_chunk(
         chunk: ParagraphChunk,
     ) -> None:
         """Convert chunk to chunk include chunk."""
-        assert chunk.chunk_parent_id and chunk.chunk_parent_name, (
-            "Chunk parent ID and name are required (chunk_include_chunk)"
-        )
+        assert (
+            chunk.chunk_parent_id and chunk.chunk_parent_name
+        ), "Chunk parent ID and name are required (chunk_include_chunk)"
 
         edge = Edge(
             sid=chunk.chunk_parent_id,
diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index f89d855de..38479d28c 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -402,6 +402,7 @@ async def asimilar_search_with_scores(
         keywords: List[str] = await self._keyword_extractor.extract(text)
 
         # If enable similarity search, using subs to transfer embeddings
+        subs: Union[List[str], List[List[float]]]
         if enable_similarity_search:
             # Embedding the question
             vector = await self._text_embedder.embed(text)
@@ -411,9 +412,9 @@
             )
             # Using the embeddings of keywords and question
             vectors.append(vector)
-            subs: Union[List[str], List[List[float]]] = vectors
+            subs = vectors
         else:
-            subs: Union[List[str], List[List[float]]] = keywords
+            subs = keywords
 
         # If enable triplet graph, using subs to search enetities
         # subs -> enetities
diff --git a/dbgpt/storage/knowledge_graph/knowledge_graph.py b/dbgpt/storage/knowledge_graph/knowledge_graph.py
index ef2d15039..f27d2ff5b 100644
--- a/dbgpt/storage/knowledge_graph/knowledge_graph.py
+++ b/dbgpt/storage/knowledge_graph/knowledge_graph.py
@@ -123,7 +123,9 @@ async def asimilar_search_with_scores(
 
         # extract keywords and explore graph store
         keywords = await self._keyword_extractor.extract(text)
-        subgraph = self._graph_store_apdater.explore(keywords, limit=topk).format()
+        subgraph = self._graph_store_apdater.explore_trigraph(
+            keywords, limit=topk
+        ).format()
 
         logger.info(f"Search subgraph from {len(keywords)} keywords")
@@ -134,7 +136,7 @@
             "The following entities and relationships provided after "
             "[Subgraph] are retrieved from the knowledge graph "
             "based on the keywords:\n"
-            f"\"{','.join(keywords)}\".\n"
+            f'"{",".join(keywords)}".\n'
             "---------------------\n"
             "The following examples after [Entities] and [Relationships] that "
             "can help you understand the data format of the knowledge graph, "
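
Review note: the change in community_summary.py declares the Union type of `subs` once, before the branch, instead of annotating the name in both arms. A minimal, self-contained sketch of that pattern follows; the `pick_subjects` helper and its parameters are hypothetical, for illustration only, and not part of this patch:

    from typing import List, Union

    def pick_subjects(
        keywords: List[str],
        vectors: List[List[float]],
        enable_similarity_search: bool,
    ) -> Union[List[str], List[List[float]]]:
        # Annotate the union type once, up front. Annotating `subs` in each
        # branch (as the old code did) is flagged by mypy as a redefinition;
        # a single prior declaration type-checks cleanly.
        subs: Union[List[str], List[List[float]]]
        if enable_similarity_search:
            subs = vectors  # embedding vectors for similarity search
        else:
            subs = keywords  # plain keyword strings
        return subs

This keeps one variable with a union type rather than two differently typed locals, matching the `Union[List[str], List[List[float]]]` that `explore_trigraph` and `explore_docgraph_without_entities` accept for `subs`.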