Commit 86a6a49

format

Appointat committed Jan 7, 2025
1 parent dd8efc3 commit 86a6a49
Showing 6 changed files with 84 additions and 70 deletions.
17 changes: 10 additions & 7 deletions dbgpt/storage/knowledge_graph/community/base.py
@@ -189,7 +189,8 @@ def explore_trigraph(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar entities.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
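The rewrapped docstring reflects that `subs` accepts either keywords or embedding vectors. A minimal, self-contained sketch of distinguishing the two shapes (the helper and values here are illustrative, not part of the repo):

```python
from typing import List, Union

def describe_subs(subs: Union[List[str], List[List[float]]]) -> str:
    """Illustrative helper: report which shape of `subs` was passed."""
    if subs and isinstance(subs[0], list):
        return f"{len(subs)} embedding vectors (dim={len(subs[0])})"
    return f"{len(subs)} keywords"

print(describe_subs(["knowledge graph", "community"]))  # keywords
print(describe_subs([[0.12, -0.08, 0.33]]))             # embedding vectors
```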
@@ -224,9 +225,9 @@ def explore_docgraph_with_entities(
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the leaf chunks that connect to the
-     entities, the chains from documents to the leaf chunks, and the chain
-     from documents to chunks.
+ MemoryGraph: The document graph that includes the leaf chunks that connect
+     to the entities, the chains from documents to the leaf chunks, and the
+     chain from documents to chunks.
"""

@abstractmethod
@@ -243,7 +244,8 @@ def explore_docgraph_without_entities(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar chunks.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
@@ -252,8 +254,9 @@
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the chains from documents to chunks
-     that contain the subs (keywords) or similar chunks (embedding vectors).
+ MemoryGraph: The document graph that includes the chains from documents
+     to chunks that contain the subs (keywords) or similar chunks
+     (embedding vectors).
"""

@abstractmethod
4 changes: 3 additions & 1 deletion dbgpt/storage/knowledge_graph/community/community_store.py
@@ -56,7 +56,9 @@ async def _summary_community(self, community_id: str) -> Optional[Community]:
return None

graph = community.data.format()
- community.summary = await self._community_summarizer.summarize(graph=graph)
+ community.summary = (
+     await self._community_summarizer.summarize(graph=graph) or ""
+ )
logger.info(f"Summarize community {community_id}: {community.summary[:50]}...")
return community
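The `or ""` guard keeps `community.summary` a string even when the summarizer returns `None`, so the `community.summary[:50]` slice in the log line cannot raise. A minimal sketch of the pattern, with a hypothetical stand-in for the summarizer:

```python
import asyncio
from typing import Optional

class StubSummarizer:
    """Hypothetical stand-in for the community summarizer."""
    async def summarize(self, graph: str) -> Optional[str]:
        return None  # simulate an empty/failed summarization

async def main() -> None:
    summarizer = StubSummarizer()
    # Coerce None to "" so the slice below is always safe.
    summary = await summarizer.summarize(graph="(vertices)-[edges]->") or ""
    print(f"Summarize community c1: {summary[:50]}...")

asyncio.run(main())
```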

@@ -29,7 +29,7 @@ class MemGraphStoreAdapter(GraphStoreAdapter):

def __init__(self, enable_summary: bool = False):
"""Initialize MemGraph Community Store Adapter."""
- self._graph_store = MemoryGraphStore(MemoryGraphStoreConfig())
+ self._graph_store: MemoryGraphStore = MemoryGraphStore(MemoryGraphStoreConfig())

super().__init__(self._graph_store)

@@ -38,7 +38,7 @@ def __init__(self, enable_summary: bool = False):

async def discover_communities(self, **kwargs) -> List[str]:
"""Run community discovery with leiden."""
- []
+ return []

async def get_community(self, community_id: str) -> Community:
"""Get community."""
@@ -196,7 +196,7 @@ def check_label(self, graph_elem_type: GraphElemType) -> bool:
True if the label exists in the specified graph element type, otherwise
False.
"""
- pass
+ raise NotImplementedError("Memory graph store does not have label")

def explore(
self,
@@ -214,8 +214,8 @@ def explore(

def query(self, query: str, **kwargs) -> MemoryGraph:
"""Execute a query on graph."""
- pass
+ raise NotImplementedError("Memory graph store does not support query")

async def stream_query(self, query: str, **kwargs) -> AsyncGenerator[Graph, None]:
"""Execute a stream query."""
- pass
+ raise NotImplementedError("Memory graph store does not support stream query")
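Replacing the silent `pass` bodies (which implicitly return `None`) with `NotImplementedError` makes unsupported operations fail loudly at the call site. A sketch of the pattern (the class is illustrative, not the actual adapter):

```python
from typing import Any

class MemoryStoreSketch:
    """Illustrative only; not the actual MemGraphStoreAdapter."""

    def query(self, query: str, **kwargs: Any) -> None:
        # Fail fast instead of silently returning None.
        raise NotImplementedError("Memory graph store does not support query")

store = MemoryStoreSketch()
try:
    store.query("MATCH (n) RETURN n LIMIT 1")
except NotImplementedError as exc:
    print(f"Caught expected error: {exc}")
```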
112 changes: 59 additions & 53 deletions dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
@@ -2,16 +2,7 @@

import json
import logging
- from typing import (
-     Any,
-     AsyncGenerator,
-     Dict,
-     Iterator,
-     List,
-     Optional,
-     Tuple,
-     Union,
- )
+ from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union

from dbgpt.storage.graph_store.graph import (
Direction,
@@ -172,7 +163,7 @@ def upsert_entities(self, entities: Iterator[Vertex]) -> None:
# If not exist, then create vector index
if self.query(check_entity_vector_query).vertex_count == 0:
# Get the dimension
- dimension = len(entity_list[0].get("_embedding"))
+ dimension = len(entity_list[0].get("_embedding", []))
# Then create index
create_vector_index_query = (
"CALL db.addVertexVectorIndex("
@@ -246,12 +237,13 @@ def upsert_chunks(self, chunks: Iterator[Union[Vertex, ParagraphChunk]]) -> None
# If not exist, then create vector index
if self.query(check_chunk_vector_query).vertex_count == 0:
# Get the dimension
- dimension = len(chunk_list[0].get("_embedding"))
+ embedding = chunk_list[0].get("_embedding", [])
+ assert isinstance(embedding, list)
# Then create index
create_vector_index_query = (
"CALL db.addVertexVectorIndex("
f'"{GraphElemType.CHUNK.value}", "_embedding", '
f"{{dimension: {dimension}}})"
f"{{dimension: {len(embedding)}}})"
)
self.graph_store.conn.run(query=create_vector_index_query)

@@ -524,12 +516,14 @@ def create_graph_label(
(vertices) and edges in the graph.
"""
if graph_elem_type.is_vertex(): # vertex
- vertex_meta = json.dumps({
-     "label": graph_elem_type.value,
-     "type": "VERTEX",
-     "primary": "id",
-     "properties": graph_properties,
- })
+ vertex_meta = json.dumps(
+     {
+         "label": graph_elem_type.value,
+         "type": "VERTEX",
+         "primary": "id",
+         "properties": graph_properties,
+     }
+ )
gql = f"""CALL db.createVertexLabelByJson('{vertex_meta}')"""
else: # edge

@@ -555,12 +549,14 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]:
else:
raise ValueError("Invalid graph element type.")

- edge_meta = json.dumps({
-     "label": graph_elem_type.value,
-     "type": "EDGE",
-     "constraints": edge_direction(graph_elem_type),
-     "properties": graph_properties,
- })
+ edge_meta = json.dumps(
+     {
+         "label": graph_elem_type.value,
+         "type": "EDGE",
+         "constraints": edge_direction(graph_elem_type),
+         "properties": graph_properties,
+     }
+ )
gql = f"""CALL db.createEdgeLabelByJson('{edge_meta}')"""

self.graph_store.conn.run(gql)
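The `json.dumps` reflow is Black-style formatting only; the payload sent to `db.createVertexLabelByJson` / `db.createEdgeLabelByJson` is unchanged. A runnable sketch of how the vertex label metadata is assembled (the label name and property list are illustrative):

```python
import json

# Illustrative property schema; the real one comes from graph_properties.
graph_properties = [{"name": "id", "type": "STRING", "optional": False}]

vertex_meta = json.dumps(
    {
        "label": "entity",
        "type": "VERTEX",
        "primary": "id",
        "properties": graph_properties,
    }
)
gql = f"CALL db.createVertexLabelByJson('{vertex_meta}')"
print(gql)
```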
@@ -597,7 +593,8 @@ def explore_trigraph(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar entities.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
@@ -651,7 +648,7 @@ def explore_trigraph(
conditional_statement = f"WHERE n.id IN {ids} "
else:
conditional_statement = (
f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
f"WHERE n.id IN {[self._escape_quotes(str(sub)) for sub in subs]} "
)

# Multi-hop search
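Wrapping each subject in `str(...)` before `_escape_quotes` accounts for `subs` now being typed as keywords or embedding vectors; in the keyword branch it is a no-op, but it keeps the type checker satisfied. A sketch with a hypothetical stand-in for `_escape_quotes`:

```python
from typing import List

def escape_quotes(value: str) -> str:
    """Hypothetical stand-in for self._escape_quotes."""
    return value.replace("'", "").replace('"', "")

subs: List[str] = ["O'Reilly", "graph"]
conditional_statement = f"WHERE n.id IN {[escape_quotes(str(sub)) for sub in subs]} "
print(conditional_statement)
# WHERE n.id IN ['OReilly', 'graph']
```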
@@ -686,9 +683,9 @@ def explore_docgraph_with_entities(
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the leaf chunks that connect to the
-     entities, the chains from documents to the leaf chunks, and the chain
-     from documents to chunks.
+ MemoryGraph: The document graph that includes the leaf chunks that
+     connect to the entities, the chains from documents to the leaf chunks,
+     and the chain from documents to chunks.
"""
if len(subs) == 0:
return MemoryGraph()
@@ -771,7 +768,8 @@ def explore_docgraph_without_entities(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar chunks.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
@@ -780,8 +778,9 @@
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the chains from documents to chunks
-     that contain the subs (keywords) or similar chunks (embedding vectors).
+ MemoryGraph: The document graph that includes the chains from documents
+     to chunks that contain the subs (keywords) or similar chunks
+     (embedding vectors).
"""
if len(subs) == 0:
return MemoryGraph()
@@ -825,13 +824,16 @@ def explore_docgraph_without_entities(
self.graph_store.conn.run(query=similarity_retrieval_query)
)
names = [(record["name"]) for record in similar_chunks]
_subs_condition = " OR ".join([
f"m.content CONTAINS '{name}'" for name in names
])
_subs_condition = " OR ".join(
[f"m.content CONTAINS '{name}'" for name in names]
)
else:
_subs_condition = " OR ".join([
f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs
])
_subs_condition = " OR ".join(
[
f"m.content CONTAINS '{self._escape_quotes(str(sub))}'"
for sub in subs
]
)

# Query the chain from documents to chunks,
# document -> chunk -> chunk -> chunk -> ... -> chunk
@@ -921,15 +923,19 @@ async def stream_query( # type: ignore[override]
rels = list(record["p"].relationships)
formatted_path = []
for i in range(len(nodes)):
- formatted_path.append({
-     "id": nodes[i]._properties["id"],
-     "description": nodes[i]._properties["description"],
- })
+ formatted_path.append(
+     {
+         "id": nodes[i]._properties["id"],
+         "description": nodes[i]._properties["description"],
+     }
+ )
if i < len(rels):
- formatted_path.append({
-     "id": rels[i]._properties["id"],
-     "description": rels[i]._properties["description"],
- })
+ formatted_path.append(
+     {
+         "id": rels[i]._properties["id"],
+         "description": rels[i]._properties["description"],
+     }
+ )
for i in range(0, len(formatted_path), 2):
mg.upsert_vertex(
Vertex(
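The reflowed `formatted_path.append(...)` calls interleave node and relationship records, so even indices hold vertices and odd indices hold the edges between them, which is why the follow-up loop steps by 2. A minimal sketch of that alternating layout with plain dicts:

```python
# Illustrative path A -[knows]-> B flattened to [A, knows, B].
nodes = [
    {"id": "A", "description": "entity A"},
    {"id": "B", "description": "entity B"},
]
rels = [{"id": "knows", "description": "A knows B"}]

formatted_path = []
for i in range(len(nodes)):
    formatted_path.append(nodes[i])      # even index: vertex
    if i < len(rels):
        formatted_path.append(rels[i])   # odd index: edge

for i in range(0, len(formatted_path), 2):
    print("vertex:", formatted_path[i]["id"])
```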
@@ -1100,9 +1106,9 @@ def upsert_doc_include_chunk(
chunk: ParagraphChunk,
) -> None:
"""Convert chunk to document include chunk."""
- assert chunk.chunk_parent_id and chunk.chunk_parent_name, (
-     "Chunk parent ID and name are required (document_include_chunk)"
- )
+ assert (
+     chunk.chunk_parent_id and chunk.chunk_parent_name
+ ), "Chunk parent ID and name are required (document_include_chunk)"

edge = Edge(
sid=chunk.chunk_parent_id,
@@ -1123,9 +1129,9 @@ def upsert_chunk_include_chunk(
chunk: ParagraphChunk,
) -> None:
"""Convert chunk to chunk include chunk."""
- assert chunk.chunk_parent_id and chunk.chunk_parent_name, (
-     "Chunk parent ID and name are required (chunk_include_chunk)"
- )
+ assert (
+     chunk.chunk_parent_id and chunk.chunk_parent_name
+ ), "Chunk parent ID and name are required (chunk_include_chunk)"

edge = Edge(
sid=chunk.chunk_parent_id,
5 changes: 3 additions & 2 deletions dbgpt/storage/knowledge_graph/community_summary.py
@@ -402,6 +402,7 @@ async def asimilar_search_with_scores(
keywords: List[str] = await self._keyword_extractor.extract(text)

# If enable similarity search, using subs to transfer embeddings
+ subs: Union[List[str], List[List[float]]]
if enable_similarity_search:
# Embedding the question
vector = await self._text_embedder.embed(text)
Expand All @@ -411,9 +412,9 @@ async def asimilar_search_with_scores(
)
# Using the embeddings of keywords and question
vectors.append(vector)
- subs: Union[List[str], List[List[float]]] = vectors
+ subs = vectors
else:
- subs: Union[List[str], List[List[float]]] = keywords
+ subs = keywords

# If enable triplet graph, using subs to search enetities
# subs -> enetities
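Annotating `subs` once before the branch (instead of re-annotating it in each arm) is the usual way to give a conditionally-assigned variable a Union type without triggering mypy redefinition errors. A sketch of the pattern with illustrative values:

```python
from typing import List, Union

keywords: List[str] = ["graph", "community"]
vectors: List[List[float]] = [[0.1, 0.2], [0.3, 0.4]]
enable_similarity_search = True

# Declare the annotation once; assign (without annotation) in each branch.
subs: Union[List[str], List[List[float]]]
if enable_similarity_search:
    subs = vectors
else:
    subs = keywords
print(f"{len(subs)} subjects selected")
```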
6 changes: 4 additions & 2 deletions dbgpt/storage/knowledge_graph/knowledge_graph.py
@@ -123,7 +123,9 @@ async def asimilar_search_with_scores(

# extract keywords and explore graph store
keywords = await self._keyword_extractor.extract(text)
- subgraph = self._graph_store_apdater.explore(keywords, limit=topk).format()
+ subgraph = self._graph_store_apdater.explore_trigraph(
+     keywords, limit=topk
+ ).format()

logger.info(f"Search subgraph from {len(keywords)} keywords")

@@ -134,7 +136,7 @@
"The following entities and relationships provided after "
"[Subgraph] are retrieved from the knowledge graph "
"based on the keywords:\n"
f"\"{','.join(keywords)}\".\n"
f'"{",".join(keywords)}".\n'
"---------------------\n"
"The following examples after [Entities] and [Relationships] that "
"can help you understand the data format of the knowledge graph, "
