Commit 86a6a49

format

Appointat committed Jan 7, 2025
1 parent dd8efc3 commit 86a6a49
Showing 6 changed files with 84 additions and 70 deletions.
17 changes: 10 additions & 7 deletions dbgpt/storage/knowledge_graph/community/base.py
@@ -189,7 +189,8 @@ def explore_trigraph(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar entities.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
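The rewrapped docstring reflects that `subs` accepts either keywords or embedding vectors. A minimal, self-contained sketch of distinguishing the two shapes (the helper and values here are illustrative, not part of the repo):

```python
from typing import List, Union

def describe_subs(subs: Union[List[str], List[List[float]]]) -> str:
    """Illustrative helper: report which shape of `subs` was passed."""
    if subs and isinstance(subs[0], list):
        return f"{len(subs)} embedding vectors (dim={len(subs[0])})"
    return f"{len(subs)} keywords"

print(describe_subs(["knowledge graph", "community"]))  # keywords
print(describe_subs([[0.12, -0.08, 0.33]]))             # embedding vectors
```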
@@ -224,9 +225,9 @@ def explore_docgraph_with_entities(
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the leaf chunks that connect to the
-     entities, the chains from documents to the leaf chunks, and the chain
-     from documents to chunks.
+ MemoryGraph: The document graph that includes the leaf chunks that connect
+     to the entities, the chains from documents to the leaf chunks, and the
+     chain from documents to chunks.
"""

@abstractmethod
@@ -243,7 +244,8 @@ def explore_docgraph_without_entities(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar chunks.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
@@ -252,8 +254,9 @@
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the chains from documents to chunks
-     that contain the subs (keywords) or similar chunks (embedding vectors).
+ MemoryGraph: The document graph that includes the chains from documents
+     to chunks that contain the subs (keywords) or similar chunks
+     (embedding vectors).
"""

@abstractmethod
4 changes: 3 additions & 1 deletion dbgpt/storage/knowledge_graph/community/community_store.py
@@ -56,7 +56,9 @@ async def _summary_community(self, community_id: str) -> Optional[Community]:
return None

graph = community.data.format()
- community.summary = await self._community_summarizer.summarize(graph=graph)
+ community.summary = (
+     await self._community_summarizer.summarize(graph=graph) or ""
+ )
logger.info(f"Summarize community {community_id}: {community.summary[:50]}...")
return community
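The `or ""` guard keeps `community.summary` a string even when the summarizer returns `None`, so the `community.summary[:50]` slice in the log line cannot raise. A minimal sketch of the pattern, with a hypothetical stand-in for the summarizer:

```python
import asyncio
from typing import Optional

class StubSummarizer:
    """Hypothetical stand-in for the community summarizer."""
    async def summarize(self, graph: str) -> Optional[str]:
        return None  # simulate an empty/failed summarization

async def main() -> None:
    summarizer = StubSummarizer()
    # Coerce None to "" so the slice below is always safe.
    summary = await summarizer.summarize(graph="(vertices)-[edges]->") or ""
    print(f"Summarize community c1: {summary[:50]}...")

asyncio.run(main())
```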

@@ -29,7 +29,7 @@ class MemGraphStoreAdapter(GraphStoreAdapter):

def __init__(self, enable_summary: bool = False):
"""Initialize MemGraph Community Store Adapter."""
- self._graph_store = MemoryGraphStore(MemoryGraphStoreConfig())
+ self._graph_store: MemoryGraphStore = MemoryGraphStore(MemoryGraphStoreConfig())

super().__init__(self._graph_store)

@@ -38,7 +38,7 @@ def __init__(self, enable_summary: bool = False):

async def discover_communities(self, **kwargs) -> List[str]:
"""Run community discovery with leiden."""
- []
+ return []

async def get_community(self, community_id: str) -> Community:
"""Get community."""
@@ -196,7 +196,7 @@ def check_label(self, graph_elem_type: GraphElemType) -> bool:
True if the label exists in the specified graph element type, otherwise
False.
"""
- pass
+ raise NotImplementedError("Memory graph store does not have label")

def explore(
self,
@@ -214,8 +214,8 @@ def explore(

def query(self, query: str, **kwargs) -> MemoryGraph:
"""Execute a query on graph."""
- pass
+ raise NotImplementedError("Memory graph store does not support query")

async def stream_query(self, query: str, **kwargs) -> AsyncGenerator[Graph, None]:
"""Execute a stream query."""
- pass
+ raise NotImplementedError("Memory graph store does not support stream query")
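Replacing the silent `pass` bodies (which implicitly return `None`) with `NotImplementedError` makes unsupported operations fail loudly at the call site. A sketch of the pattern (the class is illustrative, not the actual adapter):

```python
from typing import Any

class MemoryStoreSketch:
    """Illustrative only; not the actual MemGraphStoreAdapter."""

    def query(self, query: str, **kwargs: Any) -> None:
        # Fail fast instead of silently returning None.
        raise NotImplementedError("Memory graph store does not support query")

store = MemoryStoreSketch()
try:
    store.query("MATCH (n) RETURN n LIMIT 1")
except NotImplementedError as exc:
    print(f"Caught expected error: {exc}")
```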
112 changes: 59 additions & 53 deletions dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
@@ -2,16 +2,7 @@

import json
import logging
- from typing import (
-     Any,
-     AsyncGenerator,
-     Dict,
-     Iterator,
-     List,
-     Optional,
-     Tuple,
-     Union,
- )
+ from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union

from dbgpt.storage.graph_store.graph import (
Direction,
@@ -172,7 +163,7 @@ def upsert_entities(self, entities: Iterator[Vertex]) -> None:
# If not exist, then create vector index
if self.query(check_entity_vector_query).vertex_count == 0:
# Get the dimension
- dimension = len(entity_list[0].get("_embedding"))
+ dimension = len(entity_list[0].get("_embedding", []))
# Then create index
create_vector_index_query = (
"CALL db.addVertexVectorIndex("
@@ -246,12 +237,13 @@ def upsert_chunks(self, chunks: Iterator[Union[Vertex, ParagraphChunk]]) -> None
# If not exist, then create vector index
if self.query(check_chunk_vector_query).vertex_count == 0:
# Get the dimension
- dimension = len(chunk_list[0].get("_embedding"))
+ embedding = chunk_list[0].get("_embedding", [])
+ assert isinstance(embedding, list)
# Then create index
create_vector_index_query = (
"CALL db.addVertexVectorIndex("
f'"{GraphElemType.CHUNK.value}", "_embedding", '
f"{{dimension: {dimension}}})"
f"{{dimension: {len(embedding)}}})"
)
self.graph_store.conn.run(query=create_vector_index_query)

@@ -524,12 +516,14 @@ def create_graph_label(
(vertices) and edges in the graph.
"""
if graph_elem_type.is_vertex(): # vertex
- vertex_meta = json.dumps({
-     "label": graph_elem_type.value,
-     "type": "VERTEX",
-     "primary": "id",
-     "properties": graph_properties,
- })
+ vertex_meta = json.dumps(
+     {
+         "label": graph_elem_type.value,
+         "type": "VERTEX",
+         "primary": "id",
+         "properties": graph_properties,
+     }
+ )
gql = f"""CALL db.createVertexLabelByJson('{vertex_meta}')"""
else: # edge

@@ -555,12 +549,14 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]:
else:
raise ValueError("Invalid graph element type.")

- edge_meta = json.dumps({
-     "label": graph_elem_type.value,
-     "type": "EDGE",
-     "constraints": edge_direction(graph_elem_type),
-     "properties": graph_properties,
- })
+ edge_meta = json.dumps(
+     {
+         "label": graph_elem_type.value,
+         "type": "EDGE",
+         "constraints": edge_direction(graph_elem_type),
+         "properties": graph_properties,
+     }
+ )
gql = f"""CALL db.createEdgeLabelByJson('{edge_meta}')"""

self.graph_store.conn.run(gql)
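The `json.dumps` reflow is Black-style formatting only; the payload sent to `db.createVertexLabelByJson` / `db.createEdgeLabelByJson` is unchanged. A runnable sketch of how the vertex label metadata is assembled (the label name and property list are illustrative):

```python
import json

# Illustrative property schema; the real one comes from graph_properties.
graph_properties = [{"name": "id", "type": "STRING", "optional": False}]

vertex_meta = json.dumps(
    {
        "label": "entity",
        "type": "VERTEX",
        "primary": "id",
        "properties": graph_properties,
    }
)
gql = f"CALL db.createVertexLabelByJson('{vertex_meta}')"
print(gql)
```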
@@ -597,7 +593,8 @@ def explore_trigraph(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar entities.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
@@ -651,7 +648,7 @@ def explore_trigraph(
conditional_statement = f"WHERE n.id IN {ids} "
else:
conditional_statement = (
f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
f"WHERE n.id IN {[self._escape_quotes(str(sub)) for sub in subs]} "
)

# Multi-hop search
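Wrapping each subject in `str(...)` before `_escape_quotes` accounts for `subs` now being typed as keywords or embedding vectors; in the keyword branch it is a no-op, but it keeps the type checker satisfied. A sketch with a hypothetical stand-in for `_escape_quotes`:

```python
from typing import List

def escape_quotes(value: str) -> str:
    """Hypothetical stand-in for self._escape_quotes."""
    return value.replace("'", "").replace('"', "")

subs: List[str] = ["O'Reilly", "graph"]
conditional_statement = f"WHERE n.id IN {[escape_quotes(str(sub)) for sub in subs]} "
print(conditional_statement)
# WHERE n.id IN ['OReilly', 'graph']
```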
@@ -686,9 +683,9 @@ def explore_docgraph_with_entities(
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the leaf chunks that connect to the
-     entities, the chains from documents to the leaf chunks, and the chain
-     from documents to chunks.
+ MemoryGraph: The document graph that includes the leaf chunks that
+     connect to the entities, the chains from documents to the leaf chunks,
+     and the chain from documents to chunks.
"""
if len(subs) == 0:
return MemoryGraph()
@@ -771,7 +768,8 @@ def explore_docgraph_without_entities(
"""Explore the graph from given subjects up to a depth.
Args:
- subs (Union[List[str], List[List[float]]): The list of the subjects (keywords or embedding vectors).
+ subs (Union[List[str], List[List[float]]): The list of the subjects
+     (keywords or embedding vectors).
topk (Optional[int]): The number of the top similar chunks.
score_threshold (Optional[float]): The threshold of the similarity score.
direct (Direction): The direction of the graph that will be explored.
@@ -780,8 +778,9 @@
limit (Optional[int]): The limit number of the queried chunks.
Returns:
- MemoryGraph: The document graph that includes the chains from documents to chunks
-     that contain the subs (keywords) or similar chunks (embedding vectors).
+ MemoryGraph: The document graph that includes the chains from documents
+     to chunks that contain the subs (keywords) or similar chunks
+     (embedding vectors).
"""
if len(subs) == 0:
return MemoryGraph()
@@ -825,13 +824,16 @@ def explore_docgraph_without_entities(
self.graph_store.conn.run(query=similarity_retrieval_query)
)
names = [(record["name"]) for record in similar_chunks]
_subs_condition = " OR ".join([
f"m.content CONTAINS '{name}'" for name in names
])
_subs_condition = " OR ".join(
[f"m.content CONTAINS '{name}'" for name in names]
)
else:
_subs_condition = " OR ".join([
f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs
])
_subs_condition = " OR ".join(
[
f"m.content CONTAINS '{self._escape_quotes(str(sub))}'"
for sub in subs
]
)

# Query the chain from documents to chunks,
# document -> chunk -> chunk -> chunk -> ... -> chunk
@@ -921,15 +923,19 @@ async def stream_query( # type: ignore[override]
rels = list(record["p"].relationships)
formatted_path = []
for i in range(len(nodes)):
- formatted_path.append({
-     "id": nodes[i]._properties["id"],
-     "description": nodes[i]._properties["description"],
- })
+ formatted_path.append(
+     {
+         "id": nodes[i]._properties["id"],
+         "description": nodes[i]._properties["description"],
+     }
+ )
if i < len(rels):
- formatted_path.append({
-     "id": rels[i]._properties["id"],
-     "description": rels[i]._properties["description"],
- })
+ formatted_path.append(
+     {
+         "id": rels[i]._properties["id"],
+         "description": rels[i]._properties["description"],
+     }
+ )
for i in range(0, len(formatted_path), 2):
mg.upsert_vertex(
Vertex(
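The reflowed `formatted_path.append(...)` calls interleave node and relationship records, so even indices hold vertices and odd indices hold the edges between them, which is why the follow-up loop steps by 2. A minimal sketch of that alternating layout with plain dicts:

```python
# Illustrative path A -[knows]-> B flattened to [A, knows, B].
nodes = [
    {"id": "A", "description": "entity A"},
    {"id": "B", "description": "entity B"},
]
rels = [{"id": "knows", "description": "A knows B"}]

formatted_path = []
for i in range(len(nodes)):
    formatted_path.append(nodes[i])      # even index: vertex
    if i < len(rels):
        formatted_path.append(rels[i])   # odd index: edge

for i in range(0, len(formatted_path), 2):
    print("vertex:", formatted_path[i]["id"])
```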
@@ -1100,9 +1106,9 @@ def upsert_doc_include_chunk(
chunk: ParagraphChunk,
) -> None:
"""Convert chunk to document include chunk."""
- assert chunk.chunk_parent_id and chunk.chunk_parent_name, (
-     "Chunk parent ID and name are required (document_include_chunk)"
- )
+ assert (
+     chunk.chunk_parent_id and chunk.chunk_parent_name
+ ), "Chunk parent ID and name are required (document_include_chunk)"

edge = Edge(
sid=chunk.chunk_parent_id,
@@ -1123,9 +1129,9 @@ def upsert_chunk_include_chunk(
chunk: ParagraphChunk,
) -> None:
"""Convert chunk to chunk include chunk."""
- assert chunk.chunk_parent_id and chunk.chunk_parent_name, (
-     "Chunk parent ID and name are required (chunk_include_chunk)"
- )
+ assert (
+     chunk.chunk_parent_id and chunk.chunk_parent_name
+ ), "Chunk parent ID and name are required (chunk_include_chunk)"

edge = Edge(
sid=chunk.chunk_parent_id,
5 changes: 3 additions & 2 deletions dbgpt/storage/knowledge_graph/community_summary.py
@@ -402,6 +402,7 @@ async def asimilar_search_with_scores(
keywords: List[str] = await self._keyword_extractor.extract(text)

# If enable similarity search, using subs to transfer embeddings
+ subs: Union[List[str], List[List[float]]]
if enable_similarity_search:
# Embedding the question
vector = await self._text_embedder.embed(text)
Expand All @@ -411,9 +412,9 @@ async def asimilar_search_with_scores(
)
# Using the embeddings of keywords and question
vectors.append(vector)
- subs: Union[List[str], List[List[float]]] = vectors
+ subs = vectors
else:
- subs: Union[List[str], List[List[float]]] = keywords
+ subs = keywords

# If enable triplet graph, using subs to search enetities
# subs -> enetities
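Annotating `subs` once before the branch (instead of re-annotating it in each arm) is the usual way to give a conditionally-assigned variable a Union type without triggering mypy redefinition errors. A sketch of the pattern with illustrative values:

```python
from typing import List, Union

keywords: List[str] = ["graph", "community"]
vectors: List[List[float]] = [[0.1, 0.2], [0.3, 0.4]]
enable_similarity_search = True

# Declare the annotation once; assign (without annotation) in each branch.
subs: Union[List[str], List[List[float]]]
if enable_similarity_search:
    subs = vectors
else:
    subs = keywords
print(f"{len(subs)} subjects selected")
```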
6 changes: 4 additions & 2 deletions dbgpt/storage/knowledge_graph/knowledge_graph.py
@@ -123,7 +123,9 @@ async def asimilar_search_with_scores(

# extract keywords and explore graph store
keywords = await self._keyword_extractor.extract(text)
- subgraph = self._graph_store_apdater.explore(keywords, limit=topk).format()
+ subgraph = self._graph_store_apdater.explore_trigraph(
+     keywords, limit=topk
+ ).format()

logger.info(f"Search subgraph from {len(keywords)} keywords")

@@ -134,7 +136,7 @@
"The following entities and relationships provided after "
"[Subgraph] are retrieved from the knowledge graph "
"based on the keywords:\n"
f"\"{','.join(keywords)}\".\n"
f'"{",".join(keywords)}".\n'
"---------------------\n"
"The following examples after [Entities] and [Relationships] that "
"can help you understand the data format of the knowledge graph, "
