Skip to content
This repository has been archived by the owner on Oct 21, 2024. It is now read-only.

Commit

Permalink
PDF AI
Browse files Browse the repository at this point in the history
  • Loading branch information
ashpreetbedi committed Jan 31, 2024
1 parent 52cd299 commit 0062b2e
Show file tree
Hide file tree
Showing 12 changed files with 245 additions and 49 deletions.
4 changes: 2 additions & 2 deletions app/pages/1_PDF_Assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ def main() -> None:
st.session_state["pdf_knowledge_base_loaded"] = True
st.sidebar.success("Knowledge base updated")

if st.sidebar.button("Recreate Knowledge Base"):
pdf_assistant.knowledge_base.load(recreate=True, disabled=True)
if st.sidebar.button("Recreate Knowledge Base", disabled=True):
pdf_assistant.knowledge_base.load(recreate=True)
st.session_state["pdf_knowledge_base_loaded"] = True
st.sidebar.success("Knowledge base recreated")

Expand Down
3 changes: 2 additions & 1 deletion example.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
HACKERNEWS_AI=True
PDF_AI=True
# HACKERNEWS_AI=True
# IMAGE_REPO=repo
# BUILD_IMAGES=True
# PUSH_IMAGES=True
Expand Down
1 change: 0 additions & 1 deletion hn_ai/assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def get_hn_assistant(
"If the user asks what's trending, use the `get_top_stories` tool to get the top 5 stories.",
f"If the user asks about their posts, use the `get_user_details` tool with the username {user_id}.",
"If you need to search the web, use the `search_web` tool to search the web for the answer.",
"If you need to search the web, use the `search_web` tool to search the web for any query. ",
"Remember, you can first user the `search_web` tool to get context on the question and then use `search_hackernews_stories` to get information from HackerNews.",
"Using this information, provide a reasoned summary for the user. Talk about the general sentiment in the comments and the popularity of the story.",
"Always share the story score, number of comments and a link to the story if available.",
Expand Down
11 changes: 11 additions & 0 deletions pdf_ai/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from phi.tools.streamlit.components import (
get_openai_key_sidebar,
get_username_sidebar,
reload_button_sidebar,
)

from pdf_ai.assistant import get_pdf_assistant
Expand Down Expand Up @@ -108,6 +109,13 @@ def main() -> None:
pdf_documents: List[Document] = reader.read(uploaded_file)
if pdf_documents:
pdf_assistant.knowledge_base.load_documents(documents=pdf_documents, upsert=True)
# Refresh the assistant to update the instructions and document names
pdf_assistant = get_pdf_assistant(
user_id=username,
run_id=st.session_state["pdf_assistant_run_id"],
debug_mode=True,
)
st.session_state["pdf_assistant"] = pdf_assistant
else:
st.sidebar.error("Could not read PDF")
st.session_state[f"{pdf_name}_uploaded"] = True
Expand Down Expand Up @@ -138,5 +146,8 @@ def main() -> None:
if pdf_assistant_run_name:
st.sidebar.write(f":thread: {pdf_assistant_run_name}")

# Show reload button
reload_button_sidebar()


main()
74 changes: 50 additions & 24 deletions pdf_ai/assistant.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, List

from phi.assistant import Assistant
from phi.llm.openai import OpenAIChat
Expand All @@ -8,6 +8,7 @@
from pdf_ai.storage import pdf_assistant_storage
from pdf_ai.tools import PDFTools
from pdf_ai.knowledge import get_pdf_knowledge_base_for_user
from utils.log import logger


def get_pdf_assistant(
Expand All @@ -16,6 +17,50 @@ def get_pdf_assistant(
debug_mode: bool = False,
) -> Assistant:
pdf_tools = PDFTools(user_id=user_id)
document_names: Optional[List[str]] = pdf_tools.get_document_names()
logger.info(f"Documents available: {document_names}")

introduction = "Hi, I am PDF AI, built by [phidata](https://github.com/phidatahq/phidata)."

instructions = [
"You are made by phidata: https://github.com/phidatahq/phidata",
f"You are interacting with the user: {user_id}",
"You have a knowledge base of PDFs that you can use to answer questions.",
"When the user asks a question, first determine if you should search the web or your knowledge base for the answer.",
"If you need to search the web, use the `search_web` tool to search the web for the answer.",
]
# Tailor the introduction and the tool instructions to the number of
# documents currently available in the user's knowledge base.
if not document_names:
    # No documents (or the lookup returned None): ask the user to upload one.
    introduction += " Please upload a document to get started."
    instructions.append(
        "You do not have any documents in your knowledge base. Ask the user politely to upload a document and share a nice joke with them."
    )
elif len(document_names) == 1:
    # Exactly one document: only the "latest document" tools are needed.
    introduction += "\n\nAsk me about: {}".format(", ".join(document_names))
    instructions.extend(
        [
            # Previously this string sat in a bare tuple expression and was
            # never added to the instructions; include it like the
            # multi-document branch does.
            f"You have the following documents in your knowledge base: {document_names}",
            "If the user asks a specific question, use the `search_latest_document` tool to search the latest document for context.",
            "If the user asks a summary, use the `get_latest_document_contents` tool to get the contents of the latest document.",
        ]
    )
else:
    # Several documents: let the model choose between a named document and
    # the most recently uploaded one.
    introduction += "\n\nAsk me about: {}".format(", ".join(document_names))
    instructions.extend(
        [
            f"You have the following documents in your knowledge base: {document_names}",
            "When the user asks a question, first determine if you should search a specific document or the latest document uploaded by the user.",
            "If the user asks a specific question, use the `search_document` tool if you know the document to search OR `search_latest_document` tool to search the latest document for context.",
            "If the user asks to summarize a document, use the `get_document_contents` if you know the document to search OR `get_latest_document_contents` tool to get the contents of the latest document.",
        ]
    )
instructions.extend(
[
"Keep your conversation light hearted and fun.",
"Using information from the document, provide the user with a concise and relevant answer.",
"If the user asks what is this? they are asking about the latest document",
"If the user compliments you, ask them to star phidata on GitHub: https://github.com/phidatahq/phidata",
]
)

return Assistant(
name=f"pdf_assistant_{user_id}" if user_id else "hn_assistant",
Expand All @@ -29,32 +74,13 @@ def get_pdf_assistant(
storage=pdf_assistant_storage,
monitoring=True,
use_tools=True,
# tools=[search_web, pdf_tools],
introduction=introduction,
tools=[search_web, pdf_tools],
knowledge_base=get_pdf_knowledge_base_for_user(user_id),
show_tool_calls=True,
debug_mode=debug_mode,
description="Your name is PDF AI and you are a chatbot that answers questions from a knowledge base of PDFs.",
add_datetime_to_instructions=True,
instructions=[
"You are made by phidata: https://github.com/phidatahq/phidata",
f"You are interacting with the user: {user_id}",
# "If the user asks a question, first determine if you should search the web or your knowledge base for the answer.",
# "If you need to search the web, use the `search_web` tool to search the web for the answer.",
# "If the user asks a question but the document is not clear, use the `search_latest_document` tool to search the latest document for the answer.",
# "If the user asks to summarize a document, use the `get_latest_document_contents` tool to get the contents of the latest document.",
# "When the user asks a question, first determine if you should search the web or HackerNews for the answer.",
# "If you need to search HackerNews, use the `search_hackernews_stories` tool. Search for atleast 10 stories."
# + " Then use the `get_story_details` tool to get the details of the most popular 3 stories.",
# "If the user asks what's trending, use the `get_top_stories` tool to get the top 5 stories.",
# f"If the user asks about their posts, use the `get_user_details` tool with the username {user_id}.",
# "If you need to search the web, use the `search_web` tool to search the web for the answer.",
# "If you need to search the web, use the `search_web` tool to search the web for any query. ",
# "Remember, you can first user the `search_web` tool to get context on the question and then use `search_hackernews_stories` to get information from HackerNews.",
# "Using this information, provide a reasoned summary for the user. Talk about the general sentiment in the comments and the popularity of the story.",
# "Always share the story score, number of comments and a link to the story if available.",
# "If the user provides a URL, use the `get_item_details_by_url` tool to get the details of the item.",
# "Prefer stories with high scores and comments",
# "Always try to delight the user with an interesting fact about the story.",
"If the user compliments you, ask them to star phidata on GitHub: https://github.com/phidatahq/phidata",
],
instructions=instructions,
user_data={"documents": document_names},
)
1 change: 0 additions & 1 deletion pdf_ai/knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from phi.vectordb.pgvector import PgVector2

from db.session import db_url
from utils.log import logger


def get_pdf_knowledge_base_for_user(user_id: Optional[str] = None) -> AssistantKnowledge:
Expand Down
11 changes: 9 additions & 2 deletions pdf_ai/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,12 @@

pdf_assistant = get_pdf_assistant(user_id="ab", debug_mode=True)

# pdf_assistant.print_response("Who is the agreement between?")
pdf_assistant.print_response("hello?")
pdf_assistant.print_response("Who is the agreement between?")
pdf_assistant.print_response("Who is the heyday agreement with?")
# pdf_assistant.print_response("summarize this")
# pdf_assistant.print_response("What the capital of India?")

# "If you are able to identify the document and the user asks a specific question, use the `search_document` tool to search the contents of the document for context.",
# "If you are able to identify the document and the user needs you to summarize it, use the `get_document_contents` tool to get the contents of the document.",
# "If the user asks a specific question but the document name is not clear, use the `search_latest_document` tool to search the latest document for context.",
# "If the user asks to summarize a document but the document name is not clear, use the `get_latest_document_contents` tool to get the contents of the latest document.",
19 changes: 17 additions & 2 deletions pdf_ai/test_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
from phi.utils.log import set_log_level_to_debug

from pdf_ai.tools import PDFTools

set_log_level_to_debug()
pdf_tools = PDFTools(user_id="ab")

latest_document = pdf_tools.get_latest_document()
print(latest_document)
# latest_document = pdf_tools.get_latest_document()
# print(latest_document)

# result = pdf_tools.search_latest_document("agreement between")
# print(result)

# document_names = pdf_tools.get_document_names()
# print(document_names)

# search_document = pdf_tools.search_document("agreement", "Hydy Services Agreement")
# print(search_document)

# document_content = pdf_tools.get_document_contents("Hydy Services Agreement")
# print(document_content)
138 changes: 129 additions & 9 deletions pdf_ai/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from phi.document import Document
from phi.tools import ToolRegistry
from phi.knowledge import AssistantKnowledge
from phi.vectordb import VectorDb
from phi.vectordb.pgvector import PgVector2

from pdf_ai.knowledge import get_pdf_knowledge_base_for_user
Expand All @@ -19,12 +18,14 @@ def __init__(self, user_id: str):
self.knowledge_base: AssistantKnowledge = get_pdf_knowledge_base_for_user(user_id=user_id)
self.register(self.get_latest_document_contents)
self.register(self.search_latest_document)
self.register(self.search_document)
self.register(self.get_document_contents)

def get_latest_document_contents(self, limit: int = 5000) -> Optional[str]:
"""Use this function to get the content of the latest document uploaded by the user.
Args:
limit (int, optional): Maximum number of characters to return. Defaults to 5000.
limit (int): Maximum number of characters to return. Defaults to 5000.
Returns:
str: JSON string of the latest document
Expand Down Expand Up @@ -57,14 +58,12 @@ def get_latest_document_contents(self, limit: int = 5000) -> Optional[str]:

return latest_document_content[:limit]

return "Sorry could not find latest document"

def search_latest_document(self, query: str, num_documents: Optional[int] = None) -> Optional[str]:
def search_latest_document(self, query: str, num_chunks: int = 5) -> Optional[str]:
"""Use this function to search the latest document uploaded by the user for a query.
Args:
query (str): Query to search for
num_documents (Optional[int], optional): Number of documents to return. Defaults to None.
num_chunks (int): Number of chunks to return. Defaults to 5.
Returns:
str: JSON string of the search results
Expand All @@ -78,8 +77,8 @@ def search_latest_document(self, query: str, num_documents: Optional[int] = None
table = vector_db.table
latest_document_name = None
with vector_db.Session() as session, session.begin():
query = session.query(table).order_by(table.c.created_at.desc()).limit(1)
result = session.execute(query)
latest_document_query = session.query(table).order_by(table.c.created_at.desc()).limit(1)
result = session.execute(latest_document_query)
row = result.fetchone()

if row is None:
Expand All @@ -92,11 +91,132 @@ def search_latest_document(self, query: str, num_documents: Optional[int] = None
return "Sorry could not find latest document"

search_results: List[Document] = vector_db.search(
query=query, limit=num_documents, filters={"name": latest_document_name}
query=query, limit=num_chunks, filters={"name": latest_document_name}
)
logger.debug(f"Search result: {search_results}")

if len(search_results) == 0:
return "Sorry could not find any results from latest document"

return json.dumps([doc.to_dict() for doc in search_results])

def get_document_names(self, limit: int = 20) -> Optional[List[str]]:
    """Use this function to get the names of the documents uploaded by the user.

    Args:
        limit (int): Maximum number of documents to return. Defaults to 20.

    Returns:
        Optional[List[str]]: The document names (an empty list when the table
            has no rows), or None when the vector db is not a PgVector2
            instance or the query fails.
    """

    logger.debug("Getting all document names")
    if self.knowledge_base.vector_db is None or not isinstance(self.knowledge_base.vector_db, PgVector2):
        return None

    vector_db: PgVector2 = self.knowledge_base.vector_db
    table = vector_db.table
    with vector_db.Session() as session, session.begin():
        try:
            query = session.query(table).distinct(table.c.name).limit(limit)
            rows = session.execute(query).fetchall()
            # fetchall() yields an empty sequence (never None) when nothing
            # matches, so the result is always a list, as the annotation
            # promises; callers already handle the empty case.
            return [row.name for row in rows]
        except Exception as e:
            # Best-effort lookup: log the failure and report "unknown"
            # instead of propagating the error to the caller.
            logger.error(f"Error getting document names: {e}")
            return None

def search_document(self, query: str, document_name: str, num_chunks: int = 5) -> Optional[str]:
    """Use this function to search a specific document uploaded by the user for a query.

    Args:
        query (str): Query to search for
        document_name (str): Name of the document to search
        num_chunks (int): Number of chunks to return. Defaults to 5.

    Returns:
        str: JSON string of the search results, or an apology message when
            the document cannot be searched or nothing matches
    """

    logger.debug(f"Searching document {document_name} for query: {query}")
    if self.knowledge_base.vector_db is None or not isinstance(self.knowledge_base.vector_db, PgVector2):
        return f"Sorry could not search document: {document_name}"

    vector_db: PgVector2 = self.knowledge_base.vector_db
    # Restrict the search to chunks whose name matches the requested document.
    search_results: List[Document] = vector_db.search(
        query=query, limit=num_chunks, filters={"name": document_name}
    )
    logger.debug(f"Search result: {search_results}")

    if not search_results:
        return f"Sorry could not find any results from document: {document_name}"

    return json.dumps([doc.to_dict() for doc in search_results])

def get_document_contents(self, document_name: str, limit: int = 5000) -> Optional[str]:
    """Use this function to get the content of the document with name=document_name.

    Args:
        document_name (str): Name of the document to read
        limit (int): Maximum number of characters to return. Defaults to 5000.

    Returns:
        str: The concatenated content of the matching rows, truncated to
            `limit` characters (an empty string when no row matches), or an
            apology message when the vector db is unavailable
    """

    logger.debug(f"Getting document contents for document {document_name}")
    if self.knowledge_base.vector_db is None or not isinstance(self.knowledge_base.vector_db, PgVector2):
        return f"Sorry could not find document: {document_name}"

    vector_db: PgVector2 = self.knowledge_base.vector_db
    table = vector_db.table
    with vector_db.Session() as session, session.begin():
        document_query = (
            session.query(table).filter(table.c.name == document_name).order_by(table.c.created_at.desc())
        )
        document_rows = session.execute(document_query).fetchall()
        # Join the rows in query order (newest created_at first).
        document_content = "".join(document_row.content for document_row in document_rows)

    return document_content[:limit]

# def get_document_introduction(self) -> Optional[str]:
# """Use this function to get a quick introduction to the documents uploaded by the user.
# This function will return a dictionary of document names and their first 200 characters.

# Returns:
# str: JSON string of the document names and their first 200 characters
# """

# logger.debug("Getting document introduction")
# if self.knowledge_base.vector_db is None or not isinstance(self.knowledge_base.vector_db, PgVector2):
# return None

# vector_db: PgVector2 = self.knowledge_base.vector_db
# table = vector_db.table
# with vector_db.Session() as session, session.begin():
# try:
# query = select(table.c.name, table.c.meta_data, table.c.content).order_by(table.c.created_at)
# result = session.execute(query)
# rows = result.fetchall()

# if rows is None:
# return "Sorry could not find any documents"

# document_names = []
# for row in rows:
# document_name = row.name
# document_names.append(document_name)

# return document_names
# except Exception as e:
# logger.error(f"Error getting document names: {e}")
# return None
Loading

0 comments on commit 0062b2e

Please sign in to comment.