diff --git a/.gitignore b/.gitignore
index 93ecc0ff..3ee85d33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ data/
 embedding_model/*
 !embedding_model/.ignore
 .DS_Store
+.conda
+__pycache__/
diff --git a/FREAloadcontent.py b/FREAloadcontent.py
new file mode 100644
index 00000000..aefd3d15
--- /dev/null
+++ b/FREAloadcontent.py
@@ -0,0 +1,90 @@
+"""Content readers and per-MIME-type handlers for the FREA loader."""
+
+# Root of the synced share; the folders below it become a file's tags.
+FREA_ROOT = 'C:/SyncedFolder/Team Shares/FREA/'
+
+
+def readfile(file_info):
+    """Return the text of the file described by *file_info*.
+
+    Plain-text files are returned whole. For a PDF the strings of the page
+    rendered by pdfreader are joined; an unreadable PDF gives an empty
+    string. Any other MIME type gives None.
+    """
+    pathname = file_info['path']
+    if file_info['type'] == 'text/plain':
+        with open(pathname, 'r') as file:
+            return file.read()
+    if file_info['type'] == 'application/pdf':
+        try:
+            # Imported here: the module never imported pdfreader, so the
+            # original lookup of SimplePDFViewer always failed.
+            from pdfreader import SimplePDFViewer
+            with open(pathname, 'rb') as fd:
+                viewer = SimplePDFViewer(fd)
+                # render() fills viewer.canvas; it does not return the text.
+                viewer.render()
+                return ''.join(viewer.canvas.strings)
+        except Exception as e:
+            print(f"Error reading PDF file: {e}")
+            return ""
+    return None
+
+
+def maketags(file_info, root=FREA_ROOT):
+    """Return the folder names between *root* and the file as a list of tags."""
+    # os.walk joins with backslashes on Windows, so normalise both sides.
+    path = file_info['path'].replace('\\', '/')
+    base = root.replace('\\', '/')
+    if path.startswith(base):
+        path = path[len(base):]
+    # Whatever precedes the last separator is the folder part of the path.
+    folder = path.rpartition('/')[0]
+    tags = [part for part in folder.split('/') if part]
+    print(tags)
+    return tags
+
+
+def process_file_getinfo(file_info):
+    """Collect what is stored for a file; for now only its tags."""
+    return maketags(file_info)
+
+
+def _tag_handler(mime_type):
+    """Build the handler for *mime_type*; every handler only tags for now."""
+    def handler(file_info):
+        return process_file_getinfo(file_info)
+    handler.__doc__ = f"Handle files of type {mime_type}."
+    return handler
+
+
+functext = _tag_handler('text/plain')
+funcWebPages = _tag_handler('text/html')
+funcMarkdown = _tag_handler('text/markdown')
+funcXML = _tag_handler('application/xml')
+funcPDF = _tag_handler('application/pdf')
+funcDOC = _tag_handler('application/msword')
+funcDOCX = _tag_handler('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
+funcXLS = _tag_handler('application/vnd.ms-excel')
+funcXLSX = _tag_handler('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
+funcPPT = _tag_handler('application/vnd.ms-powerpoint')
+funcPPTX = _tag_handler('application/vnd.openxmlformats-officedocument.presentationml.presentation')
+funcRTF = _tag_handler('application/rtf')
+funcJPG = _tag_handler('image/jpeg')
+funcPNG = _tag_handler('image/png')
+funcGIF = _tag_handler('image/gif')
+funcBMP = _tag_handler('image/bmp')
+funcTIFF = _tag_handler('image/tiff')
+funcJavaScript = _tag_handler('application/javascript')
+funcZIP = _tag_handler('application/zip')
+funcGZIP = _tag_handler('application/gzip')
+funcMP3 = _tag_handler('audio/mpeg')
+funcMP4 = _tag_handler('video/mp4')
+funcWAV = _tag_handler('audio/wav')
+funcOGG = _tag_handler('audio/ogg')
+funcWEBM = _tag_handler('video/webm')
+funcJSON = _tag_handler('application/json')
+funcYAML = _tag_handler('application/x-yaml')
+funcEPUB = _tag_handler('application/epub+zip')
+funcMOBI = _tag_handler('application/x-mobipocket-ebook')
+funcnone = _tag_handler('unknown')
diff --git 
a/FREAloader.Dockerfile b/FREAloader.Dockerfile
new file mode 100644
index 00000000..ebd46cc9
--- /dev/null
+++ b/FREAloader.Dockerfile
@@ -0,0 +1,25 @@
+FROM langchain/langchain
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+
+RUN pip install --upgrade -r requirements.txt
+
+COPY FREAloader.py .
+COPY FREAloadcontent.py .
+COPY utils.py .
+COPY chains.py .
+COPY images ./images
+
+EXPOSE 8506
+
+HEALTHCHECK CMD curl --fail http://localhost:8506/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "FREAloader.py", "--server.port=8506", "--server.address=0.0.0.0"]
diff --git a/FREAloader.py b/FREAloader.py
new file mode 100644
index 00000000..967c0f2c
--- /dev/null
+++ b/FREAloader.py
@@ -0,0 +1,226 @@
+import os
+import requests
+import mimetypes
+from dotenv import load_dotenv
+from langchain_community.graphs import Neo4jGraph
+import streamlit as st
+from streamlit.logger import get_logger
+from chains import load_embedding_model
+from utils import create_constraints, create_vector_index
+from PIL import Image
+import FREAloadcontent as FC
+from pdfreader import SimplePDFViewer
+
+
+load_dotenv(".env")
+
+url = os.getenv("NEO4J_URI")
+username = os.getenv("NEO4J_USERNAME")
+password = os.getenv("NEO4J_PASSWORD")
+ollama_base_url = os.getenv("OLLAMA_BASE_URL")
+embedding_model_name = os.getenv("EMBEDDING_MODEL")
+# Remapping for Langchain Neo4j integration
+os.environ["NEO4J_URL"] = url
+
+logger = get_logger(__name__)
+
+#results = read_files_info('C:/SyncedFolder/Team Shares/FREA/')
+
+#so_api_base_url = "https://api.stackexchange.com/2.3/search/advanced"
+#next(results)
+embeddings, dimension = load_embedding_model(
+    embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
+)
+
+# if Neo4j is local, you can go to http://localhost:7474/ to browse the database
+neo4j_graph = Neo4jGraph(url=url, username=username, password=password)
+
+create_constraints(neo4j_graph)
+create_vector_index(neo4j_graph, dimension)
+
+def read_files_info(directory='.'):
+    """Yield a dict of path, name, MIME type, size and timestamps per file under *directory*."""
+    for root, _, files in os.walk(directory):
+        for filename in files:
+            file_path = os.path.join(root, filename)
+            info = os.stat(file_path)
+            yield {
+                'path': file_path,
+                'name': filename,
+                'type': mimetypes.guess_type(file_path)[0],
+                'size': info.st_size,
+                'creation_time': info.st_ctime,
+                'modification_time': info.st_mtime
+            }
+
+def get_file_info():
+    """Dispatch the next file of the ``results`` generator; do nothing once it is exhausted."""
+    file_info = next(results, None)
+    if file_info is not None:
+        switch_case(file_info['type'], file_info)
+
+results = read_files_info('C:/SyncedFolder/Team Shares/FREA/')
+
+def switch_case(value,file_info):
+    """Call the FC handler registered for MIME type *value* (None when the type is unknown)."""
+    switch = {
+        'text/plain': FC.functext,
+        'text/html': FC.funcWebPages,
+        'text/markdown': FC.funcMarkdown,
+        'application/xml': FC.funcXML,
+        'application/pdf': FC.funcPDF,
+        'application/msword': FC.funcDOC,
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': FC.funcDOCX,
+        'application/vnd.ms-excel': FC.funcXLS,
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': FC.funcXLSX,
+        'application/vnd.ms-powerpoint': FC.funcPPT,
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation': FC.funcPPTX,
+        'application/rtf': FC.funcRTF,
+        'image/jpeg': FC.funcJPG,
+        'image/png': FC.funcPNG,
+        'image/gif': FC.funcGIF,
+        'image/bmp': FC.funcBMP,
+        'image/tiff': FC.funcTIFF,
+        'application/javascript': FC.funcJavaScript,
+        'application/zip': FC.funcZIP,
+        'application/gzip': FC.funcGZIP,
+        'audio/mpeg': FC.funcMP3,
+        'video/mp4': FC.funcMP4,
+        'audio/wav': FC.funcWAV,
+        'audio/ogg': FC.funcOGG,
+        'video/webm': FC.funcWEBM,
+        'application/json': FC.funcJSON,
+        'application/x-yaml': FC.funcYAML,
+        'application/epub+zip': FC.funcEPUB,
+        'application/x-mobipocket-ebook': FC.funcMOBI,
+        None: FC.funcnone,
+    }
+    func = switch.get(value)
+    if func:
+        func(file_info)
+
else: + print(f"No function found for file type {value}") + + +def insert_so_data(): + i = 1 + while i <= 20: + print(i) + i += 1 + +def load_so_data(tag: str = "neo4j", page: int = 1) -> None: + parameters = ( + + ) + #data = requests.get(so_api_base_url + parameters).json() + #insert_so_data(): + + +def load_high_score_so_data() -> None: + parameters = ( + + ) + data = requests.get(so_api_base_url + parameters).json() + insert_so_data(data) + + + + + + +''' +def insert_so_data(data: dict) -> None: + # Calculate embedding values for questions and answers + for q in data["items"]: + question_text = q["title"] + "\n" + q["body_markdown"] + q["embedding"] = embeddings.embed_query(question_text) + for a in q["answers"]: + a["embedding"] = embeddings.embed_query( + question_text + "\n" + a["body_markdown"] + ) + + # Cypher, the query language of Neo4j, is used to import the data + # https://neo4j.com/docs/getting-started/cypher-intro/ + # https://neo4j.com/docs/cypher-cheat-sheet/5/auradb-enterprise/ + import_query = """ + UNWIND $data AS q + MERGE (question:Question {id:q.question_id}) + ON CREATE SET question.title = q.title, question.link = q.link, question.score = q.score, + question.favorite_count = q.favorite_count, question.creation_date = datetime({epochSeconds: q.creation_date}), + question.body = q.body_markdown, question.embedding = q.embedding + FOREACH (tagName IN q.tags | + MERGE (tag:Tag {name:tagName}) + MERGE (question)-[:TAGGED]->(tag) + ) + FOREACH (a IN q.answers | + MERGE (question)<-[:ANSWERS]-(answer:Answer {id:a.answer_id}) + SET answer.is_accepted = a.is_accepted, + answer.score = a.score, + answer.creation_date = datetime({epochSeconds:a.creation_date}), + answer.body = a.body_markdown, + answer.embedding = a.embedding + MERGE (answerer:User {id:coalesce(a.owner.user_id, "deleted")}) + ON CREATE SET answerer.display_name = a.owner.display_name, + answerer.reputation= a.owner.reputation + MERGE (answer)<-[:PROVIDED]-(answerer) + ) + WITH * 
WHERE NOT q.owner.user_id IS NULL + MERGE (owner:User {id:q.owner.user_id}) + ON CREATE SET owner.display_name = q.owner.display_name, + owner.reputation = q.owner.reputation + MERGE (owner)-[:ASKED]->(question) + """ + neo4j_graph.query(import_query, {"data": data["items"]}) +''' + +# Streamlit +def get_tag() -> str: + input_text = st.text_input( + "Which tag questions do you want to import?", value="test automation" + ) + return input_text + + +def get_pages(): + col1, col2 = st.columns(2) + with col1: + num_pages = st.number_input( + "Number of pages (100 questions per page)", step=1, min_value=1 + ) + with col2: + start_page = st.number_input("Start page", step=1, min_value=1) + st.caption("Only questions with answers will be imported.") + return (int(num_pages), int(start_page)) + + +def render_page(): + datamodel_image = Image.open("./images/datamodel.png") + st.header("StackOverflow Loader") + st.subheader("Choose StackOverflow tags to load into Neo4j") + st.caption("Go to http://localhost:7474/ to explore the graph.") + + #user_input = get_tag() + #num_pages, start_page = get_pages() + + if st.button("Import", type="primary"): + with st.spinner("Loading... This might take a minute or two."): + try: + for page in range(1, num_pages + 1): + load_so_data(user_input, start_page + (page - 1)) + st.success("Import successful", icon="✅") + st.caption("Data model") + st.image(datamodel_image) + st.caption("Go to http://localhost:7474/ to interact with the database") + except Exception as e: + st.error(f"Error: {e}", icon="🚨") + with st.expander("Highly ranked questions rather than tags?"): + if st.button("Import highly ranked questions"): + with st.spinner("Loading... 
This might take a minute or two."): + try: + load_high_score_so_data() + st.success("Import successful", icon="✅") + except Exception as e: + st.error(f"Error: {e}", icon="🚨") + + +render_page() diff --git a/__pycache__/chains.cpython-310.pyc b/__pycache__/chains.cpython-310.pyc new file mode 100644 index 00000000..1a37e95b Binary files /dev/null and b/__pycache__/chains.cpython-310.pyc differ diff --git a/__pycache__/utils.cpython-310.pyc b/__pycache__/utils.cpython-310.pyc new file mode 100644 index 00000000..865c0a7a Binary files /dev/null and b/__pycache__/utils.cpython-310.pyc differ diff --git a/docker-compose.yml b/docker-compose.yml index 3a1bbc08..7dc8f029 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -36,7 +36,7 @@ services: - 7687:7687 - 7474:7474 volumes: - - $PWD/data:/data + - "D:/data:/data" environment: - NEO4J_AUTH=${NEO4J_USERNAME-neo4j}/${NEO4J_PASSWORD-password} - NEO4J_PLUGINS=["apoc"] @@ -91,6 +91,47 @@ services: - 8081:8080 - 8502:8502 + frealoader: + build: + context: . 
+ dockerfile: FREAloader.Dockerfile + volumes: + - $PWD/embedding_model:/embedding_model + environment: + - NEO4J_URI=${NEO4J_URI-neo4j://database:7687} + - NEO4J_PASSWORD=${NEO4J_PASSWORD-password} + - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j} + - OPENAI_API_KEY=${OPENAI_API_KEY-} + - GOOGLE_API_KEY=${GOOGLE_API_KEY-} + - OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434} + - EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer} + - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"} + - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false} + - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT} + - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY} + - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} + - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION} + networks: + - net + depends_on: + database: + condition: service_healthy + pull-model: + condition: service_completed_successfully + x-develop: + watch: + - action: rebuild + path: . + ignore: + - bot.py + - pdf_bot.py + - api.py + - front-end/ + ports: + - 8082:8080 + - 8506:8506 + bot: build: diff --git a/functions/YoLo/weights/person_yolov8m-seg.pt b/functions/YoLo/weights/person_yolov8m-seg.pt new file mode 100644 index 00000000..d17330b5 Binary files /dev/null and b/functions/YoLo/weights/person_yolov8m-seg.pt differ diff --git a/functions/YoLo/yolo_seg b/functions/YoLo/yolo_seg new file mode 100644 index 00000000..1f30a3b5 --- /dev/null +++ b/functions/YoLo/yolo_seg @@ -0,0 +1,80 @@ +import torch +from ultralytics import YOLO +import cv2 +import matplotlib.pyplot as plt +import numpy as np + + + + +def segment_image(image_path, model_weights, output_path=None): + + # Load the YOLOv8 model with the specified weights + model = YOLO(model_weights) + + # Read the image + image = cv2.imread(image_path) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Perform segmentation + results = model(image_rgb) + + # Extract the segmented mask + segmented_image = 
results[0].masks.data[0].numpy()
+
+    # Convert the mask to a binary mask
+    segmented_image = (segmented_image > 0.5).astype('uint8') * 255
+
+    # Save the segmented image if output path is specified
+    if output_path:
+        cv2.imwrite(output_path, segmented_image)
+
+    # Return the segmented image
+    return segmented_image
+
+
+def image_to_numpy_opencv(image_path):
+    """Read *image_path* with OpenCV and return the pixel array."""
+    image_array = cv2.imread(image_path)
+    return image_array
+
+def load_mask(mask_path):
+    """Load *mask_path* as grayscale and threshold it (above 127) to a 0/1 array."""
+    mask_image = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
+    _, mask_array = cv2.threshold(mask_image, 127, 1, cv2.THRESH_BINARY)
+    return mask_array
+
+
+def extract_masked_pixels(image, mask):
+    """Return an array shaped like *image* that is zero wherever *mask* is not 1."""
+    # Ensure the mask is binary
+    mask = mask.astype(np.uint8)
+
+    # Create an empty image with the same dimensions as the original
+    masked_image = np.zeros_like(image)
+
+    # Copy the pixels from the original image to the masked image where the mask is 1
+    masked_image[mask == 1] = image[mask == 1]
+
+    return masked_image
+
+
+if __name__ == "__main__":
+    # Example usage
+    original_image = './images/me.jpg'
+    model_weights = './functions/YoLo/weights/person_yolov8m-seg.pt'  # Path to open-source YOLOv8 weights
+    mask_path = './images/output_segmented.jpg'
+
+    segment_image(original_image, model_weights, mask_path)
+
+    original_imageNP = image_to_numpy_opencv(original_image)
+    binary_mask = load_mask(mask_path)  # Ensure the mask is loaded correctly
+
+    # Mask the decoded image array, not the path string.
+    result = extract_masked_pixels(original_imageNP, binary_mask)
+    cv2.imwrite('masked_image.png', result)
+
+
+
+
+
diff --git a/functions/functions.py b/functions/functions.py
new file mode 100644
index 00000000..c3312f35
--- /dev/null
+++ b/functions/functions.py
@@ -0,0 +1,42 @@
+import os
+from neo4j import GraphDatabase
+from dotenv import load_dotenv
+load_dotenv()
+url = os.getenv("NEO4J_URI")
+username = os.getenv("NEO4J_USERNAME")
+password = os.getenv("NEO4J_PASSWORD")
+ollama_base_url = os.getenv("OLLAMA_BASE_URL")
+embedding_model_name = os.getenv("EMBEDDING_MODEL")
+llm_name = os.getenv("LLM")
+# Remapping 
for Langchain Neo4j integration
+os.environ["NEO4J_URL"] = url
+
+# Connection details are the NEO4J_URI, NEO4J_USERNAME and NEO4J_PASSWORD
+# environment variables read at the top of this module. The commented-out
+# host and credentials that stood here named variables (neo4j_uri, ...) that
+# were never defined, so building the driver raised NameError on import.
+
+# Connect to the Neo4j database
+driver = GraphDatabase.driver(url, auth=(username, password))
+
+cypher_query = """
+MATCH (n)
+RETURN n
+"""
+
+def run_cypher_query(query, parameters=None):
+    """Run *query* (with optional *parameters*) and return all records as a list."""
+    with driver.session() as session:
+        return list(session.run(query, parameters))
+
+
+# Run the Cypher query
+#results = run_cypher_query(cypher_query)
+
+# Process and print the results
+#for record in results:
+    ###print(record)
+
+# Close the Neo4j driver
+#driver.close()
+#
\ No newline at end of file
diff --git a/images/me.jpg b/images/me.jpg
new file mode 100644
index 00000000..6f8dfbb8
Binary files /dev/null and b/images/me.jpg differ
diff --git a/images/output_segmented.jpg b/images/output_segmented.jpg
new file mode 100644
index 00000000..bd350143
Binary files /dev/null and b/images/output_segmented.jpg differ
diff --git a/mimetype/frea_load_DOC_neo.py b/mimetype/frea_load_DOC_neo.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mimetype/frea_load_image_neo.py b/mimetype/frea_load_image_neo.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mimetype/frea_load_pdf_neo.py b/mimetype/frea_load_pdf_neo.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mimetype/frea_load_sheet_neo.py b/mimetype/frea_load_sheet_neo.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mimetype/frea_mimetype_switch_neo.py b/mimetype/frea_mimetype_switch_neo.py
new file mode 100644
index 00000000..a87e1cc6
--- /dev/null
+++ b/mimetype/frea_mimetype_switch_neo.py
@@ -0,0 +1,37 @@
+
+##Plain Text: text/plain
+#HTML: text/html
+#Markdown: text/markdown
+#XML: application/xml
+#Documents
+#PDF: application/pdf
+#Microsoft Word: application/msword (DOC), application/vnd.openxmlformats-officedocument.wordprocessingml.document (DOCX) +#Microsoft Excel: application/vnd.ms-excel (XLS), application/vnd.openxmlformats-officedocument.spreadsheetml.sheet (XLSX) +#Microsoft PowerPoint: application/vnd.ms-powerpoint (PPT), application/vnd.openxmlformats-officedocument.presentationml.presentation (PPTX) +#Rich Text Format: application/rtf +#Images +#JPEG: image/jpeg +#PNG: image/png +#GIF: image/gif +#BMP: image/bmp +#TIFF: image/tiff +#Web Content +#Web Pages: text/html, which includes embedded images, scripts, etc. +#CSS: text/css +#JavaScript: application/javascript +#Archives and Compressed Files +#ZIP: application/zip +#GZIP: application/gzip +#Multimedia +#MP3: audio/mpeg +#MP4: video/mp4 +#WAV: audio/wav +#OGG: audio/ogg +#WebM: video/webm +#Others +#JSON: application/json +#YAML: application/x-yaml +#CSV: text/csv +#Specialized Formats +#EPUB: application/epub+zip +#MOBI: application/x-mobipocket-ebook \ No newline at end of file diff --git a/multisearch.py b/multisearch.py new file mode 100644 index 00000000..e9221d0b --- /dev/null +++ b/multisearch.py @@ -0,0 +1,141 @@ +import os +import json +import cv2 +import requests +import torch +import networkx as nx +import matplotlib.pyplot as plt +from neo4j import GraphDatabase +from transformers import TrOCRProcessor, VisionEncoderDecoderModel +from fastapi import FastAPI, UploadFile, File +from langchain.llms import OpenAI +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Neo4j Configuration +NEO4J_URI = "bolt://localhost:7687" +NEO4J_USER = "neo4j" +NEO4J_PASSWORD = "password" +STACKAI_LLM_API = "http://localhost:8000/v1/completions" + +# Initialize FastAPI +app = FastAPI() + +# Load OCR Model (TrOCR) +processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") +model = 
VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + +# Connect to Neo4j +driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) + +def preprocess_image(image_path): + """Preprocess image for OCR.""" + image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) + image = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + return image + +def extract_text(image_path): + """Extract text from handwritten notes.""" + image = preprocess_image(image_path) + image = cv2.resize(image, (1024, 1024)) + + pixel_values = processor(image, return_tensors="pt").pixel_values + generated_ids = model.generate(pixel_values) + extracted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + return extracted_text + +def extract_math(image_path): + """Extract mathematical formulas.""" + response = requests.post("https://huggingface.co/breezedeus/pix2text-mfr", + files={"file": open(image_path, "rb")}) + return response.json().get("text", "") + +def extract_chemistry(image_path): + """Extract chemical formulas.""" + response = requests.post("https://chemocr.ai/api/v1/predict", + files={"file": open(image_path, "rb")}) + return response.json().get("chemical_formula", "") + +def summarize_content(text): + """Summarize scientific notes using LLM.""" + prompt = PromptTemplate( + input_variables=["text"], + template="Summarize this scientific note and extract key concepts: {text}" + ) + chain = LLMChain(llm=OpenAI(base_url=STACKAI_LLM_API, api_key="your-api-key"), prompt=prompt) + return chain.run(text) + +def store_in_neo4j(title, text, math, chem, summary): + """Store extracted information in Neo4j as a knowledge graph.""" + with driver.session() as session: + session.run(""" + CREATE (n:Note {title: $title, text: $text, summary: $summary}) + """, title=title, text=text, summary=summary) + + for formula in math.split("\n"): + if formula: + session.run(""" + MATCH (n:Note {title: $title}) + CREATE 
(m:MathFormula {formula: $formula})-[:APPEARS_IN]->(n)
+                """, title=title, formula=formula)
+
+        for compound in chem.split("\n"):
+            if compound:
+                session.run("""
+                    MATCH (n:Note {title: $title})
+                    CREATE (c:Chemical {compound: $compound})-[:APPEARS_IN]->(n)
+                """, title=title, compound=compound)
+
+@app.post("/process/")
+async def process_image(file: UploadFile = File(...)):
+    """Process uploaded image and store in Neo4j."""
+    # file.filename is client-controlled: strip directories so writes stay in ./temp.
+    file_name = os.path.basename(file.filename or "") or "upload"
+    os.makedirs("./temp", exist_ok=True)
+    file_path = os.path.join("./temp", file_name)
+    with open(file_path, "wb") as f:
+        f.write(await file.read())
+
+    extracted_text = extract_text(file_path)
+    extracted_math = extract_math(file_path)
+    extracted_chem = extract_chemistry(file_path)
+
+    full_content = f"{extracted_text}\nMathematical Formulas: {extracted_math}\nChemical Formulas: {extracted_chem}"
+    summary = summarize_content(full_content)
+
+    store_in_neo4j(file_name, extracted_text, extracted_math, extracted_chem, summary)
+
+    return {"summary": summary, "math": extracted_math, "chem": extracted_chem}
+
+@app.get("/knowledge-graph/")
+def get_knowledge_graph():
+    """Retrieve knowledge graph data from Neo4j and visualize it."""
+    query = "MATCH (n)-[r]->(m) RETURN n, r, m"
+    relationships = []
+
+    def label(node):
+        # Notes carry a title, formulas a formula, chemicals a compound.
+        key = "title" if "title" in node else "formula" if "formula" in node else "compound"
+        return node[key]
+
+    with driver.session() as session:
+        for record in session.run(query):
+            relationships.append((label(record["n"]), label(record["m"])))
+
+    G = nx.Graph()
+    G.add_edges_from(relationships)
+
+    plt.figure(figsize=(10, 6))
+    nx.draw(G, with_labels=True, node_color="lightblue", edge_color="gray", node_size=2000, font_size=10)
+    plt.title("Knowledge Graph")
+
plt.savefig("./temp/knowledge_graph.png") + return {"message": "Knowledge graph updated. View the generated graph at /temp/knowledge_graph.png"} + +if __name__ == "__main__": + import uvicorn + os.makedirs("./temp", exist_ok=True) + uvicorn.run(app, host="0.0.0.0", port=8080) \ No newline at end of file diff --git a/process_documents.py b/process_documents.py new file mode 100644 index 00000000..f74948f7 --- /dev/null +++ b/process_documents.py @@ -0,0 +1,156 @@ +import os +import cv2 +import json +import requests +import torch +import networkx as nx +import matplotlib.pyplot as plt +from pdfminer.high_level import extract_text as extract_pdf_text +from docx import Document +from neo4j import GraphDatabase +from transformers import TrOCRProcessor, VisionEncoderDecoderModel +from langchain.llms import OpenAI +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Neo4j Configuration +NEO4J_URI = "bolt://localhost:7687" +NEO4J_USER = "neo4j" +NEO4J_PASSWORD = "password" +STACKAI_LLM_API = "http://localhost:8000/v1/completions" + +# Initialize Neo4j connection +driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) + +# Load TrOCR for OCR +processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") +model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + +def preprocess_image(image_path): + """Preprocess image for OCR.""" + image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) + image = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + return image + +def extract_text_from_image(image_path): + """Extract handwritten text from image using TrOCR.""" + image = preprocess_image(image_path) + pixel_values = processor(image, return_tensors="pt").pixel_values + generated_ids = model.generate(pixel_values) + return processor.batch_decode(generated_ids, 
skip_special_tokens=True)[0] + +def extract_text_from_pdf(pdf_path): + """Extract text from PDF using pdfminer.""" + return extract_pdf_text(pdf_path) + +def extract_text_from_docx(docx_path): + """Extract text from DOCX using python-docx.""" + doc = Document(docx_path) + return "\n".join([para.text for para in doc.paragraphs]) + +def extract_text_from_txt(txt_path): + """Extract text from TXT file.""" + with open(txt_path, "r", encoding="utf-8") as file: + return file.read() + +def extract_math(image_path): + """Extract mathematical formulas.""" + response = requests.post("https://huggingface.co/breezedeus/pix2text-mfr", + files={"file": open(image_path, "rb")}) + return response.json().get("text", "") + +def extract_chemistry(image_path): + """Extract chemical formulas.""" + response = requests.post("https://chemocr.ai/api/v1/predict", + files={"file": open(image_path, "rb")}) + return response.json().get("chemical_formula", "") + +def summarize_content(text): + """Summarize extracted content using LLM.""" + prompt = PromptTemplate( + input_variables=["text"], + template="Summarize this scientific note and extract key concepts: {text}" + ) + chain = LLMChain(llm=OpenAI(base_url=STACKAI_LLM_API, api_key="your-api-key"), prompt=prompt) + return chain.run(text) + +def store_in_neo4j(filename, text, math, chem, summary): + """Store extracted information in Neo4j as a knowledge graph.""" + with driver.session() as session: + session.run(""" + CREATE (n:Document {filename: $filename, text: $text, summary: $summary}) + """, filename=filename, text=text, summary=summary) + + for formula in math.split("\n"): + if formula: + session.run(""" + MATCH (n:Document {filename: $filename}) + CREATE (m:MathFormula {formula: $formula})-[:APPEARS_IN]->(n) + """, filename=filename, formula=formula) + + for compound in chem.split("\n"): + if compound: + session.run(""" + MATCH (n:Document {filename: $filename}) + CREATE (c:Chemical {compound: $compound})-[:APPEARS_IN]->(n) + """, 
filename=filename, compound=compound) + +def process_files(directory): + """Recursively process files in a directory and store them in Neo4j.""" + for root, _, files in os.walk(directory): + for file in files: + file_path = os.path.join(root, file) + print(f"Processing: {file_path}") + + extracted_text = "" + extracted_math = "" + extracted_chem = "" + + if file.lower().endswith((".jpg", ".jpeg", ".png")): + extracted_text = extract_text_from_image(file_path) + extracted_math = extract_math(file_path) + extracted_chem = extract_chemistry(file_path) + elif file.lower().endswith(".pdf"): + extracted_text = extract_text_from_pdf(file_path) + elif file.lower().endswith(".docx"): + extracted_text = extract_text_from_docx(file_path) + elif file.lower().endswith(".txt"): + extracted_text = extract_text_from_txt(file_path) + + full_content = f"{extracted_text}\nMath: {extracted_math}\nChem: {extracted_chem}" + summary = summarize_content(full_content) + + store_in_neo4j(file, extracted_text, extracted_math, extracted_chem, summary) + +def generate_knowledge_graph(): + """Generate a visualization of the knowledge graph.""" + query = "MATCH (n)-[r]->(m) RETURN n, r, m" + nodes = [] + relationships = [] + + with driver.session() as session: + results = session.run(query) + for record in results: + n, r, m = record["n"], record["r"], record["m"] + nodes.append(n["filename"] if "filename" in n else n["formula"] if "formula" in n else n["compound"]) + nodes.append(m["filename"] if "filename" in m else m["formula"] if "formula" in m else m["compound"]) + relationships.append((n["filename"] if "filename" in n else n["formula"] if "formula" in n else n["compound"], + m["filename"] if "filename" in m else m["formula"] if "formula" in m else m["compound"])) + + G = nx.Graph() + G.add_edges_from(relationships) + + plt.figure(figsize=(10, 6)) + nx.draw(G, with_labels=True, node_color="lightblue", edge_color="gray", node_size=2000, font_size=10) + plt.title("Knowledge Graph") + 
plt.savefig("knowledge_graph.png") + print("Knowledge graph saved as knowledge_graph.png") + +if __name__ == "__main__": + dir_path = input("Enter the directory path to process: ") + process_files(dir_path) + generate_knowledge_graph() \ No newline at end of file diff --git a/pull_model.Dockerfile b/pull_model.Dockerfile index b06625f7..ca1fcbb0 100644 --- a/pull_model.Dockerfile +++ b/pull_model.Dockerfile @@ -15,31 +15,32 @@ COPY < /dev/null || ./bin/ollama pull %s'" llm llm)) + (if (= :stop v) :stopped + (do (println (format "... pulling model (%ss) - will take several minutes" (* n 10))) + (recur (inc n)))))) + + (process/shell {:env {"OLLAMA_HOST" url + "HOME" (System/getProperty "user.home")} + :out :inherit :err :inherit} + (format "bash -c './bin/ollama show %s --modelfile > /dev/null || ./bin/ollama pull %s'" llm llm)) + (async/>!! done :stop)) (println "OLLAMA model only pulled if both LLM and OLLAMA_BASE_URL are set and the LLM model is not gpt"))) (catch Throwable _ (System/exit 1))) EOF - -ENTRYPOINT ["bb", "-f", "pull_model.clj"] - diff --git a/pull_model.clj b/pull_model.clj new file mode 100644 index 00000000..5251db18 --- /dev/null +++ b/pull_model.clj @@ -0,0 +1,35 @@ +(ns pull-model + (:require [babashka.process :as process] + [clojure.core.async :as async])) + +(try + (let [llm (get (System/getenv) "LLM") + url (get (System/getenv) "OLLAMA_BASE_URL")] + (println (format "pulling ollama model %s using %s" llm url)) + (if (and llm + url + (not (#{"gpt-4" "gpt-3.5" "claudev2" "gpt-4o" "gpt-4-turbo"} llm)) + (not (some #(.startsWith llm %) ["ai21.jamba-instruct-v1:0" + "amazon.titan" + "anthropic.claude" + "cohere.command" + "meta.llama" + "mistral.mi"]))) + + ;; ---------------------------------------------------------------------- + ;; just call `ollama pull` here - create OLLAMA_HOST from OLLAMA_BASE_URL + ;; ---------------------------------------------------------------------- + ;; TODO - this still doesn't show progress properly when run from 
docker compose + + (let [done (async/chan)] + (async/go-loop [n 0] + (let [[v _] (async/alts! [done (async/timeout 5000)])] + (if (= :stop v) :stopped (do (println (format "... pulling model (%ss) - will take several minutes" (* n 10))) (recur (inc n)))))) + + (process/shell {:env {"OLLAMA_HOST" url "HOME" (System/getProperty "user.home")} :out :inherit :err :inherit} + (format "bash -c './bin/ollama show %s --modelfile > /dev/null || ./bin/ollama pull %s'" llm llm)) + (async/>!! done :stop)) + + (println "OLLAMA model only pulled if both LLM and OLLAMA_BASE_URL are set and the LLM model is not gpt"))) + + (catch Throwable _ (System/exit 1)))