Pg vector #802

Open · wants to merge 28 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions CREATE EXTENSION IF NOT EXISTS vector;.sql
@@ -0,0 +1 @@
CREATE EXTENSION IF NOT EXISTS vector;
Binary file added Results/New Microsoft Word Document.docx
Binary file not shown.
Binary file removed SOURCE_DOCUMENTS/Orca_paper.pdf
Binary file not shown.
Binary file added SOURCE_DOCUMENTS/Saudi Sanitaryware Market.pdf
Binary file not shown.
Binary file added chroma.sqlite3
Binary file not shown.
23 changes: 15 additions & 8 deletions constants.py
@@ -2,11 +2,12 @@

# from dotenv import load_dotenv
from chromadb.config import Settings
# from faissdb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain_community.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader


# load_dotenv()
@@ -19,6 +20,9 @@

MODELS_PATH = "./models"

# INDEX_PATH = "faiss_index.index"
# METADATA_PATH = "faiss_metadata.pkl"

# Can be changed to a specific number
INGEST_THREADS = os.cpu_count() or 8

@@ -59,7 +63,7 @@

# Default Instructor Model
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)

# EMBEDDING_MODEL_NAME = 'TheBloke/Mistral-7B-Instruct-v0.1-GGUF'
####
#### OTHER EMBEDDING MODEL OPTIONS
####
@@ -107,15 +111,18 @@
# MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"

# LLAMA 3 # use for Apple Silicon
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_BASENAME = None
# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# MODEL_ID = "TheBloke/Llama-2-7B-32K-Instruct-GPTQ"
# MODEL_BASENAME = None#"Llama-2-7B-32K-Instruct-GPTQ"
# MODEL_BASENAME = "model.safetensors.awq"


# LLAMA 3 # use for NVIDIA GPUs
# MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"
# MODEL_BASENAME = None

# MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"

# MODEL_ID = "TheBloke/Llama-2-70b-Chat-GGUF"
# MODEL_BASENAME = "llama-2-70b-chat.Q4_K_M.gguf"
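For context, here is a minimal sketch of how a MODEL_ID / MODEL_BASENAME pair such as the GGUF default above is typically resolved at load time. The helper name load_gguf_model is hypothetical, and the sketch assumes the huggingface_hub and llama-cpp-python packages; the repository's actual loader in run_localGPT may differ:

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def load_gguf_model(model_id: str, model_basename: str, n_ctx: int = 4096) -> Llama:
    # Fetch the quantized weights file from the Hugging Face Hub cache.
    model_path = hf_hub_download(repo_id=model_id, filename=model_basename)
    # Load it with the llama.cpp bindings; n_gpu_layers=0 keeps inference on CPU.
    return Llama(model_path=model_path, n_ctx=n_ctx, n_gpu_layers=0)

llm = load_gguf_model("TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
                      "mistral-7b-instruct-v0.1.Q8_0.gguf")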
Empty file added down.ipynb
Empty file.
226 changes: 190 additions & 36 deletions ingest.py
@@ -1,13 +1,26 @@
import logging
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import faiss
import pickle
from transformers import AutoModel, AutoTokenizer
import psycopg2

import click
import torch
from langchain.docstore.document import Document
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma, FAISS
from utils import get_embeddings
from langchain_community.docstore.in_memory import InMemoryDocstore

from langchain_cohere import CohereEmbeddings
# from langchain_core.documents import Document
# from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

from pgvector.psycopg import register_vector
import psycopg

from constants import (
CHROMA_SETTINGS,
@@ -16,9 +29,13 @@
INGEST_THREADS,
PERSIST_DIRECTORY,
SOURCE_DIRECTORY,
# INDEX_PATH,
# METADATA_PATH,

)



def file_log(logentry):
file1 = open("file_ingest.log", "a")
file1.write(logentry + "\n")
@@ -113,36 +130,59 @@ def split_documents(documents: list[Document]) -> tuple[list[Document], list[Document]]:
return text_docs, python_docs


@click.command()
@click.option(
"--device_type",
default="cuda" if torch.cuda.is_available() else "cpu",
type=click.Choice(
[
"cpu",
"cuda",
"ipu",
"xpu",
"mkldnn",
"opengl",
"opencl",
"ideep",
"hip",
"ve",
"fpga",
"ort",
"xla",
"lazy",
"vulkan",
"mps",
"meta",
"hpu",
"mtia",
],
),
help="Device to run on. (Default is cuda)",
)
# @click.command()
# @click.option(
# "--device_type",
# default="cuda" if torch.cuda.is_available() else "cpu",
# type=click.Choice(
# [
# "cpu",
# "cuda",
# "ipu",
# "xpu",
# "mkldnn",
# "opengl",
# "opencl",
# "ideep",
# "hip",
# "ve",
# "fpga",
# "ort",
# "xla",
# "lazy",
# "vulkan",
# "mps",
# "meta",
# "hpu",
# "mtia",
# ],
# ),
# help="Device to run on. (Default is cuda)",
# )
# def save_faiss_index(db, index_path, metadata_path):
# faiss.write_index(db.index, index_path)
# metadata = {
# "index_to_docstore_id": db.index_to_docstore_id,
# "docstore": db.docstore,
# }
# with open(metadata_path, "wb") as f:
# pickle.dump(metadata, f)

# def load_faiss_index(index_path, metadata_path):
# index = faiss.read_index(index_path)
# with open(metadata_path, "rb") as f:
# metadata = pickle.load(f)
# docstore = metadata["docstore"]
# index_to_docstore_id = metadata["index_to_docstore_id"]
# db = FAISS(index=index,
# docstore=docstore,
# index_to_docstore_id=index_to_docstore_id)
# return db
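# A simpler persistence path than the pickled-metadata helpers above: the
# langchain_community FAISS wrapper already ships save_local / load_local.
# A hedged sketch, assuming the texts and embeddings built in main() below
# (allow_dangerous_deserialization is required on recent versions because
# the docstore is pickled internally):
# db = FAISS.from_documents(texts, embeddings)
# db.save_local("DB/faiss")
# db = FAISS.load_local("DB/faiss", embeddings, allow_dangerous_deserialization=True)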
device_type = 'cpu'


def main(device_type):
print(f"Running on device: {device_type}")
# Load documents and split in chunks
logging.info(f"Loading documents from {SOURCE_DIRECTORY}")
documents = load_documents(SOURCE_DIRECTORY)
@@ -161,22 +201,136 @@ def main(device_type):

(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained in
their respective Hugging Face repository, project page, or GitHub repository.

"""

embeddings = get_embeddings(device_type)

logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}")

db = Chroma.from_documents(
texts,
embeddings,
persist_directory=PERSIST_DIRECTORY,
client_settings=CHROMA_SETTINGS,
# # See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://postgres:123456@localhost:5432/postgres" # Uses psycopg3!
# # "dbname=postgres user=postgres password=123456 host=localhost port=5432"
# connection.execute('CREATE EXTENSION IF NOT EXISTS vector')
# register_vector(connection)

# connection.execute('DROP TABLE IF EXISTS documents')
# connection.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))')

collection_name = "PG_VECTOR_SAudi"
# embeddings = CohereEmbeddings()
# -------------------------------
# Q
#--------------------------------
# connection = psycopg2.connect("dbname=postgres user=postgres password=123456 host=localhost port=5432")
# connection = connection.cursor()
# db = PGVector(
# documents= texts,
# embeddings=embeddings,
# collection_name=collection_name,
# connection=connection,
# use_jsonb=True,
# )
# db.add_documents(texts, ids=[doc.metadata["id"] for doc in texts])
# "dbname=postgres user=postgres password=123456 host=localhost port=5432"
# Changing to a more programmatic connection string
db = PGVector.from_documents(
documents=texts,
embedding=embeddings,
collection_name=collection_name,
connection=connection,
use_jsonb=True,
)
print(">>>>>>>>/n/n>>>>>>>>>>Connected AND Loaded to the database successfully!")

# collection_name = "PG_VECTOR_SAudi"
# db = PGVector.from_documents(
# embedding=embeddings,
# documents=texts,
# connection_string=CONNECTION_STRING,
# collection_name=collection_name,
# )
# -------------------------------
# Q
#--------------------------------
# db = Chroma.from_documents(
# texts,
# embeddings,
# persist_directory=PERSIST_DIRECTORY,
# client_settings=CHROMA_SETTINGS,
# )
# if os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
# db = load_faiss_index(INDEX_PATH, METADATA_PATH)
# logging.info("Loaded FAISS index and metadata from disk.")
# else:

# d = embeddings.shape[1]
# index = faiss.IndexFlatL2(d)
# index.add(embeddings)

# docstore = InMemoryDocstore()
# index_to_docstore_id = {i: doc["id"] for i, doc in enumerate(texts)}

# db = FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)

# save_faiss_index(db, INDEX_PATH, METADATA_PATH)
# logging.info("Saved FAISS index and metadata to disk.")

# Load the model and tokenizer
# model_name = EMBEDDING_MODEL_NAME
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# # Tokenize the input texts
# inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
# # Get the embeddings from the model
# with torch.no_grad():
# outputs = model(**inputs)
# # Extract the last hidden states (embeddings)
# embeddings = outputs.last_hidden_state
# # Pool the embeddings (e.g., mean pooling)
# pooled_embeddings = embeddings.mean(dim=1)
# # Convert the embeddings to a NumPy array
# numpy_embeddings = pooled_embeddings.cpu().numpy()

# # Get the dimension of the vectors
# vector_dimension = numpy_embeddings.shape[1]

# Create the FAISS index
# faiss_index = faiss.IndexFlatL2(vector_dimension)
# print(faiss_index.is_trained)
# # Add the embeddings to the index
# faiss_index.add(numpy_embeddings)
# # Save the index
# faiss.write_index(faiss_index, index_file_path)
# print(f"Index saved to {index_file_path}")
# print(faiss_index.ntotal)

# Define the directory and file name to save the index
# persist_dir = PERSIST_DIRECTORY
# index_file_path = os.path.join(persist_dir, 'faiss_index.index')

# # Load the index to verify
# faiss_index_loaded = faiss.read_index(index_file_path)
# print(f"Index loaded from {index_file_path}")

# Verify the loaded index
# print(f"Number of vectors in the loaded index: {faiss_index_loaded.ntotal}")

# db = FAISS.from_documents(
# texts,
# embeddings,
# # persist_directory=PERSIST_DIRECTORY,
# # client_settings=CHROMA_SETTINGS,
# )
# db.save_local("DB/faiss")

import argparse

if __name__ == "__main__":
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
)
main()
# parser = argparse.ArgumentParser(description="Ingest script for localGPT")
# parser.add_argument("--device_type", type=str, required=True, help="Device type (cpu or gpu)")
# args = parser.parse_args()
main(device_type="cpu")  # or args.device_type once the argparse block above is re-enabled
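To sanity-check an ingest run, here is a minimal retrieval sketch against the collection this script populates. It assumes the same connection string, collection name, and get_embeddings helper used above; similarity_search is PGVector's standard query method, and cosine distance is its default strategy:

from langchain_postgres.vectorstores import PGVector
from utils import get_embeddings  # the same helper ingest.py uses

connection = "postgresql+psycopg://postgres:123456@localhost:5432/postgres"
embeddings = get_embeddings("cpu")

# Reattach to the collection that ingest.py populated.
db = PGVector(
    embeddings=embeddings,
    collection_name="PG_VECTOR_SAudi",
    connection=connection,
    use_jsonb=True,
)

# Return the four nearest chunks to the query.
for doc in db.similarity_search("What is the size of the Saudi sanitaryware market?", k=4):
    print(doc.metadata.get("source"), "->", doc.page_content[:120])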
8 changes: 5 additions & 3 deletions localGPT_UI.py
Original file line number Diff line number Diff line change
@@ -2,14 +2,16 @@
import subprocess
import streamlit as st
from run_localGPT import load_model
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras import add_vertical_space
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

# Use the function in your Streamlit app; with the module-style import above,
# the call must be qualified because the module and function share a name.
add_vertical_space.add_vertical_space(10)

def model_memory():
# Adding history to the model.
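One note on the streamlit-extras import above: add_vertical_space is both a module and a function name, so the two import styles are not interchangeable. A short sketch of both, assuming streamlit-extras is installed:

# Style A: import the function directly.
from streamlit_extras.add_vertical_space import add_vertical_space
add_vertical_space(10)

# Style B: import the module under an alias and qualify the call.
from streamlit_extras import add_vertical_space as avs
avs.add_vertical_space(10)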
2 changes: 2 additions & 0 deletions postgres.session.sql
@@ -0,0 +1,2 @@
CREATE EXTENSION IF NOT EXISTS vector;
-- CREATE EXTENSION vector WITH SCHEMA public;
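For completeness, here is a small sketch of using the extension directly from Python with the psycopg and pgvector packages that ingest.py already imports. The credentials mirror the connection string used there, and the three-dimensional vectors are placeholders for a real embedding width such as 384:

import numpy as np
import psycopg
from pgvector.psycopg import register_vector

conn = psycopg.connect(
    "dbname=postgres user=postgres password=123456 host=localhost port=5432",
    autocommit=True,
)
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)  # teaches psycopg to adapt numpy arrays to vector columns

conn.execute("DROP TABLE IF EXISTS documents")
conn.execute("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(3))")
conn.execute(
    "INSERT INTO documents (content, embedding) VALUES (%s, %s)",
    ("hello world", np.array([1.0, 0.0, 0.0])),
)

# <-> is pgvector's L2-distance operator; the nearest row sorts first.
row = conn.execute(
    "SELECT content FROM documents ORDER BY embedding <-> %s LIMIT 1",
    (np.array([1.0, 0.1, 0.0]),),
).fetchone()
print(row[0])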