Pg vector #802

Open · wants to merge 28 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions CREATE EXTENSION IF NOT EXISTS vector;.sql
@@ -0,0 +1 @@
CREATE EXTENSION IF NOT EXISTS vector;
Binary file added Results/New Microsoft Word Document.docx
Binary file not shown.
Binary file removed SOURCE_DOCUMENTS/Orca_paper.pdf
Binary file not shown.
Binary file added SOURCE_DOCUMENTS/Saudi Sanitaryware Market.pdf
Binary file not shown.
Binary file added chroma.sqlite3
Binary file not shown.
23 changes: 15 additions & 8 deletions constants.py
@@ -2,11 +2,12 @@

# from dotenv import load_dotenv
from chromadb.config import Settings
# from faissdb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain_community.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader


# load_dotenv()
@@ -19,6 +20,9 @@

MODELS_PATH = "./models"

# INDEX_PATH = "faiss_index.index"
# METADATA_PATH = "faiss_metadata.pkl"

# Can be changed to a specific number
INGEST_THREADS = os.cpu_count() or 8

@@ -59,7 +63,7 @@

# Default Instructor Model
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)

# EMBEDDING_MODEL_NAME = 'TheBloke/Mistral-7B-Instruct-v0.1-GGUF'
####
#### OTHER EMBEDDING MODEL OPTIONS
####
@@ -107,15 +111,18 @@
# MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"

# LLAMA 3 # use for Apple Silicon
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_BASENAME = None
# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# MODEL_ID = "TheBloke/Llama-2-7B-32K-Instruct-GPTQ"
# MODEL_BASENAME = None#"Llama-2-7B-32K-Instruct-GPTQ"
# MODEL_BASENAME = "model.safetensors.awq"


# LLAMA 3 # use for NVIDIA GPUs
# MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"
# MODEL_BASENAME = None

# MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"

# MODEL_ID = "TheBloke/Llama-2-70b-Chat-GGUF"
# MODEL_BASENAME = "llama-2-70b-chat.Q4_K_M.gguf"
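For context, here is a minimal sketch of how a MODEL_ID / MODEL_BASENAME pair such as the GGUF default above is typically resolved at load time. The helper name load_gguf_model is hypothetical, and the sketch assumes the huggingface_hub and llama-cpp-python packages; the repository's actual loader in run_localGPT may differ:

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def load_gguf_model(model_id: str, model_basename: str, n_ctx: int = 4096) -> Llama:
    # Fetch the quantized weights file from the Hugging Face Hub cache.
    model_path = hf_hub_download(repo_id=model_id, filename=model_basename)
    # Load it with the llama.cpp bindings; n_gpu_layers=0 keeps inference on CPU.
    return Llama(model_path=model_path, n_ctx=n_ctx, n_gpu_layers=0)

llm = load_gguf_model("TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
                      "mistral-7b-instruct-v0.1.Q8_0.gguf")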
Empty file added down.ipynb
Empty file.
226 changes: 190 additions & 36 deletions ingest.py
@@ -1,13 +1,26 @@
import logging
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import faiss
import pickle
from transformers import AutoModel, AutoTokenizer
import psycopg2

import click
import torch
from langchain.docstore.document import Document
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma, FAISS
from utils import get_embeddings
from langchain_community.docstore.in_memory import InMemoryDocstore

from langchain_cohere import CohereEmbeddings
# from langchain_core.documents import Document
# from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

from pgvector.psycopg import register_vector
import psycopg

from constants import (
CHROMA_SETTINGS,
@@ -16,9 +29,13 @@
INGEST_THREADS,
PERSIST_DIRECTORY,
SOURCE_DIRECTORY,
# INDEX_PATH,
# METADATA_PATH,

)



def file_log(logentry):
file1 = open("file_ingest.log", "a")
file1.write(logentry + "\n")
@@ -113,36 +130,59 @@ def split_documents(documents: list[Document]) -> tuple[list[Document], list[Document]]:
return text_docs, python_docs


@click.command()
@click.option(
"--device_type",
default="cuda" if torch.cuda.is_available() else "cpu",
type=click.Choice(
[
"cpu",
"cuda",
"ipu",
"xpu",
"mkldnn",
"opengl",
"opencl",
"ideep",
"hip",
"ve",
"fpga",
"ort",
"xla",
"lazy",
"vulkan",
"mps",
"meta",
"hpu",
"mtia",
],
),
help="Device to run on. (Default is cuda)",
)
# @click.command()
# @click.option(
# "--device_type",
# default="cuda" if torch.cuda.is_available() else "cpu",
# type=click.Choice(
# [
# "cpu",
# "cuda",
# "ipu",
# "xpu",
# "mkldnn",
# "opengl",
# "opencl",
# "ideep",
# "hip",
# "ve",
# "fpga",
# "ort",
# "xla",
# "lazy",
# "vulkan",
# "mps",
# "meta",
# "hpu",
# "mtia",
# ],
# ),
# help="Device to run on. (Default is cuda)",
# )
# def save_faiss_index(db, index_path, metadata_path):
# faiss.write_index(db.index, index_path)
# metadata = {
# "index_to_docstore_id": db.index_to_docstore_id,
# "docstore": db.docstore,
# }
# with open(metadata_path, "wb") as f:
# pickle.dump(metadata, f)

# def load_faiss_index(index_path, metadata_path):
# index = faiss.read_index(index_path)
# with open(metadata_path, "rb") as f:
# metadata = pickle.load(f)
# docstore = metadata["docstore"]
# index_to_docstore_id = metadata["index_to_docstore_id"]
# db = FAISS(index=index,
# docstore=docstore,
# index_to_docstore_id=index_to_docstore_id)
# return db
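# A simpler persistence path than the pickled-metadata helpers above: the
# langchain_community FAISS wrapper already ships save_local / load_local.
# A hedged sketch, assuming the texts and embeddings built in main() below
# (allow_dangerous_deserialization is required on recent versions because
# the docstore is pickled internally):
# db = FAISS.from_documents(texts, embeddings)
# db.save_local("DB/faiss")
# db = FAISS.load_local("DB/faiss", embeddings, allow_dangerous_deserialization=True)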
device_type = 'cpu'


def main(device_type):
print(f"Running on device: {device_type}")
# Load documents and split in chunks
logging.info(f"Loading documents from {SOURCE_DIRECTORY}")
documents = load_documents(SOURCE_DIRECTORY)
@@ -161,22 +201,136 @@ def main(device_type):

(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained in
their respective Hugging Face repository, project page, or GitHub repository.

"""

embeddings = get_embeddings(device_type)

logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}")

db = Chroma.from_documents(
texts,
embeddings,
persist_directory=PERSIST_DIRECTORY,
client_settings=CHROMA_SETTINGS,
# # See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://postgres:123456@localhost:5432/postgres" # Uses psycopg3!
# # "dbname=postgres user=postgres password=123456 host=localhost port=5432"
# connection.execute('CREATE EXTENSION IF NOT EXISTS vector')
# register_vector(connection)

# connection.execute('DROP TABLE IF EXISTS documents')
# connection.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))')

collection_name = "PG_VECTOR_SAudi"
# embeddings = CohereEmbeddings()
# -------------------------------
# Q
#--------------------------------
# connection = psycopg2.connect("dbname=postgres user=postgres password=123456 host=localhost port=5432")
# connection = connection.cursor()
# db = PGVector(
# documents= texts,
# embeddings=embeddings,
# collection_name=collection_name,
# connection=connection,
# use_jsonb=True,
# )
# db.add_documents(texts, ids=[doc.metadata["id"] for doc in texts])
# "dbname=postgres user=postgres password=123456 host=localhost port=5432"
# Changing to a more programmatic connection string
db = PGVector.from_documents(
documents=texts,
embedding=embeddings,
collection_name=collection_name,
connection=connection,
use_jsonb=True,
)
print(">>>>>>>>/n/n>>>>>>>>>>Connected AND Loaded to the database successfully!")

# collection_name = "PG_VECTOR_SAudi"
# db = PGVector.from_documents(
# embedding=embeddings,
# documents=texts,
# connection_string=CONNECTION_STRING,
# collection_name=collection_name,
# )
# -------------------------------
# Q
#--------------------------------
# db = Chroma.from_documents(
# texts,
# embeddings,
# persist_directory=PERSIST_DIRECTORY,
# client_settings=CHROMA_SETTINGS,
# )
# if os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
# db = load_faiss_index(INDEX_PATH, METADATA_PATH)
# logging.info("Loaded FAISS index and metadata from disk.")
# else:

# d = embeddings.shape[1]
# index = faiss.IndexFlatL2(d)
# index.add(embeddings)

# docstore = InMemoryDocstore()
# index_to_docstore_id = {i: doc["id"] for i, doc in enumerate(texts)}

# db = FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)

# save_faiss_index(db, INDEX_PATH, METADATA_PATH)
# logging.info("Saved FAISS index and metadata to disk.")

# Load the model and tokenizer
# model_name = EMBEDDING_MODEL_NAME
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# # Tokenize the input texts
# inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
# # Get the embeddings from the model
# with torch.no_grad():
# outputs = model(**inputs)
# # Extract the last hidden states (embeddings)
# embeddings = outputs.last_hidden_state
# # Pool the embeddings (e.g., mean pooling)
# pooled_embeddings = embeddings.mean(dim=1)
# # Convert the embeddings to a NumPy array
# numpy_embeddings = pooled_embeddings.cpu().numpy()

# # Get the dimension of the vectors
# vector_dimension = numpy_embeddings.shape[1]

# Create the FAISS index
# faiss_index = faiss.IndexFlatL2(vector_dimension)
# print(faiss_index.is_trained)
# # Add the embeddings to the index
# faiss_index.add(numpy_embeddings)
# # Save the index
# faiss.write_index(faiss_index, index_file_path)
# print(f"Index saved to {index_file_path}")
# print(faiss_index.ntotal)

# Define the directory and file name to save the index
# persist_dir = PERSIST_DIRECTORY
# index_file_path = os.path.join(persist_dir, 'faiss_index.index')

# # Load the index to verify
# faiss_index_loaded = faiss.read_index(index_file_path)
# print(f"Index loaded from {index_file_path}")

# Verify the loaded index
# print(f"Number of vectors in the loaded index: {faiss_index_loaded.ntotal}")

# db = FAISS.from_documents(
# texts,
# embeddings,
# # persist_directory=PERSIST_DIRECTORY,
# # client_settings=CHROMA_SETTINGS,
# )
# db.save_local("DB/faiss")

import argparse

if __name__ == "__main__":
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
)
main()
# parser = argparse.ArgumentParser(description="Ingest script for localGPT")
# parser.add_argument("--device_type", type=str, required=True, help="Device type (cpu or gpu)")
# args = parser.parse_args()
main(device_type="cpu")  # or args.device_type once the argparse block above is re-enabled
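To sanity-check an ingest run, here is a minimal retrieval sketch against the collection this script populates. It assumes the same connection string, collection name, and get_embeddings helper used above; similarity_search is PGVector's standard query method, and cosine distance is its default strategy:

from langchain_postgres.vectorstores import PGVector
from utils import get_embeddings  # the same helper ingest.py uses

connection = "postgresql+psycopg://postgres:123456@localhost:5432/postgres"
embeddings = get_embeddings("cpu")

# Reattach to the collection that ingest.py populated.
db = PGVector(
    embeddings=embeddings,
    collection_name="PG_VECTOR_SAudi",
    connection=connection,
    use_jsonb=True,
)

# Return the four nearest chunks to the query.
for doc in db.similarity_search("What is the size of the Saudi sanitaryware market?", k=4):
    print(doc.metadata.get("source"), "->", doc.page_content[:120])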
8 changes: 5 additions & 3 deletions localGPT_UI.py
Original file line number Diff line number Diff line change
@@ -2,14 +2,16 @@
import subprocess
import streamlit as st
from run_localGPT import load_model
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras import add_vertical_space
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

# Use the function in your Streamlit app; with the module-style import above,
# the call must be qualified because the module and function share a name.
add_vertical_space.add_vertical_space(10)

def model_memory():
# Adding history to the model.
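One note on the streamlit-extras import above: add_vertical_space is both a module and a function name, so the two import styles are not interchangeable. A short sketch of both, assuming streamlit-extras is installed:

# Style A: import the function directly.
from streamlit_extras.add_vertical_space import add_vertical_space
add_vertical_space(10)

# Style B: import the module under an alias and qualify the call.
from streamlit_extras import add_vertical_space as avs
avs.add_vertical_space(10)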
2 changes: 2 additions & 0 deletions postgres.session.sql
@@ -0,0 +1,2 @@
CREATE EXTENSION IF NOT EXISTS vector;
-- CREATE EXTENSION vector WITH SCHEMA public;
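For completeness, here is a small sketch of using the extension directly from Python with the psycopg and pgvector packages that ingest.py already imports. The credentials mirror the connection string used there, and the three-dimensional vectors are placeholders for a real embedding width such as 384:

import numpy as np
import psycopg
from pgvector.psycopg import register_vector

conn = psycopg.connect(
    "dbname=postgres user=postgres password=123456 host=localhost port=5432",
    autocommit=True,
)
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)  # teaches psycopg to adapt numpy arrays to vector columns

conn.execute("DROP TABLE IF EXISTS documents")
conn.execute("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(3))")
conn.execute(
    "INSERT INTO documents (content, embedding) VALUES (%s, %s)",
    ("hello world", np.array([1.0, 0.0, 0.0])),
)

# <-> is pgvector's L2-distance operator; the nearest row sorts first.
row = conn.execute(
    "SELECT content FROM documents ORDER BY embedding <-> %s LIMIT 1",
    (np.array([1.0, 0.1, 0.0]),),
).fetchone()
print(row[0])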