Issue 476 branches #765

Open · wants to merge 2 commits into base: main
55 changes: 49 additions & 6 deletions constants.py
@@ -4,10 +4,16 @@
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredHTMLLoader

from langchain.document_loaders import (
CSVLoader,
Docx2txtLoader,
PDFMinerLoader,
TextLoader,
UnstructuredExcelLoader,
UnstructuredFileLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
)

# load_dotenv()
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
@@ -142,12 +148,16 @@
##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# FAILS/blank answers
# MODEL_ID = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# MODEL_BASENAME = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# MODEL_BASENAME = "model.safetensors"

# MODEL_ID = "TheBloke/vicuna-13B-v1.5-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# MODEL_ID = "TheBloke/Nous-Hermes-13B-GPTQ"
# MODEL_BASENAME = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"

# MODEL_ID = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# MODEL_BASENAME = "gptq_model-4bit-128g.safetensors

@@ -162,9 +172,11 @@

### 7b GPTQ Models for 8GB GPUs
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# MODEL_BASENAME = "model.safetensors"

# MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"

# MODEL_ID = "TheBloke/wizardLM-7B-GPTQ"
# MODEL_BASENAME = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

@@ -184,5 +196,36 @@
### (*** MODEL_BASENAME is not actually used but has to contain .awq so the correct model loading is used ***)
### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
####
# WORKS
# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
# MODEL_BASENAME = "model.safetensors.awq"

# FAILS
# torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 23.62 GiB total capacity; 19.95 GiB already allocated; 50.75 MiB free; 19.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# WORKS
# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# LOADS Forever
# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# for GGUF/GGML models
# pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

# WORKS
# MODEL_ID = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# WORKS BUT CHAT ONLY
# MODEL_ID = "TheBloke/Llama-2-13B-chat-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# WORKS, testing REVISION
MODEL_ID = "TheBloke/vicuna-13B-v1.5-16K-GPTQ"
MODEL_BASENAME = "model.safetensors"
REVISION = "main"
23 changes: 12 additions & 11 deletions load_models.py
@@ -7,7 +7,7 @@

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, MODELS_PATH, N_BATCH, N_GPU_LAYERS

@@ -60,7 +60,7 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
return None


def load_quantized_model_qptq(model_id, model_basename, device_type, logging):
def load_quantized_model_qptq(model_id, revision, model_basename, device_type, logging):
"""
Load a GPTQ quantized model using AutoGPTQForCausalLM.

@@ -102,6 +102,7 @@ def load_quantized_model_qptq(model_id, model_basename, device_type, logging):

model = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
model_basename=model_basename,
use_safetensors=True,
trust_remote_code=True,
@@ -144,11 +145,11 @@ def load_full_model(model_id, model_basename, device_type, logging):
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
logging.info("Tokenizer loaded")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16
)
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
@@ -157,10 +158,10 @@ def load_full_model(model_id, model_basename, device_type, logging):
cache_dir=MODELS_PATH,
trust_remote_code=True, # set these if you are using NVIDIA GPU
quantization_config=bnb_config
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.float16,
# max_memory={0: "15GB"}, # Uncomment this line with you encounter CUDA out of memory errors
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.float16,
# max_memory={0: "15GB"}, # Uncomment this line with you encounter CUDA out of memory errors
)

model.tie_weights()
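
Because revision is now the second positional parameter of load_quantized_model_qptq, every caller has to be updated in step (run_localGPT.py below does this). A minimal usage sketch with illustrative argument values:

import logging

from load_models import load_quantized_model_qptq

# Passing "main" preserves the previous behaviour; a different branch selects another quantization variant.
model, tokenizer = load_quantized_model_qptq(
    "TheBloke/vicuna-13B-v1.5-16K-GPTQ",  # model_id
    "main",                               # revision
    "model.safetensors",                  # model_basename
    "cuda",                               # device_type
    logging,
)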
36 changes: 29 additions & 7 deletions prompt_template_utils.py
@@ -1,6 +1,6 @@
"""
This file implements prompt template for llama based models.
Modify the prompt template based on the model you select.
This file implements prompt template for llama based models.
Modify the prompt template based on the model you select.
This seems to have significant impact on the output of the LLM.
"""

@@ -10,7 +10,7 @@
# this is specific to Llama-2.

system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
Read the given context before answering questions and think step by step. If you can not answer a user question based on
Read the given context before answering questions and think step by step. If you can not answer a user question based on
the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""


@@ -40,7 +40,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
B_INST
+ system_prompt
+ """

Context: {history} \n {context}
User: {question}"""
+ E_INST
@@ -51,19 +51,41 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
B_INST
+ system_prompt
+ """

Context: {context}
User: {question}"""
+ E_INST
)
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
elif promptTemplate_type == "hermes":
B_INST, E_INST = "<|im_start|> ", " <|im_end|>"

prompt_template = (
B_INST
+ system_prompt
+ """

Context: {context}
User: {question}"""
+ E_INST
+ "\n"
+ B_INST
+ "assistant"
)
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
# A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {prompt} ASSISTANT:

elif promptTemplate_type == "vicuna":
prompt_template = "Context: {context} USER: {question}" + "ASSISTANT:"
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

else:
# change this based on the model you have selected.
if history:
prompt_template = (
system_prompt
+ """

Context: {history} \n {context}
User: {question}
Answer:"""
@@ -73,7 +95,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
prompt_template = (
system_prompt
+ """

Context: {context}
User: {question}
Answer:"""
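
The two new branches add a ChatML-style template for Hermes models and a bare USER/ASSISTANT template for Vicuna. A short sketch of how a caller would request one of them (history disabled, so the template only expects context and question):

from prompt_template_utils import get_prompt_template

# "hermes" and "vicuna" are the template types added in this PR.
prompt, memory = get_prompt_template(promptTemplate_type="hermes", history=False)
print(prompt.format(context="(retrieved passages)", question="What does REVISION select?"))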
53 changes: 26 additions & 27 deletions run_localGPT.py
@@ -1,45 +1,43 @@
import os
import logging
import os

import click
import torch
import utils
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.callbacks.manager import CallbackManager

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
import utils

from prompt_template_utils import get_prompt_template
from utils import get_embeddings
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from transformers import (
GenerationConfig,
pipeline,
)

from load_models import (
load_quantized_model_awq,
load_quantized_model_gguf_ggml,
load_quantized_model_qptq,
load_full_model,
)
from transformers import GenerationConfig, pipeline

from constants import (
CHROMA_SETTINGS,
EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
MODEL_ID,
MODEL_BASENAME,
MAX_NEW_TOKENS,
MODEL_BASENAME,
MODEL_ID,
MODELS_PATH,
CHROMA_SETTINGS,
PERSIST_DIRECTORY,
REVISION,
)
from load_models import (
load_full_model,
load_quantized_model_awq,
load_quantized_model_gguf_ggml,
load_quantized_model_qptq,
)
from prompt_template_utils import get_prompt_template
from utils import get_embeddings


def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
def load_model(device_type, model_id, revision="main", model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
If you are running this for the first time, it will download a model for you.
@@ -48,6 +46,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
Args:
device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
model_id (str): Identifier of the model to load from HuggingFace's model hub.
revision (str, optional): Revision/branch of the model to load. Defaults to "main".
model_basename (str, optional): Basename of the model if using quantized models.
Defaults to None.

@@ -69,7 +68,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
elif ".awq" in model_basename.lower():
model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
else:
model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
model, tokenizer = load_quantized_model_qptq(model_id, revision, model_basename, device_type, LOGGING)
else:
model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

@@ -122,7 +121,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):

"""
(1) Chooses an appropriate langchain library based on the embedding model name. Matching code is contained within ingest.py.

(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on
their respective huggingface repository, project page or github repository.
"""
@@ -139,7 +138,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
prompt, memory = get_prompt_template(promptTemplate_type=promptTemplate_type, history=use_history)

# load the llm pipeline
llm = load_model(device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME, LOGGING=logging)
llm = load_model(device_type, model_id=MODEL_ID, revision=REVISION, model_basename=MODEL_BASENAME, LOGGING=logging)

if use_history:
qa = RetrievalQA.from_chain_type(
@@ -211,7 +210,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
"--model_type",
default="llama",
type=click.Choice(
["llama", "mistral", "non_llama"],
["llama", "mistral", "non_llama", "hermes", "vicuna"],
),
help="model type, llama, mistral or non_llama",
)
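
With the new template types exposed on the CLI and REVISION threaded through load_model, the pipeline can also be driven programmatically; a minimal sketch, assuming documents have already been ingested into the Chroma index:

import logging

from run_localGPT import retrieval_qa_pipline

logging.basicConfig(level=logging.INFO)

# promptTemplate_type now also accepts "hermes" and "vicuna".
qa = retrieval_qa_pipline(device_type="cuda", use_history=False, promptTemplate_type="hermes")
response = qa("What is covered in the ingested documents?")
print(response["result"])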