diff --git a/constants.py b/constants.py index 380c650a..7e291073 100644 --- a/constants.py +++ b/constants.py @@ -4,10 +4,16 @@ from chromadb.config import Settings # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel -from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader -from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader -from langchain.document_loaders import UnstructuredHTMLLoader - +from langchain.document_loaders import ( + CSVLoader, + Docx2txtLoader, + PDFMinerLoader, + TextLoader, + UnstructuredExcelLoader, + UnstructuredFileLoader, + UnstructuredHTMLLoader, + UnstructuredMarkdownLoader, +) # load_dotenv() ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) @@ -142,12 +148,16 @@ ##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) ##### ### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***) +# FAILS/blank answers # MODEL_ID = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ" -# MODEL_BASENAME = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors" +# MODEL_BASENAME = "model.safetensors" + # MODEL_ID = "TheBloke/vicuna-13B-v1.5-GPTQ" # MODEL_BASENAME = "model.safetensors" + # MODEL_ID = "TheBloke/Nous-Hermes-13B-GPTQ" # MODEL_BASENAME = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order" + # MODEL_ID = "TheBloke/WizardLM-13B-V1.2-GPTQ" # MODEL_BASENAME = "gptq_model-4bit-128g.safetensors @@ -162,9 +172,11 @@ ### 7b GPTQ Models for 8GB GPUs # MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ" -# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors" +# MODEL_BASENAME = "model.safetensors" + # MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ" # MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors" + # MODEL_ID = "TheBloke/wizardLM-7B-GPTQ" # MODEL_BASENAME = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors" @@ -184,5 +196,36 @@ ### (*** MODEL_BASENAME is not actually used but have to contain .awq so the correct model loading is used ***) ### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***) #### +# WORKS # MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ" # MODEL_BASENAME = "model.safetensors.awq" + +# FAILS +# torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 23.62 GiB total capacity; 19.95 GiB already allocated; 50.75 MiB free; 19.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. 
See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF + +# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ" +# MODEL_BASENAME = "model.safetensors" + +# WORKS +# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF" +# MODEL_BASENAME = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf" + +# LOADS Forever +# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ" +# MODEL_BASENAME = "model.safetensors" + +# for GGUF/GGML models +# pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir + +# WORKS +# MODEL_ID = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ" +# MODEL_BASENAME = "model.safetensors" + +# WORKS BUT CHAT ONLY +# MODEL_ID = "TheBloke/Llama-2-13B-chat-GPTQ" +# MODEL_BASENAME = "model.safetensors" + +# WORKS, testing REVISION +MODEL_ID = "TheBloke/vicuna-13B-v1.5-16K-GPTQ" +MODEL_BASENAME = "model.safetensors" +REVISION = "main" diff --git a/load_models.py b/load_models.py index 65f10e6e..f0ca510b 100644 --- a/load_models.py +++ b/load_models.py @@ -7,7 +7,7 @@ from huggingface_hub import hf_hub_download from langchain.llms import LlamaCpp -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, MODELS_PATH, N_BATCH, N_GPU_LAYERS @@ -60,7 +60,7 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin return None -def load_quantized_model_qptq(model_id, model_basename, device_type, logging): +def load_quantized_model_qptq(model_id, revision, model_basename, device_type, logging): """ Load a GPTQ quantized model using AutoGPTQForCausalLM. @@ -102,6 +102,7 @@ def load_quantized_model_qptq(model_id, model_basename, device_type, logging): model = AutoGPTQForCausalLM.from_quantized( model_id, + revision=revision, model_basename=model_basename, use_safetensors=True, trust_remote_code=True, @@ -144,11 +145,11 @@ def load_full_model(model_id, model_basename, device_type, logging): tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/") logging.info("Tokenizer loaded") bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16 - ) + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + ) model = AutoModelForCausalLM.from_pretrained( model_id, device_map="auto", @@ -157,10 +158,10 @@ def load_full_model(model_id, model_basename, device_type, logging): cache_dir=MODELS_PATH, trust_remote_code=True, # set these if you are using NVIDIA GPU quantization_config=bnb_config - # load_in_4bit=True, - # bnb_4bit_quant_type="nf4", - # bnb_4bit_compute_dtype=torch.float16, - # max_memory={0: "15GB"}, # Uncomment this line with you encounter CUDA out of memory errors + # load_in_4bit=True, + # bnb_4bit_quant_type="nf4", + # bnb_4bit_compute_dtype=torch.float16, + # max_memory={0: "15GB"}, # Uncomment this line with you encounter CUDA out of memory errors ) model.tie_weights() diff --git a/prompt_template_utils.py b/prompt_template_utils.py index beb56f8a..21981772 100644 --- a/prompt_template_utils.py +++ b/prompt_template_utils.py @@ -1,6 +1,6 @@ """ -This file implements prompt template for llama based models. -Modify the prompt template based on the model you select. +This file implements prompt template for llama based models. 
+Modify the prompt template based on the model you select. This seems to have significant impact on the output of the LLM. """ @@ -10,7 +10,7 @@ # this is specific to Llama-2. system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions. -Read the given context before answering questions and think step by step. If you can not answer a user question based on +Read the given context before answering questions and think step by step. If you can not answer a user question based on the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question.""" @@ -40,7 +40,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h B_INST + system_prompt + """ - + Context: {history} \n {context} User: {question}""" + E_INST @@ -51,19 +51,41 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h B_INST + system_prompt + """ - + Context: {context} User: {question}""" + E_INST ) prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template) + elif promptTemplate_type == "hermes": + B_INST, E_INST = "<|im_start|> ", " <|im_end|>" + + prompt_template = ( + B_INST + + system_prompt + + """ + + Context: {context} + User: {question}""" + + E_INST + + "\n" + + B_INST + + "assistant" + ) + prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template) + # A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {prompt} ASSISTANT: + + elif promptTemplate_type == "vicuna": + prompt_template = "Context: {context} USER: {question}" + "ASSISTANT:" + prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template) + else: # change this based on the model you have selected. 
if history: prompt_template = ( system_prompt + """ - + Context: {history} \n {context} User: {question} Answer:""" @@ -73,7 +95,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h prompt_template = ( system_prompt + """ - + Context: {context} User: {question} Answer:""" diff --git a/run_localGPT.py b/run_localGPT.py index 4ed53983..de374fea 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -1,45 +1,43 @@ -import os import logging +import os + import click import torch -import utils +from langchain.callbacks.manager import CallbackManager +from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response from langchain.chains import RetrievalQA from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline -from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response -from langchain.callbacks.manager import CallbackManager -callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) +import utils -from prompt_template_utils import get_prompt_template -from utils import get_embeddings +callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma -from transformers import ( - GenerationConfig, - pipeline, -) - -from load_models import ( - load_quantized_model_awq, - load_quantized_model_gguf_ggml, - load_quantized_model_qptq, - load_full_model, -) +from transformers import GenerationConfig, pipeline from constants import ( + CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, - PERSIST_DIRECTORY, - MODEL_ID, - MODEL_BASENAME, MAX_NEW_TOKENS, + MODEL_BASENAME, + MODEL_ID, MODELS_PATH, - CHROMA_SETTINGS, + PERSIST_DIRECTORY, + REVISION, ) +from load_models import ( + load_full_model, + load_quantized_model_awq, + load_quantized_model_gguf_ggml, + load_quantized_model_qptq, +) +from prompt_template_utils import get_prompt_template +from utils import get_embeddings -def load_model(device_type, model_id, model_basename=None, LOGGING=logging): +def load_model(device_type, model_id, revision="main", model_basename=None, LOGGING=logging): """ Select a model for text generation using the HuggingFace library. If you are running this for the first time, it will download a model for you. @@ -48,6 +46,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): Args: device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU. model_id (str): Identifier of the model to load from HuggingFace's model hub. + revision (str, optional): Revision/branch of the model to load. Defaults to "main". model_basename (str, optional): Basename of the model if using quantized models. Defaults to None. @@ -69,7 +68,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): elif ".awq" in model_basename.lower(): model, tokenizer = load_quantized_model_awq(model_id, LOGGING) else: - model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING) + model, tokenizer = load_quantized_model_qptq(model_id, revision, model_basename, device_type, LOGGING) else: model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING) @@ -122,7 +121,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"): """ (1) Chooses an appropriate langchain library based on the enbedding model name. 
Matching code is contained within ingest.py. - + (2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on their respective huggingface repository, project page or github repository. """ @@ -139,7 +138,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"): prompt, memory = get_prompt_template(promptTemplate_type=promptTemplate_type, history=use_history) # load the llm pipeline - llm = load_model(device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME, LOGGING=logging) + llm = load_model(device_type, model_id=MODEL_ID, revision=REVISION, model_basename=MODEL_BASENAME, LOGGING=logging) if use_history: qa = RetrievalQA.from_chain_type( @@ -211,7 +210,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"): "--model_type", default="llama", type=click.Choice( - ["llama", "mistral", "non_llama"], + ["llama", "mistral", "non_llama", "hermes", "vicuna"], ), help="model type, llama, mistral or non_llama", ) diff --git a/run_localGPT_API.py b/run_localGPT_API.py index b345612f..f422f76a 100644 --- a/run_localGPT_API.py +++ b/run_localGPT_API.py @@ -1,29 +1,40 @@ +import argparse import logging import os import shutil import subprocess -import argparse + +# API queue addition +from threading import Lock import torch from flask import Flask, jsonify, request from langchain.chains import RetrievalQA from langchain.embeddings import HuggingFaceInstructEmbeddings -# from langchain.embeddings import HuggingFaceEmbeddings -from run_localGPT import load_model -from prompt_template_utils import get_prompt_template - # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma from werkzeug.utils import secure_filename -from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME +from constants import ( + CHROMA_SETTINGS, + EMBEDDING_MODEL_NAME, + MAX_NEW_TOKENS, + MODEL_BASENAME, + MODEL_ID, + MODELS_PATH, + PERSIST_DIRECTORY, + REVISION, +) +# default model revision/branch +REVISION = "main" -# API queue addition -from threading import Lock +from prompt_template_utils import get_prompt_template -request_lock = Lock() +# from langchain.embeddings import HuggingFaceEmbeddings +from run_localGPT import load_model +request_lock = Lock() if torch.backends.mps.is_available(): DEVICE_TYPE = "mps" @@ -68,8 +79,24 @@ RETRIEVER = DB.as_retriever() -LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME) -prompt, memory = get_prompt_template(promptTemplate_type="llama", history=False) +args = None +parser = argparse.ArgumentParser() +parser.add_argument("--port", type=int, default=5110, help="Port to run the API on. Defaults to 5110.") +parser.add_argument( + "--model_type", type=str, default="llama", help="Model type: llama, mistral, non_llama, hermes, or vicuna." +) +parser.add_argument( + "--host", + type=str, + default="127.0.0.1", + help="Host to run the UI on. Defaults to 127.0.0.1. 
" + "Set to 0.0.0.0 to make the UI externally " + "accessible from other devices.", +) +args = parser.parse_args() + +LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, revision=REVISION, model_basename=MODEL_BASENAME) +prompt, memory = get_prompt_template(promptTemplate_type=args.model_type, history=False) QA = RetrievalQA.from_chain_type( llm=LLM, @@ -166,7 +193,7 @@ def prompt_route(): if user_prompt: # Acquire the lock before processing the prompt with request_lock: - # print(f'User Prompt: {user_prompt}') + # print(f'User Prompt: {user_prompt}') # Get the answer from the chain res = QA(user_prompt) answer, docs = res["result"], res["source_documents"] @@ -188,18 +215,6 @@ def prompt_route(): if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--port", type=int, default=5110, help="Port to run the API on. Defaults to 5110.") - parser.add_argument( - "--host", - type=str, - default="127.0.0.1", - help="Host to run the UI on. Defaults to 127.0.0.1. " - "Set to 0.0.0.0 to make the UI externally " - "accessible from other devices.", - ) - args = parser.parse_args() - logging.basicConfig( format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO )