diff --git a/README.md b/README.md
index f833e57d..8c5bd83f 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,11 @@ Another option is to enable chat history. ***Note***: This is disabled by defaul
 python run_localGPT.py --use_history
 ```
 
+Another option is to use a different model. ***Note***: Other models require a different prompt template. The matching prompt template is selected, and can be edited, in `prompt_template_utils.py`.
+
+```shell
+python run_localGPT.py --model_type em_german_leo
+```
 
 # Run the Graphical User Interface
 
@@ -205,7 +210,7 @@ python run_localGPT.py --use_history
 
 3. Open up a terminal and activate your python environment that contains the dependencies installed from requirements.txt.
 
-4. Navigate to the `/LOCALGPT` directory.
+4. Navigate to the `/localGPT` directory.
 
 5. Run the following command `python run_localGPT_API.py`. The API should being to run.
 
@@ -213,7 +218,7 @@ python run_localGPT.py --use_history
 
 7. Open up a second terminal and activate the same python environment.
 
-8. Navigate to the `/LOCALGPT/localGPTUI` directory.
+8. Navigate to the `/localGPT/localGPTUI` directory.
 
 9. Run the command `python localGPTUI.py`.
 
@@ -240,6 +245,8 @@ To change the models you will need to set both `MODEL_ID` and `MODEL_BASENAME`.
 
 8. Follow the same steps for `GGUF` and `GGML` models.
 
+9. Change the prompt template for your model in `prompt_template_utils.py`, if necessary.
+
 # GPU and VRAM Requirements
 
 Below is the VRAM requirement for different models depending on their size (Billions of parameters). The estimates in the table does not include VRAM used by the Embedding models - which use an additional 2GB-7GB of VRAM depending on the model.
diff --git a/constants.py b/constants.py
index 007d16b4..f63b95f9 100644
--- a/constants.py
+++ b/constants.py
@@ -31,12 +31,12 @@
 CONTEXT_WINDOW_SIZE = 4096
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
-#### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
+# If you get a "not enough space in the buffer" error, reduce the values below: start with half of the original values and keep halving until the error stops appearing
 
 N_GPU_LAYERS = 100 # Llama-2-70B has 83 layers
 N_BATCH = 512
 
-### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
+# From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
 # N_BATCH = 512
 
@@ -59,7 +59,7 @@
 EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)
 
 ####
-#### OTHER EMBEDDING MODEL OPTIONS
+# OTHER EMBEDDING MODEL OPTIONS
 ####
 
 # EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl" # Uses 5 GB of VRAM (Most Accurate of all models)
@@ -68,38 +68,41 @@
 # EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150mb of vram)
 
 ####
-#### MULTILINGUAL EMBEDDING MODELS
+# MULTILINGUAL EMBEDDING MODELS
 ####
 
 # EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large" # Uses 2.5 GB of VRAM
 # EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base" # Uses 1.2 GB of VRAM
 
-#### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
+# SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
 # Select the Model ID and model_basename
 # load the LLM for generating Natural Language responses
 
-#### GPU VRAM Memory required for LLM Models (ONLY) by Billion Parameter value (B Model)
-#### Does not include VRAM used by Embedding Models - which use an additional 2GB-7GB of VRAM depending on the model.
+# GPU VRAM Memory required for LLM Models (ONLY) by Billion Parameter value (B Model)
+# Does not include VRAM used by Embedding Models - which use an additional 2GB-7GB of VRAM depending on the model.
 ####
-#### (B Model)   (float32)   (float16)   (GPTQ 8bit)       (GPTQ 4bit)
-####    7b        28 GB       14 GB       7 GB - 9 GB       3.5 GB - 5 GB
-####    13b       52 GB       26 GB       13 GB - 15 GB     6.5 GB - 8 GB
-####    32b       130 GB      65 GB       32.5 GB - 35 GB   16.25 GB - 19 GB
-####    65b       260.8 GB    130.4 GB    65.2 GB - 67 GB   32.6 GB - - 35 GB
+# (B Model)   (float32)   (float16)   (GPTQ 8bit)       (GPTQ 4bit)
+#    7b        28 GB       14 GB       7 GB - 9 GB       3.5 GB - 5 GB
+#    13b       52 GB       26 GB       13 GB - 15 GB     6.5 GB - 8 GB
+#    32b       130 GB      65 GB       32.5 GB - 35 GB   16.25 GB - 19 GB
+#    65b       260.8 GB    130.4 GB    65.2 GB - 67 GB   32.6 GB - 35 GB
 
 # MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
 # MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
 
 ####
-#### (FOR GGUF MODELS)
+# (FOR GGUF MODELS)
 ####
 
 # MODEL_ID = "TheBloke/Llama-2-13b-Chat-GGUF"
 # MODEL_BASENAME = "llama-2-13b-chat.Q4_K_M.gguf"
 
-MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
-MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
+# MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+
+MODEL_ID = "TheBloke/em_german_leo_mistral-GGUF"
+MODEL_BASENAME = "em_german_leo_mistral.Q4_K_S.gguf"
 
 # MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
 # MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
@@ -108,7 +111,7 @@
 # MODEL_BASENAME = "llama-2-70b-chat.Q4_K_M.gguf"
 
 ####
-#### (FOR HF MODELS)
+# (FOR HF MODELS)
 ####
 
 # MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
@@ -122,12 +125,12 @@
 # llm = load_model(device_type, model_id=model_id)
 
 ####
-#### (FOR GPTQ QUANTIZED) Select a llm model based on your GPU and VRAM GB. Does not include Embedding Models VRAM usage.
+# (FOR GPTQ QUANTIZED) Select an LLM model based on your GPU and VRAM GB. Does not include Embedding Models VRAM usage.
 ####
 
 ##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####
 
-### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
+# 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
 # MODEL_ID = "TheBloke/guanaco-65B-GPTQ"
 # MODEL_BASENAME = "model.safetensors"
 # MODEL_ID = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
@@ -139,7 +142,7 @@
 
 ##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####
 
-### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
+# 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
 # MODEL_ID = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
 # MODEL_BASENAME = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
 # MODEL_ID = "TheBloke/vicuna-13B-v1.5-GPTQ"
@@ -149,16 +152,16 @@
 # MODEL_ID = "TheBloke/WizardLM-13B-V1.2-GPTQ"
 # MODEL_BASENAME = "gptq_model-4bit-128g.safetensors
 
-### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
+# 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
 # MODEL_ID = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
 # MODEL_BASENAME = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
 # MODEL_ID = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
 # MODEL_BASENAME = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"
 
 ##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####
-### (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)
+# (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)
 
-### 7b GPTQ Models for 8GB GPUs
+# 7b GPTQ Models for 8GB GPUs
 # MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
 # MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
 # MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
@@ -167,7 +170,7 @@
 # MODEL_BASENAME = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"
 
 ####
-#### (FOR GGML) (Quantized cpu+gpu+mps) models - check if they support llama.cpp
+# (FOR GGML) (Quantized cpu+gpu+mps) models - check if they support llama.cpp
 ####
 
 # MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
diff --git a/prompt_template_utils.py b/prompt_template_utils.py
index beb56f8a..4b6c4349 100644
--- a/prompt_template_utils.py
+++ b/prompt_template_utils.py
@@ -57,6 +57,21 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
             + E_INST
         )
         prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
+    elif promptTemplate_type == "em_german_leo":
+        if history:
+            prompt_template = (
+                """Du bist ein hilfreicher Assistent. Nutze nur den folgenden Kontext: {context} sowie {history}
+                USER: {question} ASSISTANT:
+                """
+            )
+            prompt = PromptTemplate(input_variables=["history", "context", "question"], template=prompt_template)
+        else:
+            prompt_template = (
+                """Du bist ein hilfreicher Assistent. Nutze nur den folgenden Kontext: {context}
+                USER: {question} ASSISTANT:
+                """
+            )
+            prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
     else:
         # change this based on the model you have selected.
         if history:
diff --git a/run_localGPT.py b/run_localGPT.py
index 05a3d253..d9b843ab 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -1,3 +1,23 @@
+from constants import (
+    EMBEDDING_MODEL_NAME,
+    PERSIST_DIRECTORY,
+    MODEL_ID,
+    MODEL_BASENAME,
+    MAX_NEW_TOKENS,
+    MODELS_PATH,
+    CHROMA_SETTINGS
+)
+from load_models import (
+    load_quantized_model_gguf_ggml,
+    load_quantized_model_qptq,
+    load_full_model,
+)
+from transformers import (
+    GenerationConfig,
+    pipeline,
+)
+from langchain.vectorstores import Chroma
+from prompt_template_utils import get_prompt_template
 import os
 import logging
 import click
@@ -11,30 +31,8 @@
 
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
-from prompt_template_utils import get_prompt_template
 
 # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.vectorstores import Chroma
-from transformers import (
-    GenerationConfig,
-    pipeline,
-)
-
-from load_models import (
-    load_quantized_model_gguf_ggml,
-    load_quantized_model_qptq,
-    load_full_model,
-)
-
-from constants import (
-    EMBEDDING_MODEL_NAME,
-    PERSIST_DIRECTORY,
-    MODEL_ID,
-    MODEL_BASENAME,
-    MAX_NEW_TOKENS,
-    MODELS_PATH,
-    CHROMA_SETTINGS
-)
 
 
 def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
@@ -93,7 +91,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
     return local_llm
 
 
-def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
+def retrieval_qa_pipline(device_type, use_history, promptTemplate_type=None):
     """
     Initializes and returns a retrieval-based Question Answering (QA) pipeline.
 
@@ -204,7 +202,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
     "--model_type",
     default="llama",
     type=click.Choice(
-        ["llama", "mistral", "non_llama"],
+        ["llama", "mistral", "em_german_leo", "non_llama"],
     ),
-    help="model type, llama, mistral or non_llama",
+    help="model type, llama, mistral, em_german_leo or non_llama",
 )
@@ -213,7 +211,6 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
     is_flag=True,
     help="whether to save Q&A pairs to a CSV file (Default is False)",
 )
-
 def main(device_type, show_sources, use_history, model_type, save_qa):
     """
     Implements the main information retrieval task for a localGPT.
@@ -266,7 +263,7 @@ def main(device_type, show_sources, use_history, model_type, save_qa):
                 print("\n> " + document.metadata["source"] + ":")
                 print(document.page_content)
             print("----------------------------------SOURCE DOCUMENTS---------------------------")
-    
+
         # Log the Q&A to CSV only if save_qa is True
         if save_qa:
             utils.log_to_csv(query, answer)
diff --git a/run_localGPT_API.py b/run_localGPT_API.py
index a6df0003..fe950086 100644
--- a/run_localGPT_API.py
+++ b/run_localGPT_API.py
@@ -63,7 +63,7 @@
 RETRIEVER = DB.as_retriever()
 
 LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)
-prompt, memory = get_prompt_template(promptTemplate_type="llama", history=False)
+prompt, memory = get_prompt_template(promptTemplate_type="em_german_leo", history=False)
 
 QA = RetrievalQA.from_chain_type(
     llm=LLM,
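
The non-history branch added to `prompt_template_utils.py` above can also be exercised on its own. The sketch below is not part of the patch: it simply fills the new em_german_leo template with placeholder context and question values (assuming the `langchain` version the project already depends on) so the final prompt string handed to the model is visible.

```python
# Standalone sketch (not part of the patch): mirrors the non-history
# "em_german_leo" branch of get_prompt_template() and prints the rendered prompt.
# The context/question values below are placeholders for illustration only.
from langchain.prompts import PromptTemplate

prompt_template = (
    """Du bist ein hilfreicher Assistent. Nutze nur den folgenden Kontext: {context}
    USER: {question} ASSISTANT:
    """
)
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# .format() substitutes the retrieved context and the user question into the template.
print(prompt.format(context="(retrieved document excerpts)", question="Wie starte ich die API?"))
```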