Issue 476 branches #765

Open · wants to merge 2 commits into base: main
55 changes: 49 additions & 6 deletions constants.py
@@ -4,10 +4,16 @@
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredHTMLLoader

from langchain.document_loaders import (
CSVLoader,
Docx2txtLoader,
PDFMinerLoader,
TextLoader,
UnstructuredExcelLoader,
UnstructuredFileLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
)

# load_dotenv()
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
@@ -142,12 +148,16 @@
##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# FAILS/blank answers
# MODEL_ID = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# MODEL_BASENAME = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# MODEL_BASENAME = "model.safetensors"

# MODEL_ID = "TheBloke/vicuna-13B-v1.5-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# MODEL_ID = "TheBloke/Nous-Hermes-13B-GPTQ"
# MODEL_BASENAME = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"

# MODEL_ID = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# MODEL_BASENAME = "gptq_model-4bit-128g.safetensors

@@ -162,9 +172,11 @@

### 7b GPTQ Models for 8GB GPUs
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# MODEL_BASENAME = "model.safetensors"

# MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"

# MODEL_ID = "TheBloke/wizardLM-7B-GPTQ"
# MODEL_BASENAME = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

@@ -184,5 +196,36 @@
### (*** MODEL_BASENAME is not actually used but has to contain .awq so the correct model loading is used ***)
### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
####
# WORKS
# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
# MODEL_BASENAME = "model.safetensors.awq"

# FAILS
# torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 23.62 GiB total capacity; 19.95 GiB already allocated; 50.75 MiB free; 19.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# WORKS
# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# LOADS Forever
# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# for GGUF/GGML models
# pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

# WORKS
# MODEL_ID = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# WORKS BUT CHAT ONLY
# MODEL_ID = "TheBloke/Llama-2-13B-chat-GPTQ"
# MODEL_BASENAME = "model.safetensors"

# WORKS, testing REVISION
MODEL_ID = "TheBloke/vicuna-13B-v1.5-16K-GPTQ"
MODEL_BASENAME = "model.safetensors"
REVISION = "main"
23 changes: 12 additions & 11 deletions load_models.py
@@ -7,7 +7,7 @@

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, MODELS_PATH, N_BATCH, N_GPU_LAYERS

@@ -60,7 +60,7 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
return None


def load_quantized_model_qptq(model_id, model_basename, device_type, logging):
def load_quantized_model_qptq(model_id, revision, model_basename, device_type, logging):
"""
Load a GPTQ quantized model using AutoGPTQForCausalLM.

@@ -102,6 +102,7 @@ def load_quantized_model_qptq(model_id, model_basename, device_type, logging):

model = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
model_basename=model_basename,
use_safetensors=True,
trust_remote_code=True,
@@ -144,11 +145,11 @@ def load_full_model(model_id, model_basename, device_type, logging):
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
logging.info("Tokenizer loaded")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16
)
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
@@ -157,10 +158,10 @@ def load_full_model(model_id, model_basename, device_type, logging):
cache_dir=MODELS_PATH,
trust_remote_code=True, # set these if you are using NVIDIA GPU
quantization_config=bnb_config
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.float16,
# max_memory={0: "15GB"}, # Uncomment this line with you encounter CUDA out of memory errors
# load_in_4bit=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.float16,
# max_memory={0: "15GB"}, # Uncomment this line with you encounter CUDA out of memory errors
)

model.tie_weights()
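
Because revision is now the second positional parameter of load_quantized_model_qptq, every caller has to be updated in step (run_localGPT.py below does this). A minimal usage sketch with illustrative argument values:

import logging

from load_models import load_quantized_model_qptq

# Passing "main" preserves the previous behaviour; a different branch selects another quantization variant.
model, tokenizer = load_quantized_model_qptq(
    "TheBloke/vicuna-13B-v1.5-16K-GPTQ",  # model_id
    "main",                               # revision
    "model.safetensors",                  # model_basename
    "cuda",                               # device_type
    logging,
)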
36 changes: 29 additions & 7 deletions prompt_template_utils.py
@@ -1,6 +1,6 @@
"""
This file implements prompt template for llama based models.
Modify the prompt template based on the model you select.
This file implements prompt template for llama based models.
Modify the prompt template based on the model you select.
This seems to have significant impact on the output of the LLM.
"""

@@ -10,7 +10,7 @@
# this is specific to Llama-2.

system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
Read the given context before answering questions and think step by step. If you can not answer a user question based on
Read the given context before answering questions and think step by step. If you can not answer a user question based on
the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""


@@ -40,7 +40,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
B_INST
+ system_prompt
+ """

Context: {history} \n {context}
User: {question}"""
+ E_INST
@@ -51,19 +51,41 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
B_INST
+ system_prompt
+ """

Context: {context}
User: {question}"""
+ E_INST
)
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
elif promptTemplate_type == "hermes":
B_INST, E_INST = "<|im_start|> ", " <|im_end|>"

prompt_template = (
B_INST
+ system_prompt
+ """

Context: {context}
User: {question}"""
+ E_INST
+ "\n"
+ B_INST
+ "assistant"
)
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
# A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {prompt} ASSISTANT:

elif promptTemplate_type == "vicuna":
prompt_template = "Context: {context} USER: {question}" + "ASSISTANT:"
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

else:
# change this based on the model you have selected.
if history:
prompt_template = (
system_prompt
+ """

Context: {history} \n {context}
User: {question}
Answer:"""
@@ -73,7 +95,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
prompt_template = (
system_prompt
+ """

Context: {context}
User: {question}
Answer:"""
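
The two new branches add a ChatML-style template for Hermes models and a bare USER/ASSISTANT template for Vicuna. A short sketch of how a caller would request one of them (history disabled, so the template only expects context and question):

from prompt_template_utils import get_prompt_template

# "hermes" and "vicuna" are the template types added in this PR.
prompt, memory = get_prompt_template(promptTemplate_type="hermes", history=False)
print(prompt.format(context="(retrieved passages)", question="What does REVISION select?"))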
53 changes: 26 additions & 27 deletions run_localGPT.py
@@ -1,45 +1,43 @@
import os
import logging
import os

import click
import torch
import utils
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.callbacks.manager import CallbackManager

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
import utils

from prompt_template_utils import get_prompt_template
from utils import get_embeddings
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from transformers import (
GenerationConfig,
pipeline,
)

from load_models import (
load_quantized_model_awq,
load_quantized_model_gguf_ggml,
load_quantized_model_qptq,
load_full_model,
)
from transformers import GenerationConfig, pipeline

from constants import (
CHROMA_SETTINGS,
EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
MODEL_ID,
MODEL_BASENAME,
MAX_NEW_TOKENS,
MODEL_BASENAME,
MODEL_ID,
MODELS_PATH,
CHROMA_SETTINGS,
PERSIST_DIRECTORY,
REVISION,
)
from load_models import (
load_full_model,
load_quantized_model_awq,
load_quantized_model_gguf_ggml,
load_quantized_model_qptq,
)
from prompt_template_utils import get_prompt_template
from utils import get_embeddings


def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
def load_model(device_type, model_id, revision="main", model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
If you are running this for the first time, it will download a model for you.
@@ -48,6 +46,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
Args:
device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
model_id (str): Identifier of the model to load from HuggingFace's model hub.
revision (str, optional): Revision/branch of the model to load. Defaults to "main".
model_basename (str, optional): Basename of the model if using quantized models.
Defaults to None.

@@ -69,7 +68,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
elif ".awq" in model_basename.lower():
model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
else:
model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
model, tokenizer = load_quantized_model_qptq(model_id, revision, model_basename, device_type, LOGGING)
else:
model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

@@ -122,7 +121,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):

"""
(1) Chooses an appropriate langchain library based on the embedding model name. Matching code is contained within ingest.py.

(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on
their respective huggingface repository, project page or github repository.
"""
@@ -139,7 +138,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
prompt, memory = get_prompt_template(promptTemplate_type=promptTemplate_type, history=use_history)

# load the llm pipeline
llm = load_model(device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME, LOGGING=logging)
llm = load_model(device_type, model_id=MODEL_ID, revision=REVISION, model_basename=MODEL_BASENAME, LOGGING=logging)

if use_history:
qa = RetrievalQA.from_chain_type(
@@ -211,7 +210,7 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
"--model_type",
default="llama",
type=click.Choice(
["llama", "mistral", "non_llama"],
["llama", "mistral", "non_llama", "hermes", "vicuna"],
),
help="model type, llama, mistral or non_llama",
)
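
With the new template types exposed on the CLI and REVISION threaded through load_model, the pipeline can also be driven programmatically; a minimal sketch, assuming documents have already been ingested into the Chroma index:

import logging

from run_localGPT import retrieval_qa_pipline

logging.basicConfig(level=logging.INFO)

# promptTemplate_type now also accepts "hermes" and "vicuna".
qa = retrieval_qa_pipline(device_type="cuda", use_history=False, promptTemplate_type="hermes")
response = qa("What is covered in the ingested documents?")
print(response["result"])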