diff --git a/load_models.py b/load_models.py
index b7fa5a37..59ec3922 100644
--- a/load_models.py
+++ b/load_models.py
@@ -9,6 +9,8 @@
     LlamaForCausalLM,
     LlamaTokenizer,
 )
+# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
+# import intel_extension_for_pytorch as ipex
 
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
 
@@ -131,6 +133,13 @@ def load_full_model(model_id, model_basename, device_type, logging):
         logging.info("Using LlamaTokenizer")
         tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
         model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+    elif device_type.lower() == "xpu":
+        logging.info("Using LlamaTokenizer")
+        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
+        logging.info("Using AutoModelForCausalLM")
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, cache_dir="./models")
+        model = model.to('xpu')
+        model = ipex.optimize(model)
     else:
         logging.info("Using AutoModelForCausalLM for full models")
         tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
diff --git a/run_localGPT.py b/run_localGPT.py
index a1816131..661a5df2 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -7,6 +7,10 @@
 from langchain.llms import HuggingFacePipeline
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler  # for streaming response
 from langchain.callbacks.manager import CallbackManager
+from langchain.llms.base import LLM
+
+# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
+# import intel_extension_for_pytorch as ipex
 
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
@@ -19,6 +23,18 @@
     pipeline,
 )
 
+import warnings
+
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="intel_extension_for_pytorch"
+)
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="torchvision.io.image", lineno=13
+)
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="transformers"
+)
+
 from load_models import (
     load_quantized_model_gguf_ggml,
     load_quantized_model_qptq,
@@ -34,7 +50,6 @@
     MODELS_PATH,
 )
 
-
 def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
     """
     Select a model for text generation using the HuggingFace library.
@@ -65,7 +80,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
         else:
             model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
     else:
-        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
+        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
+        if device_type == "xpu":
+            class CustomLLM(LLM):
+                def _call(self, prompt, stop=None, run_manager=None) -> str:
+                    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+                    result = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKENS)
+                    result = tokenizer.decode(result[0])
+                    return result
+                @property
+                def _llm_type(self) -> str:
+                    return "custom"
+
+            llm = CustomLLM()
+            return llm
 
     # Load configuration from the model to avoid warnings
     generation_config = GenerationConfig.from_pretrained(model_id)