diff --git a/load_models.py b/load_models.py
index b7fa5a37..59ec3922 100644
--- a/load_models.py
+++ b/load_models.py
@@ -9,6 +9,8 @@
     LlamaForCausalLM,
     LlamaTokenizer,
 )
+# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
+# import intel_extension_for_pytorch as ipex
 
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
 
@@ -131,6 +133,13 @@ def load_full_model(model_id, model_basename, device_type, logging):
         logging.info("Using LlamaTokenizer")
         tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
         model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+    elif device_type.lower() == "xpu":
+        logging.info("Using LlamaTokenizer")
+        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
+        logging.info("Using AutoModelForCausalLM")
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, cache_dir="./models")
+        model = model.to('xpu')
+        model = ipex.optimize(model)
     else:
         logging.info("Using AutoModelForCausalLM for full models")
         tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
diff --git a/run_localGPT.py b/run_localGPT.py
index a1816131..661a5df2 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -7,6 +7,10 @@
 from langchain.llms import HuggingFacePipeline
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler  # for streaming response
 from langchain.callbacks.manager import CallbackManager
+from langchain.llms.base import LLM
+
+# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
+# import intel_extension_for_pytorch as ipex
 
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
@@ -19,6 +23,18 @@
     pipeline,
 )
 
+import warnings
+
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="intel_extension_for_pytorch"
+)
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="torchvision.io.image", lineno=13
+)
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="transformers"
+)
+
 from load_models import (
     load_quantized_model_gguf_ggml,
     load_quantized_model_qptq,
@@ -34,7 +50,6 @@
     MODELS_PATH,
 )
 
-
 def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
     """
     Select a model for text generation using the HuggingFace library.
@@ -65,7 +80,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
         else:
             model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
     else:
-        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
+        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
+        if device_type == "xpu":
+            class CustomLLM(LLM):
+                def _call(self, prompt, stop=None, run_manager=None) -> str:
+                    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+                    result = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKENS)
+                    result = tokenizer.decode(result[0])
+                    return result
+                @property
+                def _llm_type(self) -> str:
+                    return "custom"
+
+            llm = CustomLLM()
+            return llm
 
     # Load configuration from the model to avoid warnings
     generation_config = GenerationConfig.from_pretrained(model_id)