diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile
new file mode 100644
index 0000000000000..16924e3937c90
--- /dev/null
+++ b/.devops/openvino.Dockerfile
@@ -0,0 +1,134 @@
+ARG OPENVINO_VERSION_MAJOR=2025.2
+ARG OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
+ARG UBUNTU_VERSION=24.04
+
+# Optional proxy build arguments - empty by default
+ARG http_proxy=
+ARG https_proxy=
+
+## Build Image
+FROM ubuntu:${UBUNTU_VERSION} AS build
+
+# Pass proxy args to build stage
+ARG http_proxy
+ARG https_proxy
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    ca-certificates \
+    gnupg \
+    wget \
+    git \
+    cmake \
+    ninja-build \
+    build-essential \
+    libtbb12 \
+    libcurl4-openssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install OpenVINO for Ubuntu 24.04
+ARG OPENVINO_VERSION_MAJOR
+ARG OPENVINO_VERSION_FULL
+RUN mkdir -p /opt/intel && \
+    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
+    cd - && \
+    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+
+ENV OpenVINO_DIR=/opt/intel/openvino
+
+WORKDIR /app
+
+COPY . .
+
+# Build Stage
+RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
+    cmake -B build/ReleaseOV -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DGGML_OPENVINO=ON && \
+    cmake --build build/ReleaseOV -j$(nproc)"
+
+# Copy all necessary libraries
+RUN mkdir -p /app/lib && \
+    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
+    find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
+    find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
+
+# Create runtime directories and copy binaries
+RUN mkdir -p /app/full \
+    && cp build/ReleaseOV/bin/* /app/full/ \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base Runtime Image
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+# Pass proxy args to runtime stage
+ARG http_proxy
+ARG https_proxy
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 libtbb12 curl \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app/
+
+### Full (all binaries)
+FROM base AS full
+
+ARG http_proxy
+ARG https_proxy
+
+COPY --from=build /app/full /app/
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    git \
+    python3 \
+    python3-venv \
+    python3-pip && \
+    python3 -m venv /ov-venv && \
+    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
+
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app/
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d4ed3ce7e17a3..9d5a00bec76a1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -596,6 +596,45 @@ jobs:
             -DGGML_SYCL_F16=ON
           cmake --build build --config Release -j $(nproc)
 
+  ubuntu-24-cmake-openvino:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-24-cmake-openvino-no-preset-v1
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          export OPENVINO_VERSION_MAJOR=2025.2
+          export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
+          sudo apt-get update
+          sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+          sudo mkdir -p /opt/intel
+          wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
+          tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
+          sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+          rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
+          cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
+          sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/openvino/setupvars.sh
+          cmake -B build/ReleaseOV -G Ninja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENVINO=ON
+          cmake --build build/ReleaseOV --config Release -j $(nproc)
+
   build-linux-cross:
     uses: ./.github/workflows/build-linux-cross.yml
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 2067927be56ca..1dcf7f36c1d49 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -44,6 +44,7 @@ jobs:
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
     steps:
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4ed6126f487c0..ab3c734b38217 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -240,6 +240,63 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
           name: llama-bin-ubuntu-vulkan-x64.zip
 
+  ubuntu-24-openvino:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-24-cmake-openvino-release-no-preset-v1
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          export OPENVINO_VERSION_MAJOR=2025.2
+          export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
+          sudo apt-get update
+          sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+          sudo mkdir -p /opt/intel
+          wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
+          tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
+          sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+          rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
+          cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
+          sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/openvino/setupvars.sh
+          cmake -B build/ReleaseOV -G Ninja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENVINO=ON
+          cmake --build build/ReleaseOV --config Release -j $(nproc)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/ReleaseOV/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip ./build/ReleaseOV/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip
+          name: llama-bin-ubuntu-openvino-x64.zip
+
   windows-cpu:
     runs-on: windows-2025
diff --git a/ci/run.sh b/ci/run.sh
index 4d3abf9232212..bce498f860e89 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -22,6 +22,9 @@
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with OPENVINO support
+# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -93,6 +96,15 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
     MUSA_ARCH=${MUSA_ARCH:-21}
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
+
+if [ ! -z ${GG_BUILD_OPENVINO} ]; then
+    if [ -z ${OpenVINO_DIR} ]; then
+        echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
+        echo "source /opt/intel/openvino/setupvars.sh"
+        exit 1
+    fi
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
+fi
 
 ## helpers
 
 # download a file if it does not exist or if it is outdated
diff --git a/docs/build.md b/docs/build.md
index dd486fe293546..4040552137cb7 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -13,6 +13,21 @@ cd llama.cpp
 
 The following sections describe how to build with different backends and options.
 
+* [CPU Build](#cpu-build)
+* [BLAS Build](#blas-build)
+* [Metal Build](#metal-build)
+* [SYCL](#sycl)
+* [CUDA](#cuda)
+* [MUSA](#musa)
+* [HIP](#hip)
+* [Vulkan](#vulkan)
+* [CANN](#cann)
+* [Arm® KleidiAI™](#arm-kleidiai)
+* [OpenCL](#opencl)
+* [Android](#android-1)
+* [OpenVINO](#openvino)
+* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
+
 ## CPU Build
 
 Build llama.cpp using `CMake`:
@@ -578,6 +593,127 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
 
 To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
 
+## OpenVINO
+
+[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and at the edge.
+The OpenVINO backend improves performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
+
+Follow the instructions below to install the OpenVINO runtime and build llama.cpp with OpenVINO support.
+
+### Prerequisites
+
+- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
+- **For Intel GPU or NPU usage**: install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
+- Git, CMake, and Ninja are needed for building:
+```bash
+sudo apt-get update
+sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+```
+
+### 1. Install OpenVINO Runtime
+
+- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
+
+<details>
+<summary>📦 Click to expand OpenVINO 2025.2 installation commands on Linux</summary>
+
+```bash
+export OPENVINO_VERSION_MAJOR=2025.2
+export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
+sudo apt-get update
+sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+sudo mkdir -p /opt/intel
+wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
+tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
+sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
+cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
+sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+source /opt/intel/openvino/setupvars.sh
+```
+</details>
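Before building, it can also be worth confirming which devices OpenVINO can actually see on the machine. A minimal check, assuming the `openvino` Python package is importable (it ships with the archive and is picked up by `setupvars.sh`, or can be installed separately with `pip install openvino`):

```bash
# List the devices OpenVINO can target on this machine (e.g. CPU, GPU, NPU).
python3 -c "import openvino as ov; print(ov.Core().available_devices)"
```

If the GPU or NPU you expect is missing from this list, revisit the driver setup linked in the prerequisites above.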
+
+- Verify OpenVINO is initialized properly:
+```bash
+echo $OpenVINO_DIR
+```
+
+### 2. Build llama.cpp with OpenVINO Backend
+
+Clone the OpenVINO-enabled llama.cpp fork and build it:
+
+```bash
+git clone https://github.com/ravi9/llama.cpp.git
+cd llama.cpp
+git switch dev_backend_openvino
+
+# Build with OpenVINO support
+source /opt/intel/openvino/setupvars.sh
+cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
+cmake --build build/ReleaseOV --config Release -j $(nproc)
+```
+
+### 3. Download Sample Models
+
+Download models for testing:
+
+```bash
+# Create models directory
+mkdir -p ~/models/
+
+# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
+wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
+  -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf
+
+# Download model file: Phi-3-mini-4k-instruct-fp16.gguf
+wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \
+  -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf
+```
+
+### 4. Run Inference with the OpenVINO Backend
+
+When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
+
+```bash
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+# Default device is GPU if available. If not set, the backend automatically
+# selects the first available device in priority order: GPU, CPU, NPU.
+export GGML_OPENVINO_DEVICE=GPU
+
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
+```
+
+To run in chat mode:
+```bash
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
+```
+
+### Configuration Options
+
+Control OpenVINO behavior using these environment variables:
+
+- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, the backend automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU`, static compilation mode is enabled for optimal performance on Intel NPUs.
+- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: not yet supported on NPU devices.
+- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
+- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save the compute graph to `cgraph.txt`.
+- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
+- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging.
+- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging.
+
+### Example with Profiling
+
+```bash
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+export GGML_OPENVINO_PROFILING=1
+
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
+```
+
 ## Notes about GPU-accelerated backends
 
 The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
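The same environment variables carry over to the other binaries in `build/ReleaseOV/bin`, including `llama-server`. A minimal sketch of serving the sample model downloaded above (standard `llama-server` flags; device selection happens purely through the environment, and the `server` Docker image in this patch wires up the same binary with a health check on port 8080):

```bash
# Serve the model over an OpenAI-compatible HTTP API using the OpenVINO build.
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_DEVICE=GPU   # or CPU / NPU

./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf --host 0.0.0.0 --port 8080
```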
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 1fb7abeaf088f..b5a8a6fd31719 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -207,6 +207,8 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
 set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
                                             "ggml: sycl device architecture")
 
+option(GGML_OPENVINO                        "ggml: use OPENVINO"              OFF)
+
 option(GGML_OPENCL                          "ggml: use OpenCL"                OFF)
 option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
 option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"             ON)
@@ -278,6 +280,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-openvino.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h
new file mode 100644
index 0000000000000..151c48d40d067
--- /dev/null
+++ b/ggml/include/ggml-openvino.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <array>
+#include <cstddef>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_OPENVINO_NAME "OPENVINO"
+#define GGML_OPENVINO_MAX_DEVICES 16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU
+// and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void);
+
+GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
+// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description,
+//                                                                    size_t description_size);
+// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total);
+
+// GGML_BACKEND_API bool ggml_backend_openvino_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_openvino_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
+
+struct ggml_openvino_device_info {
+    int device_count;
+
+    struct openvino_device_info {
+        int     cc;               // compute capability
+        int     nsm;              // number of streaming multiprocessors
+        size_t  smpb;             // max. shared memory per block
+        size_t  smpbo;            // max. shared memory per block (with opt-in)
+        bool    vmm;              // virtual memory support
+        size_t  vmm_granularity;  // granularity of virtual memory
+        size_t  total_vram;
+    };
+
+    openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {};
+
+    std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_openvino_device_info & ggml_openvino_info();
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 177fb2821357f..caf923bb05a2d 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -383,6 +383,7 @@ ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(OpenCL)
+ggml_add_backend(OPENVINO)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 6c31513750c9b..59fe821d57334 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -65,6 +65,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_OPENVINO
+#include "ggml-openvino.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -192,6 +196,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
+#ifdef GGML_USE_OPENVINO
+        register_backend(ggml_backend_openvino_reg());
+#endif
 #ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
 #endif
@@ -584,6 +591,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("vulkan", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("openvino", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
     // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
     const char * backend_path = std::getenv("GGML_BACKEND_PATH");
diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format
new file mode 100644
index 0000000000000..63dc2c472a95d
--- /dev/null
+++ b/ggml/src/ggml-openvino/.clang-format
@@ -0,0 +1,143 @@
+---
+# Override root .clang-format
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+ReferenceAlignment: Left
+PointerAlignment: Left
+Cpp11BracedListStyle: true
+AccessModifierOffset: -4
+BinPackArguments: false
+BreakBeforeBraces: Attach
+IndentCaseBlocks: false
+IndentCaseLabels: false
+
+Language: Cpp
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: Left
+AlignConsecutiveBitFields: AcrossComments
+AlignConsecutiveMacros: AcrossComments
+# AlignConsecutiveShortCaseStatements: AcrossComments
+AlignEscapedNewlines: Left # LeftWithLastLine
+AlignOperands: Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 1
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: true
+BinPackParameters: true
+BitFieldColonSpacing: Both
+# BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Never
+BreakBeforeBinaryOperators: None
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: false
+# BreakBinaryOperations: Never
+BreakConstructorInitializers: AfterColon
+# BreakFunctionDefinitionParameters: false
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+# BreakTemplateDeclarations: Yes
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+DerivePointerAlignment: false
+DisableFormat: false
+EmptyLineBeforeAccessModifier: Leave
+EmptyLineAfterAccessModifier: Never
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex: '^<.*\.h>'
+    Priority: 1
+    SortPriority: 0
+  - Regex: '^<.*'
+    Priority: 2
+    SortPriority: 0
+  - Regex: '.*'
+    Priority: 3
+    SortPriority: 0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+InsertBraces: true # NOTE: may lead to incorrect formatting
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: CurrentLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+QualifierAlignment: Left
+#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+ReflowComments: false # IndentOnly
+SeparateDefinitionBlocks: Always
+SortIncludes: CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: Never
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard: c++17
+TabWidth: 4
+UseTab: Never
+WhitespaceSensitiveMacros: ['STRINGIZE']
+...
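As a quick orientation for the public API declared in `ggml-openvino.h` above, here is a minimal sketch of initializing and freeing the backend. This is illustrative only, not part of the patch: it assumes a build with `-DGGML_OPENVINO=ON`, and the standalone `main` is hypothetical.

```cpp
// Minimal sketch: initialize OpenVINO device 0, print the backend name, free it.
// Assumes ggml was built with -DGGML_OPENVINO=ON and this file links against ggml.
#include "ggml-backend.h"
#include "ggml-openvino.h"

#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_openvino_init(/*device=*/0);
    if (backend == nullptr || !ggml_backend_is_openvino(backend)) {
        std::fprintf(stderr, "failed to initialize the OpenVINO backend\n");
        return 1;
    }
    std::printf("initialized backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```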
diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt
new file mode 100644
index 0000000000000..216aa756a7a96
--- /dev/null
+++ b/ggml/src/ggml-openvino/CMakeLists.txt
@@ -0,0 +1,19 @@
+find_package(OpenVINO REQUIRED)
+
+file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
+file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")
+
+ggml_add_backend_library(ggml-openvino
+                         ${GGML_SOURCES_OPENVINO}
+                         ${GGML_HEADERS_OPENVINO}
+                        )
+
+target_link_libraries(ggml-openvino PRIVATE openvino::runtime)
+
+if (GGML_OPENVINO)
+    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
+    else()
+        message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
+endif()
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
new file mode 100644
index 0000000000000..09919c85052ca
--- /dev/null
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -0,0 +1,667 @@
+#include "ggml-decoder.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+#include <openvino/core/node.hpp>
+#include <openvino/core/partial_shape.hpp>
+#include <openvino/core/type/element_type.hpp>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/parameter.hpp>
+#include <openvino/runtime/tensor.hpp>
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
+                             int context_size, int num_heads, int num_heads_kv, int head_size) :
+    m_cgraph(cgraph),
+    m_node(node),
+    m_op_name(std::string(node->name)),
+    m_context_size(context_size),
+    m_num_heads(num_heads),
+    m_num_heads_kv(num_heads_kv),
+    m_head_size(head_size),
+    m_is_static(is_static),
+    m_is_first_token(is_first_token) {
+    set_input_output(node);
+}
+
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
+                             std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
+                             bool is_first_token) :
+    m_cgraph(cgraph),
+    m_op_name(m_node ? std::string(m_node->name) : ""),
+    m_model_weights(model_weights),
+    m_is_static(is_static),
+    m_is_first_token(is_first_token) {
+    if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+        print_tensor_address_map(cgraph);
+    }
+
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        std::string filename = "cgraph.txt";
+        dump_cgraph(cgraph, filename);
+    }
+
+    set_llm_params();
+
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+        auto* cur_node = cgraph->nodes[node_n];
+        m_nodes.push_back(cur_node);
+        set_input_output(cur_node);
+    }
+
+    add_extra_inputs();
+}
+
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
+    m_cgraph = cgraph;
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+        auto* cur_node = cgraph->nodes[node_n];
+        if (cur_node->op == GGML_OP_NONE) {
+            continue;
+        }
+        m_nodes.push_back(cur_node);
+        set_input_output(cur_node, true);
+    }
+}
+
+// Called in GgmlOvDecoder constructor. Three cases:
+// 1. constructing a decoder for the whole graph;
+// 2. constructing a decoder for a node;
+// 3. constructing a decoder for the whole graph naively (op test case)
+void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
+    std::string node_name;
+    if (node->op == GGML_OP_SET_ROWS) {
+        // SET_ROWS updates the tensor in place. For later OV ops that use the
+        // view_src of SET_ROWS, we need to make sure they get the updated tensor
+        // by putting the view_src name in the tensor_map in
+        // /src/frontends/ggml/src/translate_session.cpp
+        node_name = std::string(node->view_src->name);
+    } else {
+        node_name = std::string(node->name);
+    }
+
+    m_output_names.push_back(node_name);
+    m_outputs[node_name] = node;
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        auto* src = node->src[i];
+        if (src == nullptr) {
+            continue;
+        }
+        std::string src_name = std::string(src->name);
+        m_input_names.push_back(src_name);
+        m_inputs[src_name] = src;
+        m_op_node_name.emplace_back(src_name, ggml_op_name(node->op));
+
+        // Add model inputs and weights constants, if called for the whole graph
+        if (naive) {
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
+            param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
+            m_model_inputs[src_name] = param_node;
+
+        } else if (!m_node && !src->view_src) {
+            ggml_backend_buffer* buffer = src->buffer;
+
+            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
+                // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
+                if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
+                    assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
+                }
+                if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
+                    continue;
+                }
+                auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
+                param_node->set_friendly_name(src_name);
+                param_node->output(0).get_tensor().set_names({src_name});
+                m_model_inputs[src_name] = param_node;
+            }
+        }
+    }
+
+    // Add model outputs, if called for the whole graph
+    if (naive) {
+        m_model_output_names.push_back(node->name);
+    } else if (!m_node) {
+        static std::set<std::string> debug_output_names = {};
+        // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
+        if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
+            std::string(node->name).find("result") == 0 || debug_output_names.count(node->name)) {
+            auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name);
+            if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
+                assert(name.find("cache_k") == 0 || name.find("cache_v") == 0);
+            }
+            if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name);
+                it == m_model_output_names.end()) {
+                m_model_output_names.push_back(name);
+            }
+            if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), name); it == m_kv_names.end()) {
+                m_kv_names.push_back(name);
+            }
+        }
+    }
+
+    if (m_node) {
+        switch (node->op) {
+        case GGML_OP_RESHAPE: {
+            if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) {
+                m_op_case = 1;
+            } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) {
+                m_op_case = 2;
+            } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[1]) {
+                m_op_case = 3;
+            }
+            break;
+        }
+        case GGML_OP_CONT: {
+            if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
+                // The input comes from a PERMUTE
+                m_op_case = 1;
+            } else {
+                // The input comes from a VIEW which is a subtensor
+                m_op_case = 2;
+            }
+            break;
+        }
+        case GGML_OP_SET_ROWS: {
+            if (std::string(node->name).find("cache_k") == 0) {
+                m_op_case = 1;
+            } else {
+                m_op_case = 2;
+            }
+            break;
+        }
+        case GGML_OP_PERMUTE: {
+            if (node->src[0]->view_src == nullptr) {
+                // Permute Qcur
+                m_op_case = 1;
+            } else if (ggml_is_contiguous(node->src[0])) {
+                // Permute cache_k (view)
+                m_op_case = 2;
+            } else {
+                // Permute cache_v (view)
+                m_op_case = 3;
+            }
+            break;
+        }
+        case GGML_OP_GET_ROWS: {
+            if (node->src[1]->op == GGML_OP_VIEW) {
+                m_op_case = 2;
+            } else {
+                m_op_case = 1;
+            }
+            break;
+        }
+        case GGML_OP_ROPE: {
+            if (node->src[0]->op == GGML_OP_VIEW) {
+                m_op_case = 2;
+            } else {
+                m_op_case = 1;
+            }
+            break;
+        }
+        default:
+            break;
+        }
+    }
+}
+
+void GgmlOvDecoder::set_llm_params() {
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        auto* node = m_cgraph->nodes[i];
+        if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") {
+            auto* cache_k = node->src[0];
+            m_context_size = cache_k->ne[1];
+        } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
+            m_head_size = node->ne[0];
+            m_num_heads = node->ne[1];
+            m_rope_params = node->op_params;
+        } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") {
+            m_num_heads_kv = node->ne[1];
+        }
+    }
+}
+
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
+    auto name = std::string(src->name);
+    ov::PartialShape input_shape;
+    if (name == "inp_tokens" || name == "inp_pos") {
+        if (m_is_static) {
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{1, 1, m_context_size};
+            } else {
+                input_shape = ov::PartialShape{1, 1, 1};
+            }
+        } else {
+            input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
+        }
+    } else if (name == "inp_out_ids" && !m_is_static) {
+        input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
+    } else if (name == "KQ_mask") {
+        if (m_is_static) {
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{1, m_context_size, m_context_size};
+            } else {
+                input_shape = ov::PartialShape{1, 1, m_context_size};
+            }
+        } else {
+            auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
+            input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
+        }
+    } else if (name.find("cache_k") == 0) {
+        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
+    } else if (name.find("cache_v") == 0) {
+        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
+    } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
+        input_shape = ov::PartialShape{1, 1, -1};
+        if (m_is_static) {
+            if (m_is_first_token) {
+                // Dummy static shape, since the indices are not used in this case
+                input_shape = ov::PartialShape{1};
+            } else if (std::string(op->name).find("cache_k") == 0) {
+                input_shape = ov::PartialShape{1, 1, 1};
+            } else {
+                input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size};
+            }
+        }
+    } else if (src->op == GGML_OP_VIEW) {
+        // This case is added to make test-backend-ops work
+        input_shape = ov::PartialShape{get_shape(src->view_src)};
+    } else {
+        input_shape = ov::PartialShape{get_shape(src)};
+    }
+    return input_shape;
+}
+
+void GgmlOvDecoder::add_extra_inputs() {
+    // Extra inputs:
+    // 1. `attention_size`, used in the matmuls in the attention block. The shapes of those matmuls are 32-aligned,
+    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
+    //    Not used for NPU
+    int64_t attention_size = -1;
+    for (const auto& node : m_nodes) {
+        if (node->op == GGML_OP_SOFT_MAX) {
+            auto* mask = node->src[1];
+            if (std::string(mask->name).find("KQ_mask") != 0) {
+                throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name));
+            }
+            attention_size = mask->ne[0];
+            break;
+        }
+    }
+
+    {
+        std::string name = "attention_size";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
+
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = attention_size;
+        m_model_extra_input_values[name] = tensor;
+    }
+}
+
+const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const {
+    if (tensor == nullptr) {
+        return nullptr;
+    }
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        const auto* node = m_cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] == tensor) {
+                return node;
+            }
+        }
+    }
+    return nullptr;
+}
+
+const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const {
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        const auto* node = m_cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            const auto* src = node->src[j];
+            if (src == nullptr) {
+                break;
+            }
+            if (std::string(src->name) == name) {
+                return src;
+            }
+        }
+    }
+    return nullptr;
+}
+
+std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
+    std::map<std::string, std::string> kv_param_res_names;
+    for (const auto& name : m_kv_names) {
+        if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
+            kv_param_res_names[name] = name;
+        }
+    }
+    return kv_param_res_names;
+}
+
+std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
+    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
+    static std::mutex weights_mutex;
+    auto* nodes = cgraph->nodes;
+    auto n_nodes = cgraph->n_nodes;
+    std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            auto* src = node->src[i];
+            if (src == nullptr) {
+                continue;
+            }
+
+            std::string src_name(src->name);
+            if (!src->view_src) {
+                ggml_backend_buffer* buffer = src->buffer;
+                if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                    bool should_create = false;
+                    {
+                        std::lock_guard<std::mutex> lock(weights_mutex);
+                        if (model_weights.find(src_name) == model_weights.end()) {
+                            model_weights[src_name] = nullptr;
+                            should_create = true;
+                        }
+                    }
+                    if (should_create) {
+                        auto weight_node = create_weight_node(src);
+                        weight_node->set_friendly_name(src_name);
+                        {
+                            std::lock_guard<std::mutex> lock(weights_mutex);
+                            model_weights[src_name] = weight_node;
+                        }
+                    }
+                }
+            }
+        }
+    });
+    return model_weights;
+}
+
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
+    auto node_type = get_ov_type(tensor);
+    auto node_shape = get_shape(tensor);
+    auto ne_total = ggml_nelements(tensor);
+    ov::Tensor weights(node_type, node_shape);
+    memcpy(weights.data(), tensor->data, ne_total * node_type.size());
+    return std::make_shared<ov::op::v0::Constant>(weights);
+}
+
+void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) {
+    std::ofstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Failed to open file" << std::endl;
+        return;
+    }
+
+    file << "=== GRAPH ===\n";
+
+    // clang-format off
+    file << "n_nodes = " << cgraph->n_nodes << "\n";
+    file << "  " << std::setw(3) << "nodes"
+         << std::setw(15) << "shape"
+         << std::setw(20) << "op"
+         << std::setw(20) << "name"
+         << std::setw(3) << " "
+         << std::setw(50) << "stride"
+         << "\n";
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        file << " - " << std::setw(3) << i << ": [ "
+             << std::setw(5) << node->ne[0] << ", "
+             << std::setw(5) << node->ne[1] << ", "
+             << std::setw(5) << node->ne[2] << ", "
+             << std::setw(5) << node->ne[3] << "] "
+             << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
+             << std::left << std::setw(45) << node->name << std::right
+             << std::setw(2) << "[ "
+             << std::setw(0) << node->nb[0] << ", "
+             << std::setw(5) << node->nb[1] << ", "
+             << std::setw(5) << node->nb[2] << ", "
+             << std::setw(5) << node->nb[3] << "] "
+             << "\n";
+
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            if (auto* src = node->src[i]) {
+                file << std::setw(10) << " [ "
+                     << std::setw(5) << src->ne[0] << ", "
+                     << std::setw(5) << src->ne[1] << ", "
+                     << std::setw(5) << src->ne[2] << ", "
+                     << std::setw(5) << src->ne[3] << "] "
+                     << std::setw(12)
+                     << i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
+                file << std::left << std::setw(30) << src->name << std::right
+                     << std::setw(16) << "[ "
+                     << std::setw(0) << src->nb[0] << ", "
+                     << std::setw(5) << src->nb[1] << ", "
+                     << std::setw(5) << src->nb[2] << ", "
+                     << std::setw(5) << src->nb[3] << "] "
+                     << "\n";
+            }
+        }
+    }
+
+    file << "n_leafs = " << cgraph->n_leafs << "\n";
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * node = cgraph->leafs[i];
+
+        file << " - " << std::setw(3) << i << ": [ "
+             << std::setw(5) << node->ne[0] << ", "
+             << std::setw(5) << node->ne[1] << "] "
+             << std::setw(8) << ggml_op_name(node->op) << " "
+             << std::setw(16) << ggml_get_name(node) << "\n";
+    }
+    // clang-format on
+    file << "========================================\n";
+
+    file.close();
+}
+
+void print_tensor_address_map(const struct ggml_cgraph* cgraph) {
+    std::map<void*, std::vector<std::string>> address_map;
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+        auto* node = cgraph->nodes[node_n];
+        if (node->data) {
+            auto it = address_map.find(node->data);
+            if (it == address_map.end()) {
+                address_map[node->data] = std::vector<std::string>();
+            }
+            address_map[node->data].push_back(node->name);
+        }
+    }
+    for (const auto& pair : address_map) {
+        std::cout << "Address: " << pair.first << std::endl;
+        for (const auto& name : pair.second) {
+            std::cout << name << " ; ";
+        }
+        std::cout << std::endl << std::endl;
+    }
+}
+
+std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
+    std::vector<size_t> shape;
+    for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
+        shape.push_back(static_cast<size_t>(tensor->ne[i]));
+    }
+    return shape;
+}
+
+std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
+    std::vector<size_t> stride;
+    for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
+        stride.push_back(static_cast<size_t>(tensor->nb[i]));
+    }
+    return stride;
+}
+
+ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
+    switch (tensor->type) {
+    case GGML_TYPE_F64:
+        return ov::element::f64;
+    case GGML_TYPE_F32:
+        return ov::element::f32;
+    case GGML_TYPE_F16:
+        return ov::element::f16;
+    case GGML_TYPE_BF16:
+        return ov::element::bf16;
+    case GGML_TYPE_I8:
+        return ov::element::i8;
+    case GGML_TYPE_I16:
+        return ov::element::i16;
+    case GGML_TYPE_I32:
+        return ov::element::i32;
+    case GGML_TYPE_I64:
+        return ov::element::i64;
+    default:
+        throw std::runtime_error("Unsupported tensor type");
+    }
+}
+
+ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
+    return ov::PartialShape(get_shape(m_inputs.at(name)));
+}
+
+std::vector<size_t> GgmlOvDecoder::get_input_stride(const std::string& name) const {
+    return get_stride(m_inputs.at(name));
+}
+
+ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const {
+    return get_ov_type(m_inputs.at(name));
+}
+
+size_t GgmlOvDecoder::get_input_size() const {
+    return m_input_names.size();
+}
+
+std::string& GgmlOvDecoder::get_input_name(size_t index) const {
+    m_name = m_input_names[index];
+    return m_name;
+}
+
+std::vector<std::string> GgmlOvDecoder::get_input_names() const {
+    return m_input_names;
+}
+
+std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string& name) const {
+    return get_stride(m_outputs.at(name));
+}
+
+ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const {
+    return ov::PartialShape(get_shape(m_outputs.at(name)));
+}
+
+ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const {
+    return get_ov_type(m_outputs.at(name));
+}
+
+std::string& GgmlOvDecoder::get_output_name(size_t index) const {
+    m_name = std::string(m_output_names[index]);
+    return m_name;
+}
+
+std::vector<std::string> GgmlOvDecoder::get_output_names() const {
+    return m_output_names;
+}
+
+const std::string& GgmlOvDecoder::get_op_name() const {
+    return m_op_name;
+}
+
+int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const {
+    return m_inputs.at(name)->op_params;
+}
+
+int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
+    return m_outputs.at(name)->op_params;
+}
+
+void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
+    for (const auto& node : m_nodes) {
+        auto decoder = std::make_shared<GgmlOvDecoder>(
+            node, m_cgraph, m_is_static, m_is_first_token, m_context_size, m_num_heads, m_num_heads_kv, m_head_size);
+        node_visitor(decoder);
+    }
+}
+
+const std::string& GgmlOvDecoder::get_op_type() const {
+    static const std::map<ggml_op, std::string> ops = {
+        {GGML_OP_NONE,      "GGML_OP_NONE"     },
+        {GGML_OP_ACC,       "GGML_OP_ACC"      },
+        {GGML_OP_ADD,       "GGML_OP_ADD"      },
+        {GGML_OP_ADD1,      "GGML_OP_ADD1"     },
+        {GGML_OP_CONT,      "GGML_OP_CONT"     },
+        {GGML_OP_DIV,       "GGML_OP_DIV"      },
+        {GGML_OP_DUP,       "GGML_OP_DUP"      },
+        {GGML_OP_GET_ROWS,  "GGML_OP_GET_ROWS" },
+        {GGML_OP_MUL,       "GGML_OP_MUL"      },
+        {GGML_OP_MUL_MAT,   "GGML_OP_MUL_MAT"  },
+        {GGML_OP_PERMUTE,   "GGML_OP_PERMUTE"  },
+        {GGML_OP_RESHAPE,   "GGML_OP_RESHAPE"  },
+        {GGML_OP_RMS_NORM,  "GGML_OP_RMS_NORM" },
+        {GGML_OP_ROPE,      "GGML_OP_ROPE"     },
+        {GGML_OP_SCALE,     "GGML_OP_SCALE"    },
+        {GGML_OP_SOFT_MAX,  "GGML_OP_SOFT_MAX" },
+        {GGML_OP_SUB,       "GGML_OP_SUB"      },
+        {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
+        {GGML_OP_VIEW,      "GGML_OP_VIEW"     },
+        {GGML_OP_SET_ROWS,  "GGML_OP_SET_ROWS" },
+    };
+    static const std::map<ggml_unary_op, std::string> unary_ops = {
+        {GGML_UNARY_OP_ABS,         "GGML_UNARY_OP_ABS"        },
+        {GGML_UNARY_OP_SGN,         "GGML_UNARY_OP_SGN"        },
+        {GGML_UNARY_OP_NEG,         "GGML_UNARY_OP_NEG"        },
+        {GGML_UNARY_OP_STEP,        "GGML_UNARY_OP_STEP"       },
+        {GGML_UNARY_OP_TANH,        "GGML_UNARY_OP_TANH"       },
+        {GGML_UNARY_OP_ELU,         "GGML_UNARY_OP_ELU"        },
+        {GGML_UNARY_OP_RELU,        "GGML_UNARY_OP_RELU"       },
+        {GGML_UNARY_OP_SIGMOID,     "GGML_UNARY_OP_SIGMOID"    },
+        {GGML_UNARY_OP_GELU,        "GGML_UNARY_OP_GELU"       },
+        {GGML_UNARY_OP_GELU_QUICK,  "GGML_UNARY_OP_GELU_QUICK" },
+        {GGML_UNARY_OP_SILU,        "GGML_UNARY_OP_SILU"       },
+        {GGML_UNARY_OP_HARDSWISH,   "GGML_UNARY_OP_HARDSWISH"  },
+        {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
+        {GGML_UNARY_OP_EXP,         "GGML_UNARY_OP_EXP"        },
+        {GGML_UNARY_OP_COUNT,       "GGML_UNARY_OP_COUNT"      }
+    };
+    static const std::map<ggml_glu_op, std::string> glu_ops = {
+        {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
+        {GGML_GLU_OP_GEGLU,  "GGML_GLU_OP_GEGLU" },
+        {GGML_GLU_OP_REGLU,  "GGML_GLU_OP_REGLU" }
+    };
+
+    switch (m_node->op) {
+    case GGML_OP_UNARY:
+        return unary_ops.at(ggml_get_unary_op(m_node));
+    case GGML_OP_GLU:
+        return glu_ops.at(ggml_get_glu_op(m_node));
+    default:
+        return ops.at(m_node->op);
+    }
+    static const std::string unknown_op = "UNKNOWN_GGML_OP";
+    return unknown_op;
+}
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
new file mode 100644
index 0000000000000..ae378273d32e0
--- /dev/null
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -0,0 +1,163 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "openvino/decoder.hpp"
+
+class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
+public:
+    // Graph decoder
+    GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
+                  bool is_static, bool is_first_token);
+
+    // Node decoder, called in GgmlOvDecoder::visit_subgraph
+    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
+                  int context_size, int num_heads, int num_heads_kv, int head_size);
+
+    // Naive graph decoder
+    GgmlOvDecoder(struct ggml_cgraph* cgraph);
+
+    virtual ov::Any get_attribute(const std::string& name) const override {
+        return nullptr;
+        GGML_UNUSED(name);
+    }
+
+    virtual ov::PartialShape get_input_shape(const std::string& name) const override;
+
+    virtual std::vector<size_t> get_input_stride(const std::string& name) const override;
+
+    virtual ov::element::Type get_input_type(const std::string& name) const override;
+
+    virtual size_t get_input_size() const override;
+
+    virtual void get_input_node(size_t input_port_idx,
+                                std::string& producer_name,
+                                std::string& producer_output_port_name,
+                                size_t& producer_output_port_index) const override {
+        GGML_UNUSED(input_port_idx);
+        GGML_UNUSED(producer_name);
+        GGML_UNUSED(producer_output_port_name);
+        GGML_UNUSED(producer_output_port_index);
+    }
+
+    virtual std::string& get_input_name(size_t index) const override;
+
+    virtual std::vector<std::string> get_input_names() const override;
+
+    virtual ov::PartialShape get_output_shape(const std::string& name) const override;
+
+    virtual std::vector<size_t> get_output_stride(const std::string& name) const override;
+
+    virtual ov::element::Type get_output_type(const std::string& name) const override;
+
+    virtual int32_t* get_input_op_params(const std::string& name) const override;
+
+    virtual int32_t* get_output_op_params(const std::string& name) const override;
+
+    virtual std::string& get_output_name(size_t index) const override;
+
+    virtual std::vector<std::string> get_output_names() const override;
+
+    virtual const std::string& get_op_type() const override;
+
+    virtual const std::string& get_op_name() const override;
+
+    virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const override;
+
+    const ggml_tensor* get_input_ggml_tensor(const std::string& name) const {
+        return m_inputs.at(name);
+    }
+
+    const ggml_tensor* get_output_ggml_tensor(const std::string& name) const {
+        return m_outputs.at(name);
+    }
+
+    virtual int get_op_case() const override {
+        return m_op_case;
+    }
+
+    virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_inputs() const override {
+        return m_model_inputs;
+    }
+    virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_extra_inputs() const override {
+        return m_model_extra_inputs;
+    }
+    virtual const std::map<std::string, std::shared_ptr<ov::Tensor>>& get_model_extra_input_values() const {
+        return m_model_extra_input_values;
+    }
+    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
+        return m_model_weights;
+    }
+    virtual const std::vector<std::string>& get_model_output_names() const override {
+        return m_model_output_names;
+    }
+
+    virtual int get_context_size() const override { return m_context_size; }
+
+    virtual int get_num_heads() const override { return m_num_heads; }
+
+    virtual int get_num_heads_kv() const override { return m_num_heads_kv; }
+
+    virtual int get_head_size() const override { return m_head_size; }
+
+    virtual int32_t* get_rope_params() const override { return m_rope_params; }
+
+    virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
+
+    virtual bool is_static() const override { return m_is_static; }
+
+    virtual bool is_first_token() const override { return m_is_first_token; }
+
+    ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
+
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
+
+    const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
+    const ggml_tensor* get_tensor_from_name(const std::string& name) const;
+
+    void clear_model_weights() { m_model_weights.clear(); }
+
+private:
+    void set_input_output(ggml_tensor* node, bool naive = false);
+    void add_extra_inputs();
+    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
+    static std::vector<size_t> get_shape(const ggml_tensor* tensor);
+    static std::vector<size_t> get_stride(const ggml_tensor* tensor);
+    static ov::element::Type get_ov_type(const ggml_tensor* tensor);
+
+    // set context_size, num_heads, etc
+    void set_llm_params();
+
+    struct ggml_cgraph* m_cgraph = nullptr;
+    ggml_tensor* m_node = nullptr;
+    std::vector<ggml_tensor*> m_nodes;
+    std::map<std::string, ggml_tensor*> m_inputs;
+    std::vector<std::string> m_input_names;
+    std::map<std::string, ggml_tensor*> m_outputs;
+    std::vector<std::string> m_output_names;
+    std::string m_op_name;
+    mutable std::string m_name;
+    int m_op_case = 0;
+    std::vector<std::pair<std::string, std::string>> m_op_node_name;
+    std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> m_model_inputs;
+    std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> m_model_extra_inputs;
+    std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
+    std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
+    std::vector<std::string> m_model_output_names;
+    int m_context_size;
+    int m_num_heads;
+    int m_num_heads_kv;
+    int m_head_size;
+    int32_t* m_rope_params;
+    std::vector<std::string> m_kv_names;
+    bool m_is_static;
+    bool m_is_first_token;
+};
+
+void print_tensor_address_map(const struct ggml_cgraph* cgraph);
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
new file mode 100644
index 0000000000000..13c2ef74628c7
--- /dev/null
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -0,0 +1,544 @@
+#include "ggml-openvino.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "ggml-openvino/utils.h"
+#include "ggml.h"
+
+#define GGML_OPENVINO_MAX_STREAMS 8
+
+struct ggml_backend_openvino_context {
+    int device;                 // the device ID currently in use
+    std::string name;           // context name
+    std::string description;    // context description
+
+    // OpenVINO core components
+    ov::Core core;                             // OpenVINO core interface
+    std::shared_ptr<ov::CompiledModel> model;  // compiled model
+    ov::InferRequest infer_request;            // inference request
+
+    // OpenVINO multi-stream support
+    static const int MAX_STREAMS = 8;          // maximum number of streams
+    std::vector<ov::InferRequest> streams;     // used to support multi-stream inference
+    int current_stream;                        // the currently active stream index
+
+    // state management
+    bool is_initialized;                       // whether the context has been initialized
+
+    ggml_backend_openvino_context()
+        : device(0), name("OpenVINO"), description("OpenVINO Backend Context"),
+          current_stream(0), is_initialized(false) {}
+};
+
+static void ggml_backend_openvino_free(ggml_backend_t backend) {
+    ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context;
+    delete ctx;
+    delete backend;
+}
+
+static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) {
+    return GGML_OPENVINO_NAME;
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status
+ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) {
+    openvino_frontend_compute(backend, cgraph);
+
+    return GGML_STATUS_SUCCESS;
+}
+
+static const ggml_backend_i ggml_backend_openvino_interface = {
+    /* .get_name                = */ ggml_backend_openvino_get_name,
+    /* .free                    = */ ggml_backend_openvino_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_openvino_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+int ggml_backend_openvino_get_device_count() {
+    return ggml_openvino_info().device_count;
+}
+
+static ggml_guid_t ggml_backend_openvino_guid(void) {
+    static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
+    return &guid;
+}
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
+    if (device < 0 || device >= ggml_backend_openvino_get_device_count()) {
+        GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    ggml_backend_openvino_context * ctx = new ggml_backend_openvino_context;
+    if (ctx == nullptr) {
+        GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
+        return nullptr;
+    }
+
+    ggml_backend_t openvino_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_openvino_guid(),
+        /* .interface = */ ggml_backend_openvino_interface,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device),
+        /* .context   = */ ctx,
+    };
+
+    return openvino_backend;
+}
+
+GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid());
+}
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
+    GGML_ASSERT(device >= 0);
+    return ggml_backend_cpu_buffer_type();
+    GGML_UNUSED(device);
+}
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split) {
+    GGML_ASSERT(tensor_split != nullptr);
+    return nullptr;
+}
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU
+// and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void) {
+    return nullptr;
+}
+
+struct ggml_backend_openvino_buffer_type_context {
+    int device;
+    std::string name;
+};
+
+static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
+}
+
+
+static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return GGML_OPENVINO_NAME "_Split";
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_buft_is_openvino_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_openvino_split_buffer_type_get_name;
+}
+
+struct ggml_backend_openvino_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+// TODO
+static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    GGML_ASSERT(dev->context != nullptr);
+    GGML_ASSERT(free != nullptr);
+    GGML_ASSERT(total != nullptr);
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
+    // Placeholder
+    GGML_ASSERT(ctx->device >= 0);
+    // ggml_openvino_set_device(ctx->device);
+}
+
+static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+}
+
+static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_openvino_device_get_name(dev);
+    props->description = ggml_backend_openvino_device_get_description(dev);
+    props->type        = ggml_backend_openvino_device_get_type(dev);
+    ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr;
+#ifdef GGML_OPENVINO_NO_PEER_COPY
+    bool events = false;
+#else
+    bool events = true;
+#endif
+
+    props->caps = {
+        /* .async                 = */ true,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ events,
+    };
+}
+
+static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
+    return ggml_backend_openvino_init(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
+    return ggml_backend_openvino_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_openvino_host_buffer_type();
+}
+
+static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(ptr);
+    GGML_UNUSED(size);
+    GGML_UNUSED(max_tensor_size);
+    return nullptr;
+}
+
+static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(ptr);
+    GGML_UNUSED(size);
+    GGML_UNUSED(max_tensor_size);
+    return nullptr;
+}
+
+static bool is_op_unsupported_case(const ggml_tensor* op) {
+    if (op->op == GGML_OP_SOFT_MAX) {
+        if (op->src[2] != nullptr) {
+            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
+            return true;
+        }
+        float scale = 1.0f;
+        float max_bias = 0.0f;
+        const auto* op_params = op->op_params;
+        memcpy(&scale, (const float*) op_params + 0, sizeof(float));
+        memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
+        const uint32_t h = op->src[0]->ne[2];
+        const uint32_t n_head = op->src[0]->ne[0];
+        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+        const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
+        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+        const float slope =
+            (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
+
+        if (slope != 1.0f) {
+            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n");
+            return true;
+        }
+    }
+
+    if (op->op == GGML_OP_PERMUTE) {
+        if (op->type == GGML_TYPE_BF16) {
+            // err msg: [GPU] Could not find a suitable kernel for transpose
+            GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n");
+            return true;
+        }
+    }
+
+    if (op->op == GGML_OP_MUL_MAT) {
+        if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) ||
+            (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) {
+            GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n");
+            return true;
+        }
+        if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
+            // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
+            GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
+            return true;
+        }
+    }
+
+    if (op->op == GGML_OP_ROPE) {
+        const int32_t* op_params = op->op_params;
+        const int n_dims = op_params[1];
+        const int mode = op_params[2];
+        if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) {
+            GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
+            return true;
+        }
+        if (n_dims != op->src[0]->ne[0]) {
+            GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n",
+                          n_dims,
+                          op->src[0]->ne[0]);
+            return true;
+        }
+        if (op->type != GGML_TYPE_F32) {
+            GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
+            return true;
+        }
+        float freq_scale;
+        memcpy(&freq_scale, op_params + 6, sizeof(float));
+        if (freq_scale != 1.0f) {
+            GGML_LOG_WARN("OpenVINO backend does not support ROPE 
with freq_scale %f != 1.0f\n", freq_scale);
+            return true;
+        }
+        float ext_factor;
+        memcpy(&ext_factor, op_params + 7, sizeof(float));
+        if (ext_factor != 0.0f) {
+            GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
+            return true;
+        }
+        if (op->src[0]->op == GGML_OP_VIEW) {
+            if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
+                GGML_LOG_WARN(
+                    "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n",
+                    op->src[0]->view_src->ne[1],
+                    op->src[0]->ne[2]);
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
+    GGML_ASSERT(dev->reg != nullptr);
+
+    static const std::set<ggml_type> supported_types{
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32};
+
+    static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
+                                                 GGML_OP_ADD,
+                                                 GGML_OP_MUL,
+                                                 GGML_OP_MUL_MAT,
+                                                 GGML_OP_VIEW,
+                                                 GGML_OP_CONT,
+                                                 GGML_OP_RESHAPE,
+                                                 GGML_OP_PERMUTE,
+                                                 GGML_OP_TRANSPOSE,
+                                                 GGML_OP_GET_ROWS,
+                                                 GGML_OP_ROPE,
+                                                 GGML_OP_RMS_NORM,
+                                                 GGML_OP_SCALE,
+                                                 GGML_OP_SOFT_MAX,
+                                                 GGML_OP_SET_ROWS};
+    static const std::set<ggml_unary_op> supported_unary_ops{
+        GGML_UNARY_OP_SILU,
+    };
+    static const std::set<ggml_glu_op> supported_glu_ops{
+        GGML_GLU_OP_SWIGLU,
+    };
+
+    switch (op->op) {
+    case GGML_OP_UNARY: {
+        auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end();
+        if (!supported) {
+            GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op)));
+            return false;
+        }
+        break;
+    }
+    case GGML_OP_GLU: {
+        auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end();
+        if (!supported) {
+            GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op)));
+            return false;
+        }
+        break;
+    }
+    default: {
+        auto supported = supported_ops.find(op->op) != supported_ops.end();
+        if (!supported) {
+            GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op));
+            return false;
+        }
+    }
+    }
+
+    if (supported_types.find(op->type) == supported_types.end()) {
+        GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
+        return false;
+    }
+    if (op->ne[3] != 1) {
+        GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] != nullptr && supported_types.find(op->src[i]->type) == supported_types.end()) {
+            GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->src[i]->type));
+            return false;
+        }
+        if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) {
+            GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
+            return false;
+        }
+    }
+
+    if (is_op_unsupported_case(op)) {
+        return false;
+    }
+    return true;
+}
+
+static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_openvino_device_interface = {
+    /* .get_name = */ ggml_backend_openvino_device_get_name,
+    /* .get_description = */ ggml_backend_openvino_device_get_description,
+    /* .get_memory = */ ggml_backend_openvino_device_get_memory,
+    /* .get_type = */ ggml_backend_openvino_device_get_type,
+    /* .get_props = */ ggml_backend_openvino_device_get_props,
+    /* .init_backend = */ ggml_backend_openvino_device_init,
+    /* .get_buffer_type = */
ggml_backend_openvino_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_openvino_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_openvino_device_supports_op, + /* .supports_buft = */ ggml_backend_openvino_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +struct ggml_backend_openvino_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) { + return GGML_OPENVINO_NAME; + GGML_UNUSED(reg); +} + +static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) { + return ggml_openvino_info().device_count; + GGML_UNUSED(reg); + + // TODO + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; + // GGML_ASSERT(index == 0); + + // static ggml_backend_device ggml_backend_openvino_device = { + // /* .iface = */ ggml_backend_openvino_device_interface, + // /* .reg = */ reg, + // /* .context = */ nullptr, + // }; + + // return &ggml_backend_openvino_device; + + // GGML_UNUSED(reg); + // GGML_UNUSED(index); +} + +static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { + return (void *)ggml_backend_openvino_split_buffer_type; + } + // if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { + // return (void *)ggml_backend_openvino_register_host_buffer; + // } + // if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { + // return (void *)ggml_backend_openvino_unregister_host_buffer; + // } + return nullptr; +} + +static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = { + /* .get_name = */ ggml_backend_openvino_reg_get_name, + /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count, + /* .get_device = */ ggml_backend_openvino_reg_get_device, + /* .get_proc_address = */ ggml_backend_openvino_get_proc_address, +}; + +static int get_openvino_device_count() { + ov::Core core; + auto devices = core.get_available_devices(); + // return devices.size(); + return 1; +} + +static ggml_openvino_device_info ggml_openvino_init() { + ggml_openvino_device_info info = {}; + // TODO + info.device_count = get_openvino_device_count(); + return info; +} + +const ggml_openvino_device_info & ggml_openvino_info() { + static ggml_openvino_device_info info = ggml_openvino_init(); + return info; +} + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { + static ggml_backend_reg reg; + + static bool initialized = false; + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context; + + // GGML_LOG_DEBUG("ggml_openvino_info().device_count = %d \n", ggml_openvino_info().device_count); + for (int i = 0; i < ggml_openvino_info().device_count; i++) { + ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context; + dev_ctx->device = i; + dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i); + + // ggml_openvino_set_device(i); + 
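The device description assigned on the next line reuses the OpenVINO library version string for every device. A fuller implementation could query the device itself through `ov::Core`; a minimal sketch, assuming `i` indexes the list returned by `get_available_devices()` (the local `core` instance here is illustrative):

```cpp
// Sketch only: derive a per-device description via the OpenVINO properties API.
ov::Core core;
const std::vector<std::string> devices = core.get_available_devices();  // e.g. {"CPU", "GPU"}
// ov::device::full_name is a standard read-only property, e.g. "Intel(R) Iris(R) Xe Graphics".
dev_ctx->description = core.get_property(devices[i], ov::device::full_name);
```

Since get_openvino_device_count() currently reports a single device, the version string is a workable placeholder; per-device names only start to matter once the device list is enumerated for real.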
dev_ctx->description = ov::get_openvino_version().description;
+
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .interface = */ ggml_backend_openvino_device_interface,
+                    /* .reg = */ &reg,
+                    /* .context = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
+                                    /* .iface = */ ggml_backend_openvino_reg_interface,
+                                    /* .context = */ ctx };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
new file mode 100644
index 0000000000000..a3387ba3947a2
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -0,0 +1,74 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+
+class GgmlDecoder : public DecoderBase {
+public:
+    virtual ov::Any get_attribute(const std::string& name) const = 0;
+
+    virtual PartialShape get_input_shape(const std::string& name) const = 0;
+
+    virtual std::vector get_input_stride(const std::string& name) const = 0;
+
+    virtual element::Type get_input_type(const std::string& name) const = 0;
+
+    virtual size_t get_input_size() const = 0;
+
+    virtual void get_input_node(size_t input_port_idx,
+                                std::string& producer_name,
+                                std::string& producer_output_port_name,
+                                size_t& producer_output_port_index) const = 0;
+
+    virtual std::string& get_input_name(size_t index) const = 0;
+
+    virtual std::vector get_input_names() const = 0;
+
+    virtual PartialShape get_output_shape(const std::string& name) const = 0;
+
+    virtual std::vector get_output_stride(const std::string& name) const = 0;
+
+    virtual element::Type get_output_type(const std::string& name) const = 0;
+
+    virtual int32_t* get_input_op_params(const std::string& name) const = 0;
+
+    virtual int32_t* get_output_op_params(const std::string& name) const = 0;
+
+    virtual std::string& get_output_name(size_t index) const = 0;
+
+    virtual std::vector get_output_names() const = 0;
+
+    virtual const std::string& get_op_type() const = 0;
+
+    virtual const std::string& get_op_name() const = 0;
+
+    virtual void visit_subgraph(std::function)> node_visitor) const = 0;
+
+    virtual int get_op_case() const = 0;
+
+    virtual const std::map>& get_model_inputs() const = 0;
+    virtual const std::map>& get_model_extra_inputs() const = 0;
+    virtual const std::map>& get_model_weights() const = 0;
+    virtual const std::vector& get_model_output_names() const = 0;
+
+    virtual int get_num_heads() const = 0;
+    virtual int get_num_heads_kv() const = 0;
+    virtual int get_head_size() const = 0;
+    virtual int32_t* get_rope_params() const = 0;
+    virtual std::map get_kv_param_res_names() const = 0;
+
+    virtual bool is_static() const = 0;
+    virtual bool is_first_token() const = 0;
+    virtual int get_context_size() const = 0;
+};
+
+} // namespace ggml
+} // namespace frontend
+} // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp
new file mode 100644
index 0000000000000..dbdae1ed45ca1
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/frontend.cpp
@@ -0,0 +1,27 @@
+#include "frontend.hpp"
+
+#include "input_model.hpp"
+#include "op_table.hpp"
+#include "translate_session.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+
+FrontEnd::FrontEnd() {}
+
+std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model, bool naive) {
+    auto ggml_model = std::dynamic_pointer_cast(model);
+    FRONT_END_GENERAL_CHECK(ggml_model,
"Invalid input model"); + std::shared_ptr converted_model; + const auto& supported_ops = get_supported_ops(); + { + TranslateSession translate_session(model, supported_ops, naive); + converted_model = translate_session.get_converted_model(); + } + return converted_model; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/frontend.hpp b/ggml/src/ggml-openvino/openvino/frontend.hpp new file mode 100644 index 0000000000000..f1c6f0c3e3ce3 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/frontend.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace frontend { +namespace ggml { + +class FrontEnd { +public: + using Ptr = std::shared_ptr; + FrontEnd(); + + static std::shared_ptr convert(const InputModel::Ptr& model, bool naive = false); +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/input_model.cpp b/ggml/src/ggml-openvino/openvino/input_model.cpp new file mode 100644 index 0000000000000..5fb16ea2db87d --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/input_model.cpp @@ -0,0 +1,17 @@ +#include "input_model.hpp" + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +InputModel::InputModel(const std::shared_ptr& gdecoder) : m_decoder(gdecoder) {} + +const std::shared_ptr& InputModel::get_model_decoder() const { + return m_decoder; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/input_model.hpp b/ggml/src/ggml-openvino/openvino/input_model.hpp new file mode 100644 index 0000000000000..9bc9a28e9aeca --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/input_model.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class FrontEnd; +class GgmlDecoder; +using ov::frontend::ggml::GgmlDecoder; + +class InputModel : public ov::frontend::InputModel { + friend class ::ov::frontend::ggml::FrontEnd; + +public: + explicit InputModel(const std::shared_ptr& gdecoder); + + const std::shared_ptr& get_model_decoder() const; + +private: + std::shared_ptr m_decoder; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp new file mode 100644 index 0000000000000..cc1b5c03329c9 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include +#include + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class TranslateSession; + +typedef std::map> TensorMap; + +class NodeContext : public frontend::NodeContext { +public: + NodeContext(const std::shared_ptr& decoder, + std::shared_ptr& tensor_map, + TranslateSession* translate_session = nullptr) + : ov::frontend::NodeContext(decoder->get_op_type()), + m_decoder(decoder), + m_tensor_map(tensor_map), + m_translate_session(translate_session) { + m_input_names = decoder->get_input_names(); + m_output_names = decoder->get_output_names(); + } + + TranslateSession* get_translate_session() const { + return m_translate_session; + } + + size_t get_input_size() const override { + return m_decoder->get_input_size(); + } + + ov::element::Type get_input_type(size_t index) const { + return m_decoder->get_input_type(m_input_names[index]); + } + + 
PartialShape get_input_shape(size_t index) const { + return m_decoder->get_input_shape(m_input_names[index]); + } + + std::vector get_input_stride(size_t index) const { + return m_decoder->get_input_stride(m_input_names[index]); + } + + std::string get_output_name() const { return m_output_names[0]; } + + PartialShape get_output_shape(size_t index) const { + return m_decoder->get_output_shape(m_output_names[index]); + } + + std::vector get_output_stride(size_t index) const { + return m_decoder->get_output_stride(m_output_names[index]); + } + + int32_t* get_input_op_params(size_t index) const { + return m_decoder->get_input_op_params(m_input_names[index]); + } + + int32_t* get_output_op_params(size_t index) const { + return m_decoder->get_output_op_params(m_output_names[index]); + } + + ov::element::Type get_output_type(size_t index) const { + return m_decoder->get_output_type(m_output_names[index]); + } + + Output get_input(int idx) const override { + return m_tensor_map->at(m_decoder->get_input_name(idx)); + } + + Output get_input(const std::string& name) const override { + if (m_tensor_map->find(name) == m_tensor_map->end()) { + throw std::runtime_error("'" + name + "' not found in tensor map."); + } + return m_tensor_map->at(name); + } + + bool has_input(const std::string& name) const { + return m_tensor_map->find(name) != m_tensor_map->end(); + } + + const std::string& get_name() const override { + return m_decoder->get_op_name(); + } + + ov::Any get_attribute_as_any(const std::string& name) const override { + return m_decoder->get_attribute(name); + } + + int get_op_case() const { + return m_decoder->get_op_case(); + } + bool is_static() const { + return m_decoder->is_static(); + } + bool is_first_token() const { + return m_decoder->is_first_token(); + } + + int get_num_heads() const { return m_decoder->get_num_heads(); } + + int get_num_heads_kv() const { return m_decoder->get_num_heads_kv(); } + + int get_head_size() const { return m_decoder->get_head_size(); } + + int get_context_size() const { return m_decoder->get_context_size(); } + + private: + std::shared_ptr m_decoder; + std::shared_ptr& m_tensor_map; + TranslateSession* m_translate_session; + std::vector m_input_names; + std::vector m_output_names; +}; + +using CreatorFunction = std::function; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp new file mode 100644 index 0000000000000..f83c0e62df77b --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -0,0 +1,46 @@ + +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cont(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + + auto src_shape = context.get_input_shape(0).to_shape(); + auto dst_shape = context.get_output_shape(0).to_shape(); + ov::Output res; + + if (op_case == 1) { + // The input comes from a PERMUTE + dst_shape[1] = -1; + res = std::make_shared( + context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), + false); + } else { + // The input comes from a VIEW + res = process_view_input(context, 0); + } + + return rename_outputs_with_suffix({res}, 
context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp new file mode 100644 index 0000000000000..c97bbbf5a3657 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_get_rows(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + + Output res; + auto data = context.get_input(0); + auto indices = context.get_input(1); + + if (op_case == 2) { + // The input comes from a VIEW + indices = process_view_input(context, 1); + } + + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (indices.get_partial_shape()[1].get_length() == 1) { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + res = std::make_shared(data, indices, axis); + } else { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = std::make_shared(data, indices, axis, 1); + } + + if (res.get_element_type() != context.get_output_type(0)) { + res = std::make_shared(res, context.get_output_type(0)); + } + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp new file mode 100644 index 0000000000000..138ef650901fd --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_glu_swiglu(const NodeContext& context) { + num_inputs_check(context, 1, 2); + + ov::Output src0; + ov::Output src1; + if (context.get_input_size() == 2) { + src0 = context.get_input(0); + src1 = context.get_input(1); + } else { + auto combined = context.get_input(0); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2}); + auto split = std::make_shared(combined, split_axis, 2); + src0 = split->output(0); + src1 = split->output(1); + } + auto sigmoid = std::make_shared(src0); + auto silu = std::make_shared(src0, sigmoid); + auto res = std::make_shared(silu, src1); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp new file mode 100644 index 0000000000000..9148a27517b92 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { 
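The SwiGLU translation in glu_swiglu.cpp above computes silu(src0) * src1, splitting a fused input into two halves along axis 2 when only a single input is present. As a reference for the intended math, here is a standalone scalar sketch (not part of the patch; the function name is illustrative):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for SwiGLU: out[i] = silu(gate[i]) * up[i], where
// silu(x) = x * sigmoid(x). `gate` corresponds to src0 (the first half of a
// fused tensor) and `up` to src1 (the second half).
std::vector<float> swiglu_ref(const std::vector<float>& gate, const std::vector<float>& up) {
    std::vector<float> out(gate.size());
    for (std::size_t i = 0; i < gate.size(); ++i) {
        const float sig = 1.0f / (1.0f + std::exp(-gate[i]));
        out[i] = gate[i] * sig * up[i];
    }
    return out;
}
```

When the two halves arrive as separate inputs (get_input_size() == 2), the translation skips the Split and uses src0/src1 directly.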
+namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_mulmat(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + ov::Output res; + ov::Output B = context.get_input(0); + ov::Output A = context.get_input(1); + + bool convert_out_type = false; + if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { + B = std::make_shared(context.get_input(0), context.get_input_type(1)); + } else if (context.get_input_type(0) != context.get_input_type(1)) { + A = std::make_shared(context.get_input(1), context.get_input_type(0)); + convert_out_type = true; + } + + auto B_shape = context.get_input_shape(0).to_shape(); + auto A_shape = context.get_input_shape(1).to_shape(); + int64_t A_batch = A_shape[0]; + int64_t B_batch = B_shape[0]; + auto A_batch_larger = A_batch > B_batch; + Output Z = A_batch_larger ? B : A; + int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch; + if (factor > 1) { + auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{A_batch}); + auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); + auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); + + auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); + + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + + Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; + Output batch_large = A_batch_larger ? A_batch_node : B_batch_node; + auto broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0); + auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); + + auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dim}, 0); + Z = std::make_shared(Z_broadcasted, new_Z_shape, false); + } + if (A_batch_larger) { + B = Z; + } else { + A = Z; + } + + if (convert_out_type) { + auto result_lp = std::make_shared(A, B, false, true); + res = std::make_shared(result_lp, context.get_output_type(0)); + } else { + res = std::make_shared(A, B, false, true); + } + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp new file mode 100644 index 0000000000000..978b5377fb514 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_permute(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); + ov::Output res; + + if (op_case == 1) { + auto perm = argsort_descend(context.get_output_stride(0)); + res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + } else { + auto src = context.get_input(0); + auto attention_size = context.get_input("attention_size"); + if (context.is_static()) { + attention_size = 
ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + } + + auto src_shape_ = context.get_input_shape(0).to_shape(); + std::vector src_shape(src_shape_.begin(), src_shape_.end()); + + std::shared_ptr src_reshaped; + if (op_case == 2) { + src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); + } else { + src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{src_shape[1], src_shape[0], -1}), + false); + } + + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + std::shared_ptr slice_axis; + if (op_case == 2) { + slice_axis = zero; + } else { + slice_axis = two; + } + auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, slice_axis); + + if (op_case == 2) { + res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = src_slice; + } + } + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp new file mode 100644 index 0000000000000..4ef3833c90252 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_reshape(const NodeContext& context) { + num_inputs_check(context, 1, 1); + if (context.get_input_shape(0) == context.get_output_shape(0)) { + return {context.get_input(0)}; + } + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported RESHAPE case"); + + auto output_shape = context.get_output_shape(0).to_shape(); + std::shared_ptr new_shape_node; + if (op_case == 1) { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + } else if (op_case == 2) { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + } else { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); + } + auto res = std::make_shared(context.get_input(0), new_shape_node, false); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp new file mode 100644 index 0000000000000..211692a3c706c --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -0,0 +1,46 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_rms_norm(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto input_node = 
context.get_input(0); + auto square = std::make_shared(input_node, input_node); + + auto mean = + std::make_shared(square, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), + true); + + float eps; + memcpy(&eps, context.get_output_op_params(0), sizeof(float)); + + auto rms = std::make_shared( + std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}))); + + auto reciprocal = + std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms); + + auto res = std::make_shared(input_node, reciprocal); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp new file mode 100644 index 0000000000000..7951a1e012c54 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_rope(const NodeContext& context) { + num_inputs_check(context, 2, 3); + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + + ov::Output res; + + auto data_node = context.get_input(0).get_node_shared_ptr(); + auto output_shape = context.get_output_shape(0).to_shape(); + int32_t* op_params = context.get_output_op_params(0); + + Output cos_theta_node; + Output sin_theta_node; + if (context.has_input("rope_cos")) { + cos_theta_node = context.get_input("rope_cos"); + sin_theta_node = context.get_input("rope_sin"); + } else { + auto inp_pos = context.get_input(1).get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + if (context.get_input_size() == 3) { + rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); + } + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight); + sin_theta_node = sin_cos.first; + cos_theta_node = sin_cos.second; + } + + if (op_case == 2) { + // The input comes from a VIEW + int slice_len = output_shape[1] * output_shape[2]; + data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); + data_node = std::make_shared(data_node, data_shape, false); + } + + const int mode = op_params[2]; + constexpr int ROPE_TYPE_NEOX = 2; + constexpr int ROPE_TYPE_NORM = 0; + + if (mode == ROPE_TYPE_NORM) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); + auto even_slice = std::make_shared(data_node, zero, end, two, two); + auto odd_slice = std::make_shared(data_node, one, end, two, two); + + Output first_half = + std::make_shared(std::make_shared(even_slice, cos_theta_node), + std::make_shared(odd_slice, sin_theta_node)); + Output second_half = + std::make_shared(std::make_shared(even_slice, sin_theta_node), + std::make_shared(odd_slice, cos_theta_node)); + + first_half = 
std::make_shared(first_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + second_half = std::make_shared(second_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); + res = std::make_shared(stack, std::make_shared(data_node), false); + } else if (mode == ROPE_TYPE_NEOX) { + auto data_split = std::make_shared( + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); + Output slice_data_node_0 = data_split->outputs()[0]; + Output slice_data_node_1 = data_split->outputs()[1]; + + auto first_half_node = std::make_shared( + std::make_shared(slice_data_node_0, cos_theta_node), + std::make_shared(slice_data_node_1, sin_theta_node)); + + auto second_half_node = std::make_shared( + std::make_shared(slice_data_node_0, sin_theta_node), + std::make_shared(slice_data_node_1, cos_theta_node)); + + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); + } + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp new file mode 100644 index 0000000000000..783440ebd967e --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -0,0 +1,29 @@ +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_scale(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + float scale; + memcpy(&scale, context.get_output_op_params(0), sizeof(float)); + auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + + auto res = std::make_shared(context.get_input(0), scale_node); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp new file mode 100644 index 0000000000000..758454cd9d72a --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_set_rows(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto data = context.get_input(0); + data = std::make_shared(data, context.get_output_type(0)); + + auto dst_shape = context.get_output_shape(0).to_shape(); + FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); + + if (context.is_static() && context.is_first_token()) { + Output res; + if (context.get_op_case() == 2) { + res = std::make_shared( + data, + ov::op::v0::Constant::create( + ov::element::i64, + {3}, + {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}), + false); + res = std::make_shared( + res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0})); + } else { + res = data; + } + return rename_outputs_with_suffix({res}, context.get_name()); + } + + auto indices = context.get_input(1); + auto dst = 
context.get_input(context.get_output_name()); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + auto dst_reshaped = std::make_shared( + dst, + ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + false); + auto indices_reshaped = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto data_reshaped = std::make_shared(data, zero); + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); + auto res = std::make_shared(updated, std::make_shared(dst), false); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp new file mode 100644 index 0000000000000..e072658ecb156 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_soft_max(const NodeContext& context) { + num_inputs_check(context, 1, 2); + + auto input_node = context.get_input(0).get_node_shared_ptr(); + ov::Output res; + + float scale = 1.0f; + float max_bias = 0.0f; + auto* op_params = context.get_output_op_params(0); + memcpy(&scale, (float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); + auto src0_shape = context.get_input_shape(0).get_shape(); + const uint32_t h = src0_shape[2]; + const uint32_t n_head = src0_shape[0]; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = + (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + + auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + auto scaled_input = std::make_shared(input_node, scale_node); + + if (context.get_input_size() < 2) { + res = std::make_shared(scaled_input, 2); + return rename_outputs_with_suffix({res}, context.get_name()); + } + + auto mask_node = context.get_input(1); + + auto token_len = context.has_input("token_len") ? 
context.get_input("token_len") : get_dimensions(input_node, {1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + std::shared_ptr mask_node_sliced = + std::make_shared(mask_node, zero, token_len, one, one); + if (mask_node_sliced->get_element_type() != context.get_output_type(0)) { + mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); + } + + Output slope_mask; + if (slope != 1.0f) { + auto slope_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{slope}); + slope_mask = std::make_shared(mask_node_sliced, slope_node); + throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use."); + } + slope_mask = mask_node_sliced; + + auto input_slope_mask_node = std::make_shared(scaled_input, slope_mask); + + res = std::make_shared(input_slope_mask_node, 2); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp new file mode 100644 index 0000000000000..b35f1fb8610ea --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -0,0 +1,24 @@ +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_transpose(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto perm = argsort_descend(context.get_output_stride(0)); + auto res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp new file mode 100644 index 0000000000000..2b27c0be1227c --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -0,0 +1,27 @@ +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_silu(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto input = context.get_input(0); + auto sigmoid = std::make_shared(input); + auto res = std::make_shared(input, sigmoid); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp new file mode 100644 index 0000000000000..58143e667cc6f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -0,0 +1,18 @@ +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_view(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + return {context.get_input(0)}; +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp new file mode 100644 index 0000000000000..ce4b01c3b5163 --- /dev/null +++ 
b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -0,0 +1,43 @@ +#include "op_table.hpp" + +#include +#include +#include +#include +#include +#include + +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +std::unordered_map get_supported_ops() { + using namespace ov::op; + return { + {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + {"GGML_OP_CONT", op::translate_cont }, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, + {"GGML_OP_GET_ROWS", op::translate_get_rows }, + {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL_MAT", op::translate_mulmat }, + {"GGML_OP_PERMUTE", op::translate_permute }, + {"GGML_OP_RESHAPE", op::translate_reshape }, + {"GGML_OP_RMS_NORM", op::translate_rms_norm }, + {"GGML_OP_ROPE", op::translate_rope }, + {"GGML_OP_SCALE", op::translate_scale }, + {"GGML_OP_SOFT_MAX", op::translate_soft_max }, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, + {"GGML_OP_VIEW", op::translate_view }, + {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_OP_SET_ROWS", op::translate_set_rows }, + }; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp new file mode 100644 index 0000000000000..332930c3ac115 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +namespace op { + +#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context) + +GGML_OP_CONVERTER(translate_add); +GGML_OP_CONVERTER(translate_cont); +GGML_OP_CONVERTER(translate_get_rows); +GGML_OP_CONVERTER(translate_mul); +GGML_OP_CONVERTER(translate_mulmat); +GGML_OP_CONVERTER(translate_permute); +GGML_OP_CONVERTER(translate_reshape); +GGML_OP_CONVERTER(translate_rms_norm); +GGML_OP_CONVERTER(translate_rope); +GGML_OP_CONVERTER(translate_scale); +GGML_OP_CONVERTER(translate_unary_silu); +GGML_OP_CONVERTER(translate_soft_max); +GGML_OP_CONVERTER(translate_transpose); +GGML_OP_CONVERTER(translate_view); +GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_set_rows); + +} // namespace op + +std::unordered_map get_supported_ops(); + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp new file mode 100644 index 0000000000000..1b7ac602716ad --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -0,0 +1,61 @@ +#include "fuse_to_sdpa.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +FuseToSDPA::FuseToSDPA() { + const auto m_k = ov::pass::pattern::any_input(); + const auto m_q = ov::pass::pattern::any_input(); + const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); + const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); + const auto m_scale = ov::pass::pattern::any_input(); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); + const auto m_mask = ov::pass::pattern::any_input(); + const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); + const auto 
m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); + const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); + const auto m_v = ov::pass::pattern::any_input(); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); + + const auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto k = pattern_to_output[m_k]; + auto q = pattern_to_output[m_q]; + auto v = pattern_to_output[m_v]; + auto mask = pattern_to_output[m_mask]; + auto scale = pattern_to_output[m_scale]; + + auto v_trans = + register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); + auto mask_f16 = register_new_node(mask, ov::element::f16); + auto scale_f16 = register_new_node(scale, ov::element::f16); + auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + + ov::replace_node(m.get_match_root(), sdpa); + ov::copy_runtime_info(m.get_matched_nodes(), sdpa); + + return true; + }; + register_matcher(std::make_shared(m_qkv, "ov::frontend::ggml::pass::FuseToSDPA"), + callback); +} + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp new file mode 100644 index 0000000000000..8b5164d232932 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp @@ -0,0 +1,17 @@ +#include "openvino/pass/matcher_pass.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +class FuseToSDPA : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::FuseToSDPA") + FuseToSDPA(); +}; + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp new file mode 100644 index 0000000000000..b40eaf4205703 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "mark_decompression_convert_constant_folding.hpp" +#include "openvino/pass/matcher_pass.hpp" +#include "openvino/core/visibility.hpp" + +#ifdef OPENVINO_STATIC_LIBRARY +# define TRANSFORMATIONS_API +#else +# ifdef IMPLEMENT_OPENVINO_API +# define TRANSFORMATIONS_API OPENVINO_CORE_EXPORTS +# else +# define TRANSFORMATIONS_API OPENVINO_CORE_IMPORTS +# endif // IMPLEMENT_OPENVINO_API +#endif // OPENVINO_STATIC_LIBRARY + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API MarkCompressedFloatConstants; + +} // namespace pass +} // namespace ov + +class ov::pass::MarkCompressedFloatConstants : public MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants") + MarkCompressedFloatConstants(); +}; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp new file mode 100644 index 0000000000000..a09247347f3f1 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -0,0 +1,219 @@ +#include "translate_session.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-openvino/openvino/node_context.hpp" +#include 
"ggml-openvino/openvino/utils.hpp" +#include "input_model.hpp" +#include "pass/fuse_to_sdpa.hpp" +#include "pass/mark_decompression_convert_constant_folding.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +using namespace ov::op; + +namespace { +ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( + const std::shared_ptr& model, const std::map& kv_param_res_names) { + ov::pass::MakeStateful::ParamResPairs pairs; + const auto& params = model->get_parameters(); + const auto& results = model->get_results(); + + for (const auto& param_res : kv_param_res_names) { + const auto& param_name = param_res.first; + const auto& res_name = param_res.second; + + auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr& node) { + return node->get_friendly_name() == param_name; + }); + + OPENVINO_ASSERT(param_it != params.end(), "The tensor name ", param_name, + " is not associated with any of " + "Parameters in the network."); + + auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr& node) { + return node->get_friendly_name() == res_name; + }); + + OPENVINO_ASSERT(res_it != results.end(), "The tensor name ", res_name, + " is not associated with any of " + "Results in the network."); + + std::shared_ptr param = *param_it; + std::shared_ptr res = *res_it; + pairs.emplace_back(param, res); + } + return pairs; +} + +void add_token_len(TensorMap& tensor_map) { + auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr(); + auto token_len = get_dimensions(inp_tokens, {2}); + token_len->set_friendly_name("token_len"); + tensor_map.insert({"token_len", token_len->output(0)}); +} + +void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + int32_t* rope_params = ggml_model_decoder.get_rope_params(); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { + rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); + } + + auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight); + auto sin_theta = sin_cos.first; + auto cos_theta = sin_cos.second; + + cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos"); + sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta}); + tensor_map.insert({"rope_sin", sin_theta}); +} + +// Create common patterns +void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + add_token_len(tensor_map); + add_rope_sin_cos(tensor_map, ggml_model_decoder); +} + +} // namespace + +TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, + const std::unordered_map& translator_map, + bool naive) : + m_input_model(input_model), + m_translator_map(translator_map), + m_ov_model(nullptr), + m_naive(naive) {} + +std::shared_ptr TranslateSession::get_converted_model() { + if (m_ov_model) { + return m_ov_model; + } + m_ov_model = translate_graph(m_input_model); + return m_ov_model; +} + +std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { + ov::ParameterVector params; + ov::ResultVector results; + auto tensor_map = std::make_shared(); + std::shared_ptr resulting_model; + + const auto& ggml_model = std::dynamic_pointer_cast(input_model); + std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); + + for (const auto& it : ggml_model_decoder->get_model_inputs()) { + 
params.push_back(std::dynamic_pointer_cast(it.second)); + (*tensor_map)[it.first] = it.second; + } + + for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) { + params.push_back(std::dynamic_pointer_cast(it.second)); + (*tensor_map)[it.first] = it.second; + } + + for (const auto& it : ggml_model_decoder->get_model_weights()) { + (*tensor_map)[it.first] = it.second; + } + + auto node_visitor = [&](std::shared_ptr node) { + auto operation_type = node->get_op_type(); + if (operation_type == "GGML_OP_NONE") { + return; + } + + ov::OutputVector converted_outputs; + auto it = m_translator_map.find(operation_type); + FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), + "Translation for operation type ", + operation_type, + " is not implemented."); + NodeContext node_context(node, tensor_map, this); + converted_outputs = it->second(node_context); + + const auto& node_output_names = node->get_output_names(); + FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), + "Number of ", + operation_type, + " outputs greater than number of converted outputs, which are ", + node_output_names.size(), + " and ", + converted_outputs.size(), + " respectively."); + + for (size_t i = 0; i < node_output_names.size(); ++i) { + auto output_name = node_output_names[i]; + if (i < converted_outputs.size() && converted_outputs[i].get_node_shared_ptr() != nullptr) { + (*tensor_map)[output_name] = converted_outputs[i]; + } + } + }; + + if (!m_naive) { + preprocess(*tensor_map, *ggml_model_decoder); + } + ggml_model_decoder->visit_subgraph(node_visitor); + + for (const auto& name : ggml_model_decoder->get_model_output_names()) { + FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(), + "Output name not found in tensor map: ", + name); + auto result = std::make_shared(tensor_map->at(name)); + result->set_friendly_name(name); + results.push_back(result); + } + + resulting_model = std::make_shared(results, params); + + apply_transformations(resulting_model); + return resulting_model; +} + +std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr model) { + auto ggml_model_decoder = std::dynamic_pointer_cast(m_input_model)->get_model_decoder(); + { + ov::pass::Manager manager; + manager.set_per_pass_validation(true); + manager.register_pass(); + manager.register_pass(); + + if (!ggml_model_decoder->is_static()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } + + manager.register_pass(); + manager.run_passes(model); + } + return model; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp new file mode 100644 index 0000000000000..7072d4a9e8b1a --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "input_model.hpp" +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class TranslateSession { +public: + TranslateSession(const frontend::InputModel::Ptr& input_model, + const std::unordered_map& translator_map, bool naive = false); + + std::shared_ptr get_converted_model(); + std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); + +private: + std::shared_ptr apply_transformations(std::shared_ptr model); + const 
frontend::InputModel::Ptr m_input_model; + const std::unordered_map& m_translator_map; + std::shared_ptr m_ov_model; + bool m_naive; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp new file mode 100644 index 0000000000000..9634900753224 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -0,0 +1,202 @@ +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { + +std::string getCurrentTime() { + std::time_t now = std::time(nullptr); + char buf[100]; + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + return buf; +} + +void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { + auto input_size = context.get_input_size(); + FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected"); +} + +int non_cont_dim(std::vector ne, std::vector nb) { + int dim = nb.size() - 1; + size_t bytes = nb[dim]; + for (int i = dim; i > 0; i--) { + bytes *= ne[i]; + if (bytes != nb[i - 1]) { + return i; + } + } + return 0; +} + +std::shared_ptr get_dimensions(const std::shared_ptr& shape, + const std::vector& dims) { + using namespace ov::op; + const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims); + return std::make_shared(shape, dims_const, zero); +} + +std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims) { + return get_dimensions(std::make_shared(node), dims); +} + +OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) { + for (const auto& output : outputs) { + auto node = output.get_node_shared_ptr(); + std::string name = node->get_friendly_name(); + name += "_"; + name += suffix; + node->set_friendly_name(name); + } + return outputs; +} + +namespace { +ov::Output rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) { + int half_n_dims = n_dims / 2; + std::vector dim_ids_vec(half_n_dims); + std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0); + auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec); + auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]}); + auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]}); + auto denom = + std::make_shared(std::make_shared(corr_high, corr_low), + ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f})); + auto ramp_y = + std::make_shared(std::make_shared(dim_ids, corr_low), denom); + auto ramp_clamped = std::make_shared(ramp_y, 0.0f, 1.0f); + auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor}); + auto ramp_mix = std::make_shared(ramp_clamped, ext_factor_node); + return ramp_mix; +} + +float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, + int n_ctx_orig, + 
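+                               // freq_base/beta_fast/beta_slow come from the ggml rope op params;
+                               // dims[2] receives the [start, end] rotary dims to correct for YaRN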
float freq_base, + float beta_fast, + float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0.0f, start); + dims[1] = std::min(static_cast(n_dims - 1), end); +} +} // namespace + +std::pair, ov::Output> make_sin_cos(int32_t* rope_params, + std::shared_ptr inp_pos, + std::shared_ptr rope_freqs_weight) { + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + const int n_dims = rope_params[1]; + const int n_ctx_orig = rope_params[4]; + memcpy(&freq_base, rope_params + 5, sizeof(float)); + memcpy(&freq_scale, rope_params + 6, sizeof(float)); + memcpy(&ext_factor, rope_params + 7, sizeof(float)); + memcpy(&attn_factor, rope_params + 8, sizeof(float)); + memcpy(&beta_fast, rope_params + 9, sizeof(float)); + memcpy(&beta_slow, rope_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + std::vector factor(n_dims / 2); + factor[0] = freq_scale; + for (size_t i = 1; i < factor.size(); i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + if (rope_freqs_weight) { + freq_factors = std::make_shared(freq_factors, rope_freqs_weight); + } + + auto theta_extrap = std::make_shared(freq_factors, inp_pos); + auto theta_interp = std::make_shared( + theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); + + Output theta; + float mscale = attn_factor; + if (ext_factor == 0.0f) { + theta = theta_interp; + } else { + auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); + auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + auto one_minus_ramp = std::make_shared(one, ramp_mix); + + theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), + std::make_shared(theta_extrap, ramp_mix)); + mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + } + + Output cos_theta = std::make_shared(theta); + Output sin_theta = std::make_shared(theta); + + auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + + cos_theta = std::make_shared(cos_theta, mscale_node); + sin_theta = std::make_shared(sin_theta, mscale_node); + return std::make_pair(sin_theta, cos_theta); +} + +ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len) { + // Only works for VIEW operations that slice at the lowest dimension + // If the VIEW also reshape the result, `slice_len` should be provided + auto input = context.get_input(input_index); + int32_t* op_params = context.get_input_op_params(input_index); + auto src1_stride = context.get_input_stride(input_index); + + int64_t split_addr = op_params[0] / src1_stride[2]; + if (slice_len == 0) { + slice_len = context.get_input_shape(input_index)[2].get_length(); + } + int64_t slice_end = split_addr + slice_len; + + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, 
{slice_end}); + auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto sliced = std::make_shared(input, begin, end, stride, axes); + return sliced; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp new file mode 100644 index 0000000000000..6c6d2ae8d4f23 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +std::string getCurrentTime(); + +void dump_ov_model(std::shared_ptr model); + +void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs); + +int non_cont_dim(std::vector ne, std::vector nb); + +template +std::vector argsort_descend(const std::vector& v) { + std::vector idx(v.size()); + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) { + return v[i1] > v[i2]; + }); + return idx; +} + +template +std::vector sorted_descend(std::vector v) { + std::sort(v.begin(), v.end(), [](T a, T b) { + return a > b; + }); + return v; +} + +template +bool is_permuted(const std::vector& strides) { + for (size_t i = 0; i < strides.size() - 1; ++i) { + if (strides[i] < strides[i + 1]) { + return true; + } + } + return false; +} + +template +std::vector permute(const std::vector& x, const std::vector& perm) { + std::vector result; + result.reserve(perm.size()); + for (int i : perm) { + result.push_back(x[i]); + } + return result; +} + +std::shared_ptr get_dimensions(const std::shared_ptr& shape, + const std::vector& dims); +std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims); + +OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix); + +std::pair, ov::Output> make_sin_cos(int32_t* rope_params, + std::shared_ptr inp_pos, + std::shared_ptr rope_freqs_weight = nullptr); + +ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); + +namespace op { +template +OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { + num_inputs_check(context, 2, 2); + auto res = std::make_shared(context.get_input(0), context.get_input(1)); + return rename_outputs_with_suffix({res}, context.get_name()); +} +} // namespace op + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp new file mode 100644 index 0000000000000..522e922db8dee --- /dev/null +++ b/ggml/src/ggml-openvino/utils.cpp @@ -0,0 +1,420 @@ +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-openvino/ggml-decoder.h" +#include "ggml.h" +#include "openvino/frontend.hpp" +#include "openvino/input_model.hpp" + +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { + const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + auto* input_data = ggml_tensor->data; + ov::Shape input_shape; + if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); + } else if 
(ggml_tensor->op == GGML_OP_VIEW) {
+        // This case is added to make test-backend-ops work
+        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
+    } else {
+        input_shape = ggml_decoder->get_input_shape(name).to_shape();
+    }
+    auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    return input_tensor;
+}
+
+std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
+    std::map<std::string, void*> output_tensors;
+    auto output_names = ggml_decoder->get_model_output_names();
+    for (size_t inp = 0; inp < output_names.size(); ++inp) {
+        auto name = output_names[inp];
+        const auto* tensor = ggml_decoder->get_output_ggml_tensor(name);
+        auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
+        output_tensors[name] = output_data;
+    }
+    return output_tensors;
+}
+
+static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
+    auto fem = ov::frontend::FrontEndManager();
+    auto front_end = fem.load_by_framework("ggml");
+    return front_end;
+}
+
+enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
+    static ov::Core core;
+
+    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
+    if (device.empty()) {
+        const std::vector<std::string> preferred_device = { "GPU", "CPU", "NPU" };
+        const auto available_devices = core.get_available_devices();
+        for (const auto& dev : preferred_device) {
+            if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
+                device = dev;
+                break;
+            }
+        }
+    }
+
+    bool is_static = device == "NPU";
+    ov::AnyMap config;
+    if (device == "NPU") {
+        config = get_npu_config();
+    }
+
+    if (is_naive(cgraph)) {
+        return naive_compute(cgraph, core, device, config);
+    }
+
+    auto start_time = ggml_time_us();
+
+    auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
+    if (cache_dir && !is_static) {
+        core.set_property(ov::cache_dir(cache_dir));
+    }
+
+    static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
+    static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
+    static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
+    // For NPU, store the compiled kvcache model, since we cannot create two infer_requests
+    static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> compiled_model_cache;
+
+    std::shared_ptr<GgmlOvDecoder> ggml_decoder;
+    ov::InferRequest infer_request;
+
+    int64_t decoder_end_time;
+    int64_t conversion_end_time;
+    int64_t compile_end_time;
+
+    auto it = infer_request_cache.find(cgraph);
+    if (it != infer_request_cache.end()) {
+        std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
+        ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
+        decoder_end_time = ggml_time_us();
+
+        // For NPU, the first time the kvcache model is called, pop the compiled kvcache model from the cache
+        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
+            infer_request_cache[cgraph] =
+                std::make_shared<ov::InferRequest>(compiled_model_cache[cgraph].create_infer_request());
+            compiled_model_cache.erase(cgraph);
+        }
+        infer_request = *infer_request_cache[cgraph];
+
+        conversion_end_time = ggml_time_us();
+        compile_end_time = conversion_end_time;
+    } else {
+        std::shared_ptr<ov::Model> model;
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+
+        if (is_static) {
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
+            auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
+            decoder_end_time = ggml_time_us();
+
+            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
+            auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
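+            // Static (NPU) path: two fixed-shape models are converted from the same cgraph,
+            // a prefill model (first token, inputs padded to the full context) and a
+            // single-token kvcache model; see get_ov_input_tensor for the matching padding.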
+            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            ggml_decoder->clear_model_weights();
+            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+            ggml_decoder_kvcache->clear_model_weights();
+            conversion_end_time = ggml_time_us();
+
+            auto compiled_model = core.compile_model(model, device, config);
+            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
+            compiled_model_cache[cgraph] = compiled_model_kvcache;
+            compile_end_time = ggml_time_us();
+
+            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+            infer_request = *infer_request_cache[cgraph];
+
+            if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                char timestamped_filename[64];
+                auto timestamp = (long long) ggml_time_us();
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
+                ov::serialize(model, timestamped_filename);
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
+                ov::serialize(model_kvcache, timestamped_filename);
+            }
+        } else {
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
+            decoder_end_time = ggml_time_us();
+
+            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
+            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            ggml_decoder->clear_model_weights();
+            conversion_end_time = ggml_time_us();
+
+            auto compiled_model = core.compile_model(model, device, config);
+            compile_end_time = ggml_time_us();
+            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+            infer_request = *infer_request_cache[cgraph];
+
+            if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                char timestamped_filename[64];
+                auto timestamp = (long long) ggml_time_us();
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+                ov::serialize(model, timestamped_filename);
+            }
+        }
+
+        std::vector<std::string> ov_input_names;
+        std::vector<std::string> ov_output_names;
+        for (const auto& ov_param : model->get_parameters()) {
+            ov_input_names.push_back(ov_param->get_friendly_name());
+        }
+        for (const auto& ov_output : model->get_results()) {
+            ov_output_names.push_back(ov_output->get_friendly_name());
+        }
+        ov_input_names_cache[cgraph] = ov_input_names;
+        ov_output_names_cache[cgraph] = ov_output_names;
+    }
+
+    auto ov_input_names = ov_input_names_cache[cgraph];
+    auto ov_output_names = ov_output_names_cache[cgraph];
+    for (size_t i = 0; i < ov_input_names.size(); i++) {
+        auto param_name = ov_input_names[i];
+        auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
+        infer_request.set_input_tensor(i, input_tensor);
+
+        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
+            print_input_tensor_info(param_name, input_tensor);
+        }
+    }
+    auto input_end_time = ggml_time_us();
+
+    infer_request.infer();
+    auto infer_end_time = ggml_time_us();
+
+    auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
+    for (size_t i = 0; i < ov_output_names.size(); i++) {
+        auto result_name = ov_output_names[i];
+        const auto output_tensor = infer_request.get_output_tensor(i);
+
+        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
+
+        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
+            print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs);
+        }
+    }
+    auto end_time = ggml_time_us();
+
+    if (getenv("GGML_OPENVINO_PROFILING")) {
+        GGML_LOG_INFO("GGML OpenVINO Backend: \n");
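+        // decoder = cgraph walk, conversion = GGML -> ov::Model translation,
+        // compile = core.compile_model (near zero on infer-request cache hits),
+        // input/output = tensor staging copies between ggml and OpenVINO.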
+ GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); + GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); + } + + return GGML_STATUS_SUCCESS; + GGML_UNUSED(backend); +} + +ov::AnyMap get_npu_config() { + ov::AnyMap config = { + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_HOST_GATHER", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared" }, + {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + }; + return config; +} + +bool is_naive(struct ggml_cgraph* cgraph) { + constexpr int naive_graph_size_threshold = 20; + return cgraph->n_nodes < naive_graph_size_threshold; +} + +enum ggml_status naive_compute(struct ggml_cgraph* cgraph, + ov::Core& core, + const std::string& device, + const ov::AnyMap& config) { + if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) { + return GGML_STATUS_SUCCESS; + } + + auto decoder = std::make_shared(cgraph); + auto input_model = std::make_shared(decoder); + auto naive = true; + auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); + auto infer_request = core.compile_model(model, device, config).create_infer_request(); + + auto ov_params = model->get_parameters(); + for (size_t i = 0; i < ov_params.size(); i++) { + auto param_name = ov_params[i]->get_friendly_name(); + auto input_tensor = get_ov_input_tensor(decoder, param_name); + infer_request.set_input_tensor(i, input_tensor); + } + + infer_request.infer(); + + auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder); + auto ov_results = model->get_results(); + for (size_t i = 0; i < ov_results.size(); i++) { + auto result_name = ov_results[i]->get_friendly_name(); + const auto output_tensor = infer_request.get_output_tensor(i); + + std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); + } + return GGML_STATUS_SUCCESS; +} + +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { + bool is_static = ggml_decoder->is_static(); + bool is_first_token = ggml_decoder->is_first_token(); + + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + + } else if (!is_static) { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + + } else { + if (param_name == "inp_tokens" || param_name == "inp_pos") { + if (is_first_token) { + size_t context_size = ggml_decoder->get_context_size(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0); + input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size}); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + input_tensor = 
convert_ggml_input_to_ov(ggml_decoder, param_name); + } + + } else if (param_name == "KQ_mask") { + size_t context_size = ggml_decoder->get_context_size(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + if (is_first_token) { + std::vector padded_data = + pad_input(input_tensor_ggml, context_size, context_size, -INFINITY); + set_zero_diagonal(padded_data, context_size); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size}); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } + + } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); + op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { + input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + } + return input_tensor; +} + +size_t checksum(const void* data, size_t size) { + const uint8_t* bytes = static_cast(data); + size_t sum = 0; + for (size_t i = 0; i < size; ++i) { + sum += (uint8_t) i; + sum += bytes[i]; + } + return sum; +} + +// Suppress deprecation warning for ov::Tensor::data() +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { + std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() + << std::endl; + switch (tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::i32: + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; + break; + case ov::element::i64: + std::cout << *(tensor.data()) << std::endl; + break; + default: + break; + } +} + +void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, + std::map& output_dst) { + std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() + << ", Address: " << output_dst[name] << std::endl; + switch (tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + case ov::element::f16: + std::cout << *(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + default: + break; + } +} + +#pragma GCC diagnostic pop + +void set_zero_diagonal(std::vector& matrix, size_t dim) { + for (size_t i = 0; i < dim; ++i) { + matrix[i * dim + i] = 0.0f; + } +} + +bool is_prefill(struct ggml_cgraph* cgraph) { + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto* op = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + auto* src = op->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == "inp_tokens") { + return src->ne[0] != 1; + } + } + } + GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph"); + throw std::runtime_error("is_prefill: inp_tokens not found in cgraph"); +} diff --git 
a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h new file mode 100644 index 0000000000000..0d71963f53aca --- /dev/null +++ b/ggml/src/ggml-openvino/utils.h @@ -0,0 +1,50 @@ +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-decoder.h" +#include "ggml-impl.h" + +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); + +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); + +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name); + +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); + +size_t checksum(const void* data, size_t size); + +void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor); + +void print_output_tensor_info(const std::string& name, + const ov::Tensor& tensor, + std::map& output_dst); + +template +std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) { + std::vector padded_data(padded_rows * padded_cols, pad_value); + size_t rows = tensor->ne[1]; + size_t cols = tensor->ne[0]; + T* data = static_cast(tensor->data); + + for (size_t i = 0; i < std::min(rows, padded_rows); ++i) { + for (size_t j = 0; j < std::min(cols, padded_cols); ++j) { + padded_data[i * padded_cols + j] = data[i * cols + j]; + } + } + return padded_data; +} + +void set_zero_diagonal(std::vector& matrix, size_t dim); + +bool is_prefill(struct ggml_cgraph * cgraph); + +ov::AnyMap get_npu_config(); + +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); + +bool is_naive(struct ggml_cgraph* cgraph); + +enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, + const ov::AnyMap& config); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 053c72d6dc8d1..b1f47fddce3c8 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1032,7 +1032,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); + cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); res->t_tokens = inp->tokens; @@ -1080,6 +1080,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { auto & cur = inp->pos; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd()); + cb(cur, "inp_pos", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1115,6 +1116,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const { auto & cur = inp->out_ids; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + cb(cur, "inp_out_ids", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1345,6 +1347,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); + cb(inp->kq_mask, "KQ_mask", -1); ggml_set_input(inp->kq_mask); inp->kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; @@ -1389,7 +1392,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1419,6 +1422,7 @@ static std::unique_ptr build_attn_inp_kv_unifie inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); + ggml_set_name(inp->self_kq_mask, "KQ_mask"); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1569,7 +1573,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1623,7 +1627,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9658abf969dd2..6d990a2537608 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -185,7 +185,9 @@ llama_build_and_test(test-json-partial.cpp) llama_build_and_test(test-log.cpp) llama_build_and_test(test-regex-partial.cpp) -llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) +if (NOT GGML_OPENVINO) + llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) +endif() # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32)