Commit dd4bfae
Add MiniCPM-V-4.0 to MiniCPM-V notebook (#3047)
Co-authored-by: Aleksandr Mokrov <[email protected]>
1 parent 561690c commit dd4bfae

File tree

3 files changed: +155 additions, -44 deletions

notebooks/minicpm-v-multimodal-chatbot/README.md

Lines changed: 3 additions & 4 deletions

@@ -1,8 +1,7 @@
-# Visual-language assistant with MiniCPM-V2 and OpenVINO
+# Visual-language assistant with MiniCPM-V and OpenVINO
 
-MiniCPM-V 2 is a strong multimodal large language model for efficient end-side deployment. MiniCPM-V 2.6 is the latest and most capable model in the MiniCPM-V series. The model is built on SigLip-400M and Qwen2-7B with a total of 8B parameters. It exhibits a significant performance improvement over previous versions, and introduces new features for multi-image and video understanding.
-
-More details about model can be found in [model card](https://huggingface.co/openbmb/MiniCPM-V-2_6) and original [repo](https://github.com/OpenBMB/MiniCPM-V).
+MiniCPM-V 4.0 is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency.
+More details about model can be found in [model card](https://huggingface.co/openbmb/MiniCPM-V-4) and original [repo](https://github.com/OpenBMB/MiniCPM-V).
 
 In this tutorial we consider how to convert and optimize MiniCPM-V2.6 model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf)
 
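The README's export flow can be sketched in Python. This is a minimal sketch, assuming the directory-naming convention shown elsewhere in this commit (`<model name>-ov`); the `export_cmd` string mirrors the `optimum-cli` command the notebook displays but is only printed here, not executed:

```python
from pathlib import Path

# Model id and output directory, following the notebook's naming convention
model_id = "openbmb/MiniCPM-V-4"
model_dir = Path(model_id.split("/")[-1] + "-ov")

# The export command shown in the notebook output (printed, not executed)
export_cmd = (
    f"optimum-cli export openvino --model {model_id} {model_dir} "
    "--trust-remote-code --weight-format fp16 --task image-text-to-text"
)
print(export_cmd)
```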

notebooks/minicpm-v-multimodal-chatbot/gradio_helper.py

Lines changed: 2 additions & 2 deletions

@@ -24,7 +24,7 @@
     Image.open(requests.get(url, stream=True).raw).save(file_name)
 
 
-def make_demo(model):
+def make_demo(model, mode_name):
     import openvino_genai as ov_genai
     import openvino as ov
 
@@ -119,7 +119,7 @@ def generate_and_signal_complete():
     additional_buttons = {"undo_button": None, "retry_button": None}
     demo = gr.ChatInterface(
         fn=bot_streaming,
-        title="MiniCPMV2 OpenVINO Chatbot",
+        title=f"{mode_name} OpenVINO Chatbot",
         examples=[
             {"text": "What is on the flower?", "files": ["./bee.jpg"]},
             {"text": "How to make this pastry?", "files": ["./baklava.png"]},
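The second hunk threads the selected model's name into the chat UI title. A minimal sketch of the new title logic, isolated from Gradio (the helper name `make_title` is hypothetical; the parameter name `mode_name` is taken verbatim from the commit):

```python
def make_title(mode_name: str) -> str:
    # Mirrors the f-string introduced in gradio_helper.py:
    #   title=f"{mode_name} OpenVINO Chatbot"
    return f"{mode_name} OpenVINO Chatbot"

# The notebook passes the last path component of the HF model id
print(make_title("openbmb/MiniCPM-V-4".split("/")[-1]))
```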

notebooks/minicpm-v-multimodal-chatbot/minicpm-v-multimodal-chatbot.ipynb

Lines changed: 150 additions & 38 deletions

@@ -6,18 +6,19 @@
   "id": "5918b41c-dad7-4f7b-9e39-b3026933dddf",
   "metadata": {},
   "source": [
-   "# Visual-language assistant with MiniCPM-V2 and OpenVINO\n",
+   "# Visual-language assistant with MiniCPM-V and OpenVINO\n",
    "\n",
-   "MiniCPM-V 2 is a strong multimodal large language model for efficient end-side deployment. MiniCPM-V 2.6 is the latest and most capable model in the MiniCPM-V series. The model is built on SigLip-400M and Qwen2-7B with a total of 8B parameters. It exhibits a significant performance improvement over previous versions, and introduces new features for multi-image and video understanding.\n",
+   "MiniCPM-V 4.0 is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency.\n",
+   "More details about model can be found in [model card](https://huggingface.co/openbmb/MiniCPM-V-4) and original [repo](https://github.com/OpenBMB/MiniCPM-V).\n",
    "\n",
-   "More details about model can be found in [model card](https://huggingface.co/openbmb/MiniCPM-V-2_6) and original [repo](https://github.com/OpenBMB/MiniCPM-V).\n",
    "\n",
-   "In this tutorial we consider how to convert and optimize MiniCPM-V2 model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf)\n",
+   "In this tutorial we consider how to convert and optimize MiniCPM-V model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using [NNCF](https://github.com/openvinotoolkit/nncf)\n",
    "\n",
    "#### Table of contents:\n",
    "\n",
    "- [Prerequisites](#Prerequisites)\n",
    "- [Convert model to OpenVINO Intermediate Representation](#Convert-model-to-OpenVINO-Intermediate-Representation)\n",
+   "  - [Select model](#Select-model)\n",
    "  - [Compress Language Model Weights to 4 bits](#Compress-Language-Model-Weights-to-4-bits)\n",
    "- [Prepare model inference pipeline](#Prepare-model-inference-pipeline)\n",
    "  - [Select device](#Select-device)\n",
@@ -47,14 +48,49 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "id": "0116846d-da6f-4e81-b6be-0a882a3eb872",
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "\n",
+     "[notice] A new release of pip is available: 25.1.1 -> 25.2\n",
+     "[notice] To update, run: pip install --upgrade pip\n",
+     "Note: you may need to restart the kernel to use updated packages.\n",
+     "\n",
+     "[notice] A new release of pip is available: 25.1.1 -> 25.2\n",
+     "[notice] To update, run: pip install --upgrade pip\n",
+     "Note: you may need to restart the kernel to use updated packages.\n",
+     "\n",
+     "[notice] A new release of pip is available: 25.1.1 -> 25.2\n",
+     "[notice] To update, run: pip install --upgrade pip\n",
+     "Note: you may need to restart the kernel to use updated packages.\n",
+     "ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+     "optimum-intel 1.26.0.dev0+7c64417 requires optimum==1.27.*, but you have optimum 2.0.0.dev0 which is incompatible.\n",
+     "\n",
+     "[notice] A new release of pip is available: 25.1.1 -> 25.2\n",
+     "[notice] To update, run: pip install --upgrade pip\n",
+     "Note: you may need to restart the kernel to use updated packages.\n",
+     "\n",
+     "[notice] A new release of pip is available: 25.1.1 -> 25.2\n",
+     "[notice] To update, run: pip install --upgrade pip\n",
+     "Note: you may need to restart the kernel to use updated packages.\n"
+    ]
+   }
+  ],
   "source": [
+   "import platform\n",
+   "\n",
+   "if platform.system() == \"Darwin\":\n",
+   "    %pip install -q \"numpy<2.0.0\"\n",
+   "\n",
    "%pip install -q \"torch>=2.1\" \"torchvision\" \"timm>=0.9.2\" \"transformers>=4.45\" \"Pillow\" \"gradio>=4.40\" \"tqdm\" \"sentencepiece\" \"peft\" \"huggingface-hub>=0.24.0\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
    "%pip install -q \"nncf>=2.14.0\"\n",
-   "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
+   "%pip install -q \"git+https://github.com/openvino-dev-samples/optimum-intel.git@minicpm4v\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
+   "%pip install -q \"git+https://github.com/openvino-dev-samples/optimum.git@minicpm4v\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
    "%pip install -q -U --pre \"openvino>=2025.0\" \"openvino-tokenizers>=2025.0\" \"openvino-genai>=2025.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
   ]
  },
@@ -110,6 +146,59 @@
    "\n",
    "where task is task to export the model for, if not specified, the task will be auto-inferred based on the model. You can find a mapping between tasks and model classes in Optimum TaskManager [documentation](https://huggingface.co/docs/optimum/exporters/task_manager). Additionally, you can specify weights compression using `--weight-format` argument with one of following options: `fp32`, `fp16`, `int8` and `int4`. Fro int8 and int4 [nncf](https://github.com/openvinotoolkit/nncf) will be used for weight compression. More details about model export provided in [Optimum Intel documentation](https://huggingface.co/docs/optimum/intel/openvino/export#export-your-model).\n",
    "\n",
+   "## Select model\n",
+   "[back to top ⬆️](#Table-of-contents:)\n",
+   "\n",
+   "* **MiniCPM-V-4**: MiniCPM-V 4.0 is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency.\n",
+   "* **MiniCPM-V-2_6**: MiniCPM-V 2.6 is built on SigLip-400M and Qwen2-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-Llama3-V 2.5, and introduces new features for multi-image and video understanding."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 3,
+  "id": "a0851b3c",
+  "metadata": {
+   "test_replace": {
+    "openbmb/MiniCPM-V-4": "katuni4ka/tiny-random-minicpmv-2_6"
+   }
+  },
+  "outputs": [
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "289c2574f5604076bdcd8eccabc4a14f",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "Dropdown(description='Model:', options=('openbmb/MiniCPM-V-4', 'openbmb/MiniCPM-V-2_6'), value='openbmb/MiniCP…"
+     ]
+    },
+    "execution_count": 3,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "import ipywidgets as widgets\n",
+   "\n",
+   "model_ids = [\"openbmb/MiniCPM-V-4\", \"openbmb/MiniCPM-V-2_6\"]\n",
+   "\n",
+   "model_selector = widgets.Dropdown(\n",
+   "    options=model_ids,\n",
+   "    default=model_ids[0],\n",
+   "    description=\"Model:\",\n",
+   ")\n",
+   "\n",
+   "\n",
+   "model_selector"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "59dcd94b",
+  "metadata": {},
+  "source": [
    "### Compress Language Model Weights to 4 bits\n",
    "[back to top ⬆️](#Table-of-contents:)\n",
    "\n",
@@ -134,20 +223,60 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": null,
   "id": "82e846bb",
-  "metadata": {
-   "test_replace": {
-    "openbmb/MiniCPM-V-2_6": "katuni4ka/tiny-random-minicpmv-2_6"
-   }
-  },
+  "metadata": {},
   "outputs": [
+   {
+    "data": {
+     "text/markdown": [
+      "**Export command:**"
+     ],
+     "text/plain": [
+      "<IPython.core.display.Markdown object>"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "text/markdown": [
+      "`optimum-cli export openvino --model openbmb/MiniCPM-V-4 MiniCPM-V-4-ov --trust-remote-code --weight-format fp16 --task image-text-to-text`"
+     ],
+     "text/plain": [
+      "<IPython.core.display.Markdown object>"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n"
+     "WARNING:nncf:NNCF provides best results with torch==2.7.*, while current torch version is 2.5.1+cpu. If you encounter issues, consider switching to torch==2.7.*\n",
+     "INFO:nncf:Statistics of the bitwidth distribution:\n",
+     "┍━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+     "│ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
+     "┝━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
+     "│ int4_sym                │ 100% (225 / 225)          │ 100% (225 / 225)                     │\n",
+     "┕━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n"
     ]
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "e5a6ec13d42f41109d029aced33475ff",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "Output()"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
    }
   ],
   "source": [
@@ -174,11 +303,10 @@
    "    shutil.move(ov_int4_model_path.with_suffix(\".bin\"), ov_model_path.with_suffix(\".bin\"))\n",
    "\n",
    "\n",
-   "model_id = \"openbmb/MiniCPM-V-2_6\"\n",
-   "model_dir = Path(model_id.split(\"/\")[-1] + \"-ov\")\n",
+   "model_dir = Path(model_selector.value.split(\"/\")[-1] + \"-ov\")\n",
    "\n",
    "if not model_dir.exists():\n",
-   "    optimum_cli(model_id, model_dir, additional_args={\"trust-remote-code\": \"\", \"weight-format\": \"fp16\", \"task\": \"image-text-to-text\"})\n",
+   "    optimum_cli(model_selector.value, model_dir, additional_args={\"trust-remote-code\": \"\", \"weight-format\": \"fp16\", \"task\": \"image-text-to-text\"})\n",
    "    compress_lm_weights(model_dir)"
   ]
  },
@@ -213,26 +341,10 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "id": "626fef57",
   "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "application/vnd.jupyter.widget-view+json": {
-      "model_id": "2362638a795340e6b3effb0805848768",
-      "version_major": 2,
-      "version_minor": 0
-     },
-     "text/plain": [
-      "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')"
-     ]
-    },
-    "execution_count": 1,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
+  "outputs": [],
   "source": [
    "from notebook_utils import device_widget\n",
    "\n",
@@ -243,7 +355,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 5,
+  "execution_count": null,
   "id": "e7af404b",
   "metadata": {},
   "outputs": [],
@@ -394,7 +506,7 @@
   "source": [
    "from gradio_helper import make_demo\n",
    "\n",
-   "demo = make_demo(ov_model)\n",
+   "demo = make_demo(ov_model, model_selector.value.split(\"/\")[-1])\n",
    "\n",
    "try:\n",
    "    demo.launch(debug=True, height=600)\n",
@@ -422,7 +534,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.11.4"
+  "version": "3.10.12"
  },
  "openvino_notebooks": {
   "imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/7b0919ea-6fe4-4c8f-8395-cb0ee6e87937",
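The notebook changes above derive everything downstream from the dropdown selection: the export target directory and the Gradio demo title both come from `model_selector.value`. A minimal sketch of that flow without ipywidgets (the plain `selected` string is a stand-in for the real `widgets.Dropdown` value):

```python
from pathlib import Path

# Model ids offered by the new dropdown in the notebook
model_ids = ["openbmb/MiniCPM-V-4", "openbmb/MiniCPM-V-2_6"]

# Stand-in for model_selector.value (the notebook reads a widgets.Dropdown)
selected = model_ids[0]

# Output directory, as derived in the conversion cell
model_dir = Path(selected.split("/")[-1] + "-ov")

# Demo title, as composed by make_demo(ov_model, model_selector.value.split("/")[-1])
title = f"{selected.split('/')[-1]} OpenVINO Chatbot"

print(model_dir, "|", title)
```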
