huggingface · molbap · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -1036,6 +1036,8 @@
         title: ColQwen2
       - local: model_doc/data2vec
         title: Data2Vec
+      - local: model_doc/deepseek_ocr
+        title: DeepSeekOCR
       - local: model_doc/deepseek_vl
         title: DeepseekVL
       - local: model_doc/deepseek_vl_hybrid

diff --git a/docs/source/en/model_doc/deepseek_ocr.md b/docs/source/en/model_doc/deepseek_ocr.md
@@ -0,0 +1,123 @@
+<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# DeepSeekOCR
+
+## Overview
+
+DeepSeekOCR is a vision-language model designed for optical character recognition (OCR) tasks. The model combines dual vision encoders (SAM and CLIP) with a language model to process images together with text prompts and generate contextually relevant OCR outputs. It supports tasks such as document understanding, grounding, and markdown conversion.
+The model uses a modified [DeepSeek-V2](./deepseek_v2) as its text decoder.
+
+### Usage tips
+
+The example below demonstrates how to perform OCR with grounding on a document image using the [`AutoModel`] class.
+
+```py
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+processor = AutoProcessor.from_pretrained("deepseek-ai/deepseek-ocr")
+model = AutoModel.from_pretrained("deepseek-ai/deepseek-ocr", torch_dtype=torch.bfloat16)
+
+image = Image.open("document.png").convert("RGB")
+
+conversation = [
+    {
+        "role": "<|User|>",
+        "content": [
+            {"type": "image", "path": "./document.png"},
+            {"type": "text", "text": "<|grounding|>Convert the document to markdown."},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    return_dict=True,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt"
+)
+
+with torch.no_grad():
+    generated = model.generate(**inputs, max_new_tokens=250)
+
+text = processor.batch_decode(generated, skip_special_tokens=False)[0]
+print(text)
+```
+
+## DeepseekOcrConfig
+
+[[autodoc]] DeepseekOcrConfig
+
+## DeepseekOcrVisionConfig
+
+[[autodoc]] DeepseekOcrVisionConfig
+
+## DeepseekOcrSamConfig
+
+[[autodoc]] DeepseekOcrSamConfig
+
+## DeepseekOcrCLIPVisionConfig
+
+[[autodoc]] DeepseekOcrCLIPVisionConfig
+
+## DeepseekOcrProjectorConfig
+
+[[autodoc]] DeepseekOcrProjectorConfig
+
+## DeepseekOcrProcessor
+
+[[autodoc]] DeepseekOcrProcessor
+
+## DeepseekOcrImageProcessorFast
+
+[[autodoc]] DeepseekOcrImageProcessorFast
+
+## DeepseekOcrModelOutputWithPast
+
+[[autodoc]] DeepseekOcrModelOutputWithPast
+
+## DeepseekOcrCausalLMOutputWithPast
+
+[[autodoc]] DeepseekOcrCausalLMOutputWithPast
+
+## DeepseekOcrTextModel
+
+[[autodoc]] DeepseekOcrTextModel
+    - forward
+
+## DeepseekOcrCLIPVisionModel
+
+[[autodoc]] DeepseekOcrCLIPVisionModel
+    - forward
+
+## DeepseekOcrProjector
+
+[[autodoc]] DeepseekOcrProjector
+    - forward
+
+## DeepseekOcrModel
+
+[[autodoc]] DeepseekOcrModel
+    - forward
+
+## DeepseekOcrForConditionalGeneration
+
+[[autodoc]] DeepseekOcrForConditionalGeneration
+    - forward
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -85,6 +85,7 @@
     from .deberta import *
     from .deberta_v2 import *
     from .decision_transformer import *
+    from .deepseek_ocr import *
     from .deepseek_v2 import *
     from .deepseek_v3 import *
     from .deepseek_vl import *

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -104,6 +104,7 @@
         ("deberta", "DebertaConfig"),
         ("deberta-v2", "DebertaV2Config"),
         ("decision_transformer", "DecisionTransformerConfig"),
+        ("deepseek_ocr", "DeepseekOcrConfig"),
         ("deepseek_v2", "DeepseekV2Config"),
         ("deepseek_v3", "DeepseekV3Config"),
         ("deepseek_vl", "DeepseekVLConfig"),
@@ -542,6 +543,7 @@
         ("deberta", "DeBERTa"),
         ("deberta-v2", "DeBERTa-v2"),
         ("decision_transformer", "Decision Transformer"),
+        ("deepseek_ocr", "DeepSeek-OCR"),
         ("deepseek_v2", "DeepSeek-V2"),
         ("deepseek_v3", "DeepSeek-V3"),
         ("deepseek_vl", "DeepseekVL"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -110,6 +110,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("deberta", "DebertaModel"),
         ("deberta-v2", "DebertaV2Model"),
         ("decision_transformer", "DecisionTransformerModel"),
+        ("deepseek_ocr", "DeepseekOcrForConditionalGeneration"),
         ("deepseek_v2", "DeepseekV2Model"),
         ("deepseek_v3", "DeepseekV3Model"),
         ("deepseek_vl", "DeepseekVLModel"),
@@ -1020,6 +1021,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("blip-2", "Blip2ForConditionalGeneration"),
         ("chameleon", "ChameleonForConditionalGeneration"),
         ("cohere2_vision", "Cohere2VisionForConditionalGeneration"),
+        ("deepseek_ocr", "DeepseekOcrForConditionalGeneration"),
         ("deepseek_vl", "DeepseekVLForConditionalGeneration"),
         ("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"),
         ("emu3", "Emu3ForConditionalGeneration"),

diff --git a/src/transformers/models/deepseek_ocr/__init__.py b/src/transformers/models/deepseek_ocr/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_deepseek_ocr import *
+    from .modeling_deepseek_ocr import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)