diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index e317998a36..a7c79b002b 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -725,6 +725,10 @@
title: DAB-DETR
- local: model_doc/deepseek_v2
title: DeepSeek-V2
+ - local: model_doc/deepseek_vl
+ title: DeepseekVL
+ - local: model_doc/deepseek_vl_hybrid
+ title: DeepseekVLHybrid
- local: model_doc/deformable_detr
title: Deformable DETR
- local: model_doc/deit
diff --git a/docs/source/en/model_doc/deepseek_vl.md b/docs/source/en/model_doc/deepseek_vl.md
new file mode 100644
index 0000000000..625a2c90b0
--- /dev/null
+++ b/docs/source/en/model_doc/deepseek_vl.md
@@ -0,0 +1,220 @@
+
+
+
+
+# DeepseekVL
+
+[Deepseek-VL](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images and generate contextually relevant responses. The model uses [LLaMA](./llama) as its language model (text decoder) and [SigLIP](./siglip) as its image encoder.
+
+You can find all the original Deepseek-VL checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
+
+> [!TIP]
+> Click on the Deepseek-VL models in the right sidebar for more examples of how to apply Deepseek-VL to different vision and language tasks.
+
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+
+
+
+
+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+ task="image-text-to-text",
+ model="deepseek-community/deepseek-vl-1.3b-chat",
+ device=0,
+ torch_dtype=torch.float16
+)
+
+messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+ },
+ { "type": "text", "text": "Describe this image."},
+ ]
+ }
+]
+
+pipe(text=messages, max_new_tokens=20, return_full_text=False)
+```
+
+
+
+
+```py
+import torch
+from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
+
+model = DeepseekVLForConditionalGeneration.from_pretrained(
+ "deepseek-community/deepseek-vl-1.3b-chat",
+ torch_dtype=torch.float16,
+ device_map="auto",
+ attn_implementation="sdpa"
+)
+
+processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
+
+messages = [
+ {
+ "role":"user",
+ "content":[
+ {
+ "type":"image",
+ "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+ },
+ {
+ "type":"text",
+ "text":"Describe this image."
+ }
+ ]
+ }
+
+]
+
+inputs = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt"
+).to(model.device, dtype=model.dtype)
+
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+
+print(output_text)
+```
+
+
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import TorchAoConfig, DeepseekVLForConditionalGeneration, AutoProcessor
+
+quantization_config = TorchAoConfig(
+ "int4_weight_only",
+ group_size=128
+)
+
+model = DeepseekVLForConditionalGeneration.from_pretrained(
+ "deepseek-community/deepseek-vl-1.3b-chat",
+ torch_dtype=torch.bfloat16,
+ device_map="auto",
+ quantization_config=quantization_config
+)
+```
+## Notes
+
+- Do inference with multiple images in a single conversation.
+ ```py
+ import torch
+ from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
+
+ model = DeepseekVLForConditionalGeneration.from_pretrained(
+ "deepseek-community/deepseek-vl-1.3b-chat",
+ torch_dtype=torch.float16,
+ device_map="auto",
+ attn_implementation="sdpa"
+ )
+
+ processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
+
+ messages = [
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What’s the difference between"},
+ {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+ {"type": "text", "text": " and "},
+ {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
+ ]
+ }
+ ],
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
+ {"type": "text", "text": "What do you see in this image?"}
+ ]
+ }
+ ]
+ ]
+
+ inputs = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ padding=True,
+ truncation=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt"
+ ).to(model.device, dtype=model.dtype)
+
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
+ generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
+
+ print(output_text)
+ ```
+
+## DeepseekVLConfig
+
+[[autodoc]] DeepseekVLConfig
+
+## DeepseekVLProcessor
+
+[[autodoc]] DeepseekVLProcessor
+
+## DeepseekVLImageProcessor
+
+[[autodoc]] DeepseekVLImageProcessor
+
+## DeepseekVLModel
+
+[[autodoc]] DeepseekVLModel
+ - forward
+
+## DeepseekVLForConditionalGeneration
+
+[[autodoc]] DeepseekVLForConditionalGeneration
+ - forward
diff --git a/docs/source/en/model_doc/deepseek_vl_hybrid.md b/docs/source/en/model_doc/deepseek_vl_hybrid.md
new file mode 100644
index 0000000000..86e1672bce
--- /dev/null
+++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md
@@ -0,0 +1,219 @@
+
+
+
+
+

+

+
+
+
+# DeepseekVLHybrid
+
+[Deepseek-VL-Hybrid](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images and generate contextually relevant responses. Deepseek-VL-Hybrid is a variant of Deepseek-VL with a hybrid vision encoder: it uses [LLaMA](./llama) as its language model (text decoder), [SigLIP](./siglip) to encode low-resolution images, and [SAM (Segment Anything Model)](./sam) to encode high-resolution images, which enhances the model's ability to process fine-grained visual details.
+
+You can find all the original Deepseek-VL-Hybrid checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
+
+> [!TIP]
+> Click on the Deepseek-VL-Hybrid models in the right sidebar for more examples of how to apply Deepseek-VL-Hybrid to different vision and language tasks.
+
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+
+
+
+
+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+ task="image-text-to-text",
+ model="deepseek-community/deepseek-vl-7b-chat",
+ device=0,
+ torch_dtype=torch.float16
+)
+
+messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+ },
+ { "type": "text", "text": "Describe this image."},
+ ]
+ }
+]
+
+pipe(text=messages, max_new_tokens=20, return_full_text=False)
+```
+
+
+
+
+```py
+import torch
+from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
+
+model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+ "deepseek-community/deepseek-vl-7b-chat",
+ torch_dtype=torch.float16,
+ device_map="auto",
+ attn_implementation="sdpa"
+)
+
+processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+
+messages = [
+ {
+ "role":"user",
+ "content":[
+ {
+ "type":"image",
+ "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+ },
+ {
+ "type":"text",
+ "text":"Describe this image."
+ }
+ ]
+ }
+
+]
+
+inputs = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt"
+).to(model.device, dtype=model.dtype)
+
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+
+print(output_text)
+```
+
+
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import TorchAoConfig, DeepseekVLHybridForConditionalGeneration, AutoProcessor
+
+quantization_config = TorchAoConfig(
+ "int4_weight_only",
+ group_size=128
+)
+
+model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+ "deepseek-community/deepseek-vl-7b-chat",
+ torch_dtype=torch.bfloat16,
+ device_map="auto",
+ quantization_config=quantization_config
+)
+```
+## Notes
+
+- Do inference with multiple images in a single conversation.
+ ```py
+ import torch
+ from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
+
+ model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+ "deepseek-community/deepseek-vl-7b-chat",
+ torch_dtype=torch.float16,
+ device_map="auto",
+ attn_implementation="sdpa"
+ )
+
+ processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+
+ messages = [
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What’s the difference between"},
+ {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+ {"type": "text", "text": " and "},
+ {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
+ ]
+ }
+ ],
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
+ {"type": "text", "text": "What do you see in this image?"}
+ ]
+ }
+ ]
+ ]
+
+ inputs = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ padding=True,
+ truncation=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt"
+ ).to(model.device, dtype=model.dtype)
+
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
+ generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
+
+ print(output_text)
+ ```
+
+## DeepseekVLHybridConfig
+
+[[autodoc]] DeepseekVLHybridConfig
+
+## DeepseekVLHybridProcessor
+
+[[autodoc]] DeepseekVLHybridProcessor
+
+## DeepseekVLHybridImageProcessor
+
+[[autodoc]] DeepseekVLHybridImageProcessor
+
+## DeepseekVLHybridModel
+
+[[autodoc]] DeepseekVLHybridModel
+ - forward
+
+## DeepseekVLHybridForConditionalGeneration
+
+[[autodoc]] DeepseekVLHybridForConditionalGeneration
+ - forward
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 7b59f958f0..b691cea112 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -84,6 +84,8 @@ if TYPE_CHECKING:
from .decision_transformer import *
from .deepseek_v2 import *
from .deepseek_v3 import *
+ from .deepseek_vl import *
+ from .deepseek_vl_hybrid import *
from .deformable_detr import *
from .deit import *
from .deprecated import *
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 4d22bd00ef..eb25e0d025 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -103,6 +103,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("decision_transformer", "DecisionTransformerConfig"),
("deepseek_v2", "DeepseekV2Config"),
("deepseek_v3", "DeepseekV3Config"),
+ ("deepseek_vl", "DeepseekVLConfig"),
+ ("deepseek_vl_hybrid", "DeepseekVLHybridConfig"),
("deformable_detr", "DeformableDetrConfig"),
("deit", "DeiTConfig"),
("depth_anything", "DepthAnythingConfig"),
@@ -495,6 +497,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("decision_transformer", "Decision Transformer"),
("deepseek_v2", "DeepSeek-V2"),
("deepseek_v3", "DeepSeek-V3"),
+ ("deepseek_vl", "DeepseekVL"),
+ ("deepseek_vl_hybrid", "DeepseekVLHybrid"),
("deformable_detr", "Deformable DETR"),
("deit", "DeiT"),
("deplot", "DePlot"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index cd0473a2d7..0a0cc6a38c 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -77,6 +77,8 @@ else:
("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
+ ("deepseek_vl", ("DeepseekVLImageProcessor",)),
+ ("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor",)),
("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 9d6622f389..85eb8ff6bb 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -97,6 +97,8 @@ MODEL_MAPPING_NAMES = OrderedDict(
("decision_transformer", "DecisionTransformerModel"),
("deepseek_v2", "DeepseekV2Model"),
("deepseek_v3", "DeepseekV3Model"),
+ ("deepseek_vl", "DeepseekVLModel"),
+ ("deepseek_vl_hybrid", "DeepseekVLHybridModel"),
("deformable_detr", "DeformableDetrModel"),
("deit", "DeiTModel"),
("depth_pro", "DepthProModel"),
@@ -935,6 +937,8 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
+ ("deepseek_vl", "DeepseekVLForConditionalGeneration"),
+ ("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"),
("emu3", "Emu3ForConditionalGeneration"),
("evolla", "EvollaForProteinText2Text"),
("fuyu", "FuyuForCausalLM"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 31b798c805..cc2be544f4 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -62,6 +62,8 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("clvp", "ClvpProcessor"),
("colpali", "ColPaliProcessor"),
("colqwen2", "ColQwen2Processor"),
+ ("deepseek_vl", "DeepseekVLProcessor"),
+ ("deepseek_vl_hybrid", "DeepseekVLHybridProcessor"),
("dia", "DiaProcessor"),
("emu3", "Emu3Processor"),
("evolla", "EvollaProcessor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 6c4e3e98c7..6e5b07dddf 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -193,6 +193,20 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
+ (
+ "deepseek_vl",
+ (
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
+ ),
+ ),
+ (
+ "deepseek_vl_hybrid",
+ (
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
+ ),
+ ),
("dia", ("DiaTokenizer", None)),
(
"diffllama",
diff --git a/src/transformers/models/deepseek_vl/__init__.py b/src/transformers/models/deepseek_vl/__init__.py
new file mode 100644
index 0000000000..2422b31e31
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deepseek_vl import *
+ from .image_processing_deepseek_vl import *
+ from .image_processing_deepseek_vl_fast import *
+ from .modeling_deepseek_vl import *
+ from .processing_deepseek_vl import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
new file mode 100644
index 0000000000..af99ac9eeb
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
@@ -0,0 +1,96 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a
+    DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DeepseekVL
+    [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone. A dictionary without a `"model_type"` entry is
+            treated as a `"llama"` configuration.
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
+            The config object or dictionary of the vision backbone. A dictionary without a `"model_type"` entry is
+            treated as a `"siglip_vision_model"` configuration.
+        image_token_id (`int`, *optional*, defaults to 100015):
+            The index representing image tokens in the model's token vocabulary.
+
+    Example:
+
+    ```python
+    >>> from transformers import DeepseekVLConfig, DeepseekVLModel
+
+    >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> configuration = DeepseekVLConfig()
+
+    >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> model = DeepseekVLModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_vl"
+    # Both sub-configs are resolved through the auto classes, so any registered backbone type can be plugged in.
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        text_config: AutoConfig = None,
+        vision_config: AutoConfig = None,
+        image_token_id: int = 100015,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # A missing sub-config becomes an empty dict so that the default backbone is built below.
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
+
+        # Dictionaries are converted to config objects via `CONFIG_MAPPING`, keyed by their `model_type`.
+        # NOTE: the dictionary passed by the caller is updated in place with the resolved `model_type`.
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+
+        # Config objects passed in directly are stored as-is.
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.image_token_id = image_token_id
+
+
+__all__ = ["DeepseekVLConfig"]
diff --git a/src/transformers/models/deepseek_vl/convert_deepseek_vl_weights_to_hf.py b/src/transformers/models/deepseek_vl/convert_deepseek_vl_weights_to_hf.py
new file mode 100644
index 0000000000..3e9b6a37fe
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/convert_deepseek_vl_weights_to_hf.py
@@ -0,0 +1,356 @@
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import gc
+import json
+import os
+from typing import Optional
+
+import regex as re
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
+from huggingface_hub.errors import HFValidationError
+from safetensors.torch import load_file
+
+from transformers import (
+ AutoTokenizer,
+ DeepseekVLConfig,
+ DeepseekVLForConditionalGeneration,
+ DeepseekVLImageProcessor,
+ DeepseekVLProcessor,
+)
+from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
+
+
+# Maps regex patterns over original checkpoint keys to their replacement in the converted checkpoint.
+# The patterns are applied with `re.sub` in insertion order by `convert_old_keys_to_new_keys`.
+# NOTE: the `.` characters in the patterns are not escaped, so they are regex wildcards that also match the
+# literal dots found in checkpoint keys.
+# NOTE: the `(q|k|v)` in the fused-qkv replacement is a literal placeholder, not a regex group; it is expanded
+# into three separate keys by `get_qkv_state_dict`.
+# fmt: off
+ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
+    # Siglip (Low Resolution)
+    r"vision_model.vision_tower.pos_embed": r"model.vision_model.vision_model.embeddings.position_embedding.weight",
+    r"vision_model.vision_tower.patch_embed.proj.(weight|bias)": r"model.vision_model.vision_model.embeddings.patch_embedding.\1",
+    r"vision_model.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.(q|k|v)_proj.\2",
+    r"vision_model.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.out_proj.\2",
+    r"vision_model.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.layer_norm\2.\3",
+    r"vision_model.vision_tower.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.mlp.fc\2.\3",
+    r"vision_model.vision_tower.norm.(weight|bias)": r"model.vision_model.vision_model.post_layernorm.\1",
+    r"vision_model.vision_tower.attn_pool.latent": r"model.vision_model.vision_model.head.probe",
+    r"vision_model.vision_tower.attn_pool.proj.(weight|bias)": r"model.vision_model.vision_model.head.attention.out_proj.\1",
+    r"vision_model.vision_tower.attn_pool.norm.(weight|bias)": r"model.vision_model.vision_model.head.layernorm.\1",
+    r"vision_model.vision_tower.attn_pool.mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.head.mlp.fc\1.\2",
+
+    # Aligner
+    r"aligner.layers.0.(weight|bias)": r"model.aligner.linear1.\1",
+    r"aligner.layers.2.(weight|bias)": r"model.aligner.linear2.\1",
+
+    # Llama (Text Model)
+    r"language_model.model.(\w+)": r"model.language_model.\1",
+    r"language_model.lm_head.(weight|bias)": r"lm_head.\1",
+}
+# fmt: on
+
+# Adopted from https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/utils/conversation.py#L80-L91
+CHAT_TEMPLATE = (
+ # Define separators and initialize counter
+ "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}"
+ "{% set i = 0 %}"
+ # Start with default system prompt
+ "You are a helpful language and vision assistant. "
+ "You are able to understand the visual content that the user provides, "
+ "and assist the user with a variety of tasks using natural language.\n\n"
+ # Iterate through messages
+ "{% for message in messages %}"
+ # Identify user or assistant role
+ "{% if message['role']|lower == 'user' %}"
+ "User: "
+ "{% elif message['role']|lower == 'assistant' %}"
+ "Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}"
+ "{% else %}"
+ "{{ message['role'].capitalize() }}: "
+ "{% endif %}"
+ # Iterate through message content (text/images)
+ "{% for content in message['content'] %}"
+ # If content is an image, replace with placeholder
+ "{% if content['type'] == 'image' %}"
+ ""
+ # If content is text, handle formatting
+ "{% elif content['type'] == 'text' %}"
+ "{% set text = content['text'] %}"
+ # Strip whitespace for first and last text blocks
+ "{% if loop.first %}{% set text = text.lstrip() %}{% endif %}"
+ "{% if loop.last %}{% set text = text.rstrip() %}{% endif %}"
+ # If previous content was text, add space
+ "{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}"
+ "{{ ' ' + text }}"
+ "{% else %}"
+ "{{ text }}"
+ "{% endif %}"
+ "{% endif %}"
+ "{% endfor %}" # End message content loop
+ # Add separators between messages
+ "{% if not loop.last or add_generation_prompt %}"
+ "{% if message['role']|lower == 'user' %}"
+ "{{ seps[0] }}"
+ "{% else %}"
+ "{{ seps[1] }}"
+ "{% endif %}"
+ "{% endif %}"
+ "{% endfor %}" # End messages loop
+ # Add final Assistant prompt if required
+ "{% if add_generation_prompt %}Assistant:{% endif %}"
+)
+
+
+def convert_old_keys_to_new_keys(state_dict_keys: dict):
+ output_dict = {}
+
+ old_text = "\n".join(state_dict_keys)
+ new_text = old_text
+ for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
+ if replacement is None:
+ new_text = re.sub(pattern, "", new_text) # an empty line
+ continue
+ new_text = re.sub(pattern, replacement, new_text)
+ output_dict = dict(zip(old_text.split("\n"), new_text.split("\n")))
+
+ return output_dict
+
+
+def get_qkv_state_dict(key, parameter):
+ """
+ new key which looks like this
+ xxxx.(q|k|v).xxx (m, n)
+
+ is converted to
+ xxxx.q.xxxx (m//3, n)
+ xxxx.k.xxxx (m//3, n)
+ xxxx.v.xxxx (m//3, n)
+ """
+ qkv_state_dict = {}
+ placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)"
+ replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value']
+ replacements_vals = torch.split(
+ parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0
+ )
+ for replacement_key, replacement_val in zip(replacements_keys, replacements_vals):
+ qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val
+ return qkv_state_dict
+
+
+def update_state_dict(old_state_dict):
+ all_keys = list(old_state_dict.keys())
+ new_keys = convert_old_keys_to_new_keys(all_keys)
+
+ state_dict = {}
+ for key in all_keys:
+ new_key = new_keys[key]
+ current_parameter = old_state_dict.pop(key)
+
+ if "qkv" in key and "vision_tower_high" not in key:
+ qkv_state_dict = get_qkv_state_dict(new_key, current_parameter)
+ state_dict.update(qkv_state_dict)
+ elif "pos_embed" in key:
+ if "vision_tower_high" not in key:
+ # timm implementation of siglip creates this param of size [1, 576, 1024]
+ # transformers implementation of siglip creates this param of size [576, 1024]
+ state_dict[new_key] = current_parameter.squeeze(0)
+ else:
+ state_dict[new_key] = current_parameter
+ else:
+ state_dict[new_key] = current_parameter
+
+ return state_dict
+
+
+def load_model_state_dict(input_path: str) -> dict:
+ """
+ Load model state dict, handling both single and sharded files.
+ """
+ index_path = os.path.join(input_path, "model.safetensors.index.json")
+ single_file_path = os.path.join(input_path, "model.safetensors")
+
+ # Check if we have a sharded model
+ if os.path.exists(index_path):
+ print("Loading sharded model...")
+ state_dict = {}
+ with open(index_path, "r") as f:
+ index = json.load(f)
+
+ # Get unique shard files and load each one only once
+ unique_shard_files = sorted(set(index["weight_map"].values()))
+ for shard_file in unique_shard_files:
+ print(f"Loading shard {shard_file}...")
+ shard_path = os.path.join(input_path, shard_file)
+ shard_dict = load_file(shard_path)
+ state_dict.update(shard_dict)
+
+ return state_dict
+
+ # Single file model
+ elif os.path.exists(single_file_path):
+ print("Loading single file model...")
+ return load_file(single_file_path, device="cpu")
+
+ else:
+ raise ValueError(f"No model files found in {input_path}")
+
+
+def convert_model(
+ hf_repo_id: str,
+ output_dir: Optional[str] = None,
+ output_hub_path: Optional[str] = None,
+ safe_serialization: bool = True,
+):
+ if output_dir:
+ os.makedirs(output_dir, exist_ok=True)
+
+ try:
+ input_path = snapshot_download(hf_repo_id)
+ except HFValidationError:
+ # If the input path is not a HF repo ID, assume it's a local path
+ input_path = hf_repo_id
+
+ # ------------------------------------------------------------
+ # Create and save config
+ # ------------------------------------------------------------
+
+ config = DeepseekVLConfig(
+ text_config={
+ "hidden_size": 2048,
+ "intermediate_size": 5632,
+ "max_position_embeddings": 16384,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "vocab_size": 102400,
+ },
+ vision_config={
+ "hidden_size": 1024,
+ "intermediate_size": 4096,
+ "image_size": 384,
+ "patch_size": 16,
+ "hidden_act": "gelu",
+ "vision_use_head": False,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ },
+ )
+
+ # save config
+ if output_dir:
+ config.save_pretrained(output_dir)
+ print("Model config saved successfully...")
+
+ # ------------------------------------------------------------
+ # Convert processor
+ # ------------------------------------------------------------
+
+ image_processor = DeepseekVLImageProcessor(
+ image_mean=IMAGENET_STANDARD_MEAN,
+ image_std=IMAGENET_STANDARD_STD,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(
+ input_path,
+ extra_special_tokens={
+ "pad_token": "<|end▁of▁sentence|>",
+ "image_token": "",
+ },
+ )
+
+ processor = DeepseekVLProcessor(
+ image_processor=image_processor,
+ tokenizer=tokenizer,
+ chat_template=CHAT_TEMPLATE,
+ )
+
+ if output_dir:
+ print(f"Saving processor to {output_dir}...")
+ processor.save_pretrained(output_dir)
+ if output_hub_path:
+ print(f"Pushing processor to hub at {output_hub_path}...")
+ processor.push_to_hub(output_hub_path)
+
+ # ------------------------------------------------------------
+ # Convert weights
+ # ------------------------------------------------------------
+
+ print("Creating empty model...")
+ with init_empty_weights():
+ model = DeepseekVLForConditionalGeneration(config)
+
+ # Load and convert state dict
+ print("Loading state dict...")
+ state_dict = load_model_state_dict(input_path)
+ state_dict = update_state_dict(state_dict)
+
+ # Load converted state dict
+ print("Loading converted weights into model...")
+ info = model.load_state_dict(state_dict, strict=False, assign=True)
+ if len(info.missing_keys) > 0:
+ raise ValueError(f"Missing keys: {info.missing_keys}")
+
+ # Tie weights before any device mapping
+ print("Tying weights...")
+ model.tie_weights()
+
+ # Save the model
+ if output_dir:
+ print(f"Saving model to {output_dir}...")
+ model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+ if output_hub_path:
+ print(f"Pushing model to hub at {output_hub_path}...")
+ model.push_to_hub(output_hub_path, safe_serialization=safe_serialization)
+
+ del state_dict, model
+ gc.collect()
+
+ # Validate the saved model if saved locally
+ if output_dir:
+ print("Reloading the local model to check if it's saved correctly...")
+ DeepseekVLForConditionalGeneration.from_pretrained(output_dir, device_map="auto")
+ print("Local model reloaded successfully.")
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--hf_repo_id",
+ default="deepseek-ai/deepseek-vl-1.3b-chat",
+ help="Location of official weights from DeepseekAI on HF",
+ )
+ parser.add_argument(
+ "--output_dir",
+ default=None,
+ help="Location to write the converted model and processor",
+ )
+ parser.add_argument(
+ "--output_hub_path",
+ default=None,
+ help="Repository ID to push model to hub (e.g. 'username/model-name')",
+ )
+ parser.add_argument(
+ "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
+ )
+ args = parser.parse_args()
+
+ convert_model(
+ hf_repo_id=args.hf_repo_id,
+ output_dir=args.output_dir,
+ output_hub_path=args.output_hub_path,
+ safe_serialization=args.safe_serialization,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py
new file mode 100644
index 0000000000..fad24220ef
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py
@@ -0,0 +1,414 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+ OPENAI_CLIP_MEAN,
+ OPENAI_CLIP_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ make_flat_list_of_images,
+ to_numpy_array,
+ valid_images,
+ validate_preprocess_arguments,
+)
+from ...utils import (
+ TensorType,
+ filter_out_non_signature_kwargs,
+ is_vision_available,
+ logging,
+)
+
+
+if is_vision_available():
+ import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a DEEPSEEK_VL image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+ min_size (`int`, *optional*, defaults to 14):
+ The minimum allowed size for the resized image. Ensures that neither the height nor width
+ falls below this value after resizing.
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `resample` parameter in the `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+ overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """
+
+ model_input_names = ["pixel_values"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ min_size: int = 14,
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"height": 384, "width": 384}
+ size = get_size_dict(size, default_to_square=True)
+
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+ self.do_convert_rgb = do_convert_rgb
+
+ self.min_size = min_size
+ if image_mean is None:
+ self.background_color = (127, 127, 127)
+ else:
+ self.background_color = tuple([int(x * 255) for x in image_mean])
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: Union[dict[str, int], int],
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize an image to dynamically calculated size.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+ data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `None`: will be inferred from input
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+ Returns:
+ `np.ndarray`: The resized image.
+ """
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(image)
+
+ height, width = get_image_size(image, input_data_format)
+ max_size = max(height, width)
+
+ size = get_size_dict(size, default_to_square=True)
+ if size["height"] != size["width"]:
+ raise ValueError(
+ f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+ )
+ size = size["height"]
+
+ delta = size / max_size
+ # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+ output_size_nonpadded = [
+ max(int(height * delta), self.min_size),
+ max(int(width * delta), self.min_size),
+ ]
+
+ image = resize(
+ image,
+ size=output_size_nonpadded,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+ # Expand and pad the images to obtain a square image of dimensions `size x size`
+ image = self.pad_to_square(
+ image=image,
+ background_color=self.background_color,
+ input_data_format=input_data_format,
+ )
+ return image
+
+ @filter_out_non_signature_kwargs()
+ def preprocess(
+ self,
+ images: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ resample: PILImageResampling = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ data_format: ChannelDimension = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> PIL.Image.Image:
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ images (`ImageInput`):
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`dict[str, int]`, *optional*, defaults to `self.size`):
+ Controls the size of the image after `resize`. The shortest edge of the image is resized to
+ `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
+ is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
+ edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean to normalize the image by if `do_normalize` is set to `True`.
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+ Whether to convert the image to RGB.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ resample = resample if resample is not None else self.resample
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+ size = size if size is not None else self.size
+ size = get_size_dict(size, default_to_square=False)
+ images = make_flat_list_of_images(images)
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+ # PIL RGBA images are converted to RGB
+ if do_convert_rgb:
+ images = [convert_to_rgb(image) for image in images]
+
+ # All transformations expect numpy arrays.
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ if do_resize:
+ images = [
+ self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_rescale:
+ images = [
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_normalize:
+ images = [
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ images = [
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+ ]
+
+ encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+ return encoded_outputs
+
+ def pad_to_square(
+ self,
+ image: np.ndarray,
+ background_color: Union[int, tuple[int, int, int]] = 0,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.array:
+ """
+ Pads an image to a square based on the longest edge.
+
+ Args:
+ image (`np.ndarray`):
+ The image to pad.
+ background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+ The color to use for the padding. Can be an integer for single channel or a
+ tuple of integers representing for multi-channel images. If passed as integer
+ in mutli-channel mode, it will default to `0` in subsequent channels.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ If unset, will use same as the input image.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+ Returns:
+ `np.ndarray`: The padded image.
+ """
+ height, width = get_image_size(image, input_data_format)
+ num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]
+
+ if height == width:
+ image = (
+ to_channel_dimension_format(image, data_format, input_data_format)
+ if data_format is not None
+ else image
+ )
+ return image
+
+ max_dim = max(height, width)
+
+ # Ensure background_color is the correct shape
+ if isinstance(background_color, int):
+ background_color = [background_color]
+ elif len(background_color) != num_channels:
+ raise ValueError(
+ f"background_color must have no more than {num_channels} elements to match the number of channels"
+ )
+
+ if input_data_format == ChannelDimension.FIRST:
+ result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
+ for i, color in enumerate(background_color):
+ result[i, :, :] = color
+ if width > height:
+ start = (max_dim - height) // 2
+ result[:, start : start + height, :] = image
+ else:
+ start = (max_dim - width) // 2
+ result[:, :, start : start + width] = image
+ else:
+ result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
+ for i, color in enumerate(background_color):
+ result[:, :, i] = color
+ if width > height:
+ start = (max_dim - height) // 2
+ result[start : start + height, :, :] = image
+ else:
+ start = (max_dim - width) // 2
+ result[:, start : start + width, :] = image
+
+ return result
+
+ def postprocess(self):
+ """Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
+ raise AttributeError("Not needed for DeepseekVL")
+
+
+__all__ = ["DeepseekVLImageProcessor"]
diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
new file mode 100644
index 0000000000..ce85d739bc
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
@@ -0,0 +1,349 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+ TransformersKwargs,
+ auto_docstring,
+ can_return_tuple,
+ is_torch_available,
+)
+from ..auto import AutoModel
+from .configuration_deepseek_vl import DeepseekVLConfig
+
+
+if is_torch_available():
+ import torch
+ import torch.nn as nn
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DeepseekVL model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    """
+)
+class DeepseekVLBaseModelOutputWithPast(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+
+        If `past_key_values` is used, only the last hidden-state of the sequences, of shape `(batch_size, 1,
+        hidden_size)`, is output.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and, optionally if
+        `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        Image embeddings produced by the vision encoder and projected by the aligner (see
+        `DeepseekVLModel.get_image_features`). `None` when no `pixel_values` are passed.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    # NOTE(review): annotated as nested tuples while the docstring and `forward` signatures use `Cache` - confirm.
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    # NOTE(review): annotated as a tuple, but `DeepseekVLModel.forward` stores the aligner output directly - confirm.
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DeepseekVL causal language model (or autoregressive) outputs.
+    """
+)
+class DeepseekVLCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        Image embeddings forwarded from the wrapped `DeepseekVLModel` output (vision encoder output projected by
+        the aligner). `None` when no `pixel_values` are passed.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    # NOTE(review): annotated as a list of tensors while the docstring and `forward` signatures use `Cache` - confirm.
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    # NOTE(review): annotated as a tuple, but the value is passed through from `DeepseekVLModel.forward` - confirm.
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class DeepseekVLAligner(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ in_features = config.vision_config.hidden_size
+ out_features = config.text_config.hidden_size
+
+ self.linear1 = nn.Linear(in_features, out_features)
+ self.activation = nn.GELU()
+ self.linear2 = nn.Linear(out_features, out_features)
+
+ def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor:
+ x = self.linear1(vision_encodings)
+ x = self.activation(x)
+ x = self.linear2(x)
+ return x
+
+
+@auto_docstring
+class DeepseekVLPreTrainedModel(PreTrainedModel):
+ config: DeepseekVLConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["LlamaDecoderLayer"]
+ _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+ _supports_flash_attn = True
+ _supports_sdpa = True
+
+ _supports_static_cache = True
+ _supports_param_buffer_assignment = False
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ # Required only for Linear layer in DeepseekVLAligner
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+
+
+@auto_docstring
+class DeepseekVLModel(DeepseekVLPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.vision_model = AutoModel.from_config(config.vision_config)
+ self.aligner = DeepseekVLAligner(config)
+
+ self.language_model = AutoModel.from_config(config=config.text_config)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing.
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.language_model.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.language_model.set_input_embeddings(value)
+
+ def get_image_features(self, pixel_values):
+ image_embeds = self.vision_model(pixel_values)
+ image_embeds = self.aligner(image_embeds.last_hidden_state)
+ return image_embeds
+
+ @can_return_tuple
+ @auto_docstring
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ pixel_values: torch.FloatTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ **kwargs,
+ ):
+ if (input_ids is None) ^ (inputs_embeds is not None):
+ raise ValueError(
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+ )
+ if inputs_embeds is None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+
+ if pixel_values is not None:
+ if input_ids is None:
+ image_attention_mask = inputs_embeds == self.get_input_embeddings()(
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+ )
+ image_attention_mask = image_attention_mask.all(-1)
+ else:
+ image_attention_mask = input_ids == self.config.image_token_id
+
+ image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+ image_embeds = self.get_image_features(pixel_values)
+ image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+ inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
+
+ lm_output = self.language_model(
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ logits_to_keep=logits_to_keep,
+ **kwargs,
+ )
+
+ return DeepseekVLBaseModelOutputWithPast(
+ last_hidden_state=lm_output.last_hidden_state,
+ past_key_values=lm_output.past_key_values,
+ hidden_states=lm_output.hidden_states,
+ attentions=lm_output.attentions,
+ image_hidden_states=image_embeds if pixel_values is not None else None,
+ )
+
+
+class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
+ _supports_static_cache = True
+
+ def __init__(self, config: DeepseekVLConfig):
+ super().__init__(config)
+ self.config = config
+ self.model = DeepseekVLModel(config)
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing.
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.language_model.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.model.language_model.set_input_embeddings(value)
+
+ def prepare_embeddings_for_image_generation(self) -> torch.Tensor:
+ raise AttributeError("Not needed for DeepseekVL")
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @can_return_tuple
+ @auto_docstring
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ pixel_values: torch.FloatTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ **kwargs: Unpack[TransformersKwargs],
+ ):
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ """
+ outputs = self.model(
+ input_ids=input_ids,
+ pixel_values=pixel_values,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ **kwargs,
+ )
+ hidden_states = outputs.last_hidden_state
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+ )
+
+ return DeepseekVLCausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ image_hidden_states=outputs.image_hidden_states,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ pixel_values=None,
+ past_key_values=None,
+ attention_mask=None,
+ inputs_embeds=None,
+ cache_position=None,
+ logits_to_keep=None,
+ **kwargs,
+ ):
+ # Overwritten -- extra custom processing
+
+ model_inputs = super().prepare_inputs_for_generation(
+ input_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ cache_position=cache_position,
+ logits_to_keep=logits_to_keep,
+ **kwargs,
+ )
+
+ # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+ # Otherwise we need pixel values to be passed to model
+ if cache_position[0] == 0:
+ model_inputs["pixel_values"] = pixel_values
+
+ return model_inputs
+
+
+__all__ = ["DeepseekVLPreTrainedModel", "DeepseekVLModel", "DeepseekVLForConditionalGeneration"]
diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
new file mode 100644
index 0000000000..a5190a280b
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -0,0 +1,326 @@
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+
+from ...configuration_utils import PretrainedConfig
+from ...image_processing_utils import BatchFeature
+from ...image_utils import (
+ ImageInput,
+ make_flat_list_of_images,
+)
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import (
+ PreTokenizedInput,
+ TextInput,
+)
+from ...utils import (
+ auto_docstring,
+ is_torch_available,
+ logging,
+)
+from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
+from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
+from ..janus.image_processing_janus import JanusImageProcessor
+from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel
+
+
+if is_torch_available():
+ import torch
+ import torch.nn as nn
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a
+    DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DeepseekVL
+    [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone. A dictionary without a `"model_type"` key is
+            treated as a `"llama"` configuration. The dictionary passed in is not modified.
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
+            The config object or dictionary of the vision backbone. A dictionary without a `"model_type"` key is
+            treated as a `"siglip_vision_model"` configuration. The dictionary passed in is not modified.
+        image_token_id (`int`, *optional*, defaults to 100015):
+            The index representing image tokens in the model's token vocabulary.
+
+    Example:
+
+    ```python
+    >>> from transformers import DeepseekVLConfig, DeepseekVLModel
+
+    >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> configuration = DeepseekVLConfig()
+
+    >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> model = DeepseekVLModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_vl"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        text_config: AutoConfig = None,
+        vision_config: AutoConfig = None,
+        image_token_id: int = 100015,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
+
+        if isinstance(text_config, dict):
+            # Build a new dict so the default `model_type` is never written into the caller's dictionary.
+            text_config = {"model_type": "llama", **text_config}
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+        if isinstance(vision_config, dict):
+            # Same as above: an explicit `model_type` in the input wins over the default.
+            vision_config = {"model_type": "siglip_vision_model", **vision_config}
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.image_token_id = image_token_id
+
+
+# Base-model output type: subclasses the Idefics output without adding or overriding anything.
+class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast):
+ pass
+
+
+# Causal-LM output type: subclasses the Idefics output without adding or overriding anything.
+class DeepseekVLCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast):
+ pass
+
+
+class DeepseekVLAligner(nn.Module):
+    """
+    Two-layer projection (Linear -> GELU -> Linear) from the vision hidden size to the text hidden size.
+
+    The input width is `config.vision_config.hidden_size`; both linear layers output
+    `config.text_config.hidden_size` features.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        vision_dim = config.vision_config.hidden_size
+        text_dim = config.text_config.hidden_size
+
+        # Sub-modules are registered in the order linear1, activation, linear2.
+        self.linear1 = nn.Linear(vision_dim, text_dim)
+        self.activation = nn.GELU()
+        self.linear2 = nn.Linear(text_dim, text_dim)
+
+    def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor:
+        """Project `vision_encodings` (last dimension = vision hidden size) to the text hidden size."""
+        projected = self.activation(self.linear1(vision_encodings))
+        return self.linear2(projected)
+
+
+class DeepseekVLPreTrainedModel(JanusPreTrainedModel):
+    _no_split_modules = ["LlamaDecoderLayer"]
+
+    def _init_weights(self, module):
+        """Initialize `nn.Linear` modules; every other module type is left untouched."""
+        # Required only for Linear layer in DeepseekVLAligner
+        if not isinstance(module, nn.Linear):
+            return
+
+        # Weights are drawn from N(0, initializer_range) of the text config; biases, when present, are zeroed.
+        init_std = self.config.text_config.initializer_range
+        module.weight.data.normal_(mean=0.0, std=init_std)
+        if module.bias is not None:
+            module.bias.data.zero_()
+
+
+# Base DeepseekVL model: a vision backbone, an aligner and a language backbone, built on top of `JanusModel`.
+@auto_docstring
+class DeepseekVLModel(JanusModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ # Vision backbone is built from `config.vision_config` through `AutoModel`.
+ self.vision_model = AutoModel.from_config(config.vision_config)
+ # The aligner projects vision hidden states to the text hidden size.
+ self.aligner = DeepseekVLAligner(config)
+
+ # Language backbone is built from `config.text_config` through `AutoModel`.
+ self.language_model = AutoModel.from_config(config=config.text_config)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing.
+ self.post_init()
+
+ # Remove attributes that are not used by this model.
+ # NOTE(review): presumably these are set by `JanusModel.__init__` and the `del` statements tell the modular
+ # converter to omit them from the generated modeling file -- confirm against the modular converter docs.
+ del self.vqmodel
+ del self.generation_embeddings
+ del self.generation_aligner
+ del self.generation_head
+
+
+# Generation model derived from `JanusForConditionalGeneration`. The three overrides below only raise
+# `AttributeError` and accept no argument besides `self`.
+# NOTE(review): presumably this is the modular-converter idiom for dropping inherited methods from the generated
+# modeling file (so the generated class would not carry these stubs) -- confirm against the converter docs.
+class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration):
+ def prepare_embeddings_for_image_generation(self):
+ raise AttributeError("Not needed for DeepseekVL")
+
+ def decode_image_tokens(self):
+ raise AttributeError("Not needed for DeepseekVL")
+
+ def generate(self):
+ raise AttributeError("Not needed for DeepseekVL")
+
+
+# Image processor derived from `JanusImageProcessor`. The two overrides below only raise `AttributeError`.
+# NOTE(review): presumably the modular-converter idiom for removing `postprocess` / `unnormalize` from the
+# generated image processor -- confirm against the converter docs.
+class DeepseekVLImageProcessor(JanusImageProcessor):
+ def postprocess(self):
+ raise AttributeError("Not needed for DeepseekVL")
+
+ def unnormalize(self):
+ raise AttributeError("Not needed for DeepseekVL")
+
+
+# Keyword-argument schema for `DeepseekVLProcessor.__call__`, passed to `_merge_kwargs` there.
+class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False):
+ # Defaults applied when the caller does not override them: no padding for text, and `"pt"` tensors as
+ # `return_tensors` (declared under `common_kwargs`, presumably shared by tokenizer and image processor).
+ _defaults = {
+ "text_kwargs": {"padding": False},
+ "common_kwargs": {"return_tensors": "pt"},
+ }
+
+
+class DeepseekVLProcessor(ProcessorMixin):
+    r"""
+    Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor.
+
+    [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`DeepseekVLImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input. It must expose an `image_token` attribute.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        num_image_tokens (`int`, *optional*, defaults to 576):
+            The number of special image tokens used as placeholders for visual content in text sequences.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template", "num_image_tokens"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        num_image_tokens=576,
+    ):
+        # The placeholder string is read from the tokenizer; `__call__` repeats it `num_image_tokens` times.
+        self.image_token = tokenizer.image_token
+        self.num_image_tokens = num_image_tokens
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        **kwargs: Unpack[DeepseekVLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence must be a string; every occurrence
+                of the image token in a sequence is expanded to `num_image_tokens` copies before tokenization.
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+
+        Raises:
+            ValueError: If both `text` and `images` are `None`, or if `text` is neither a string nor a list/tuple
+                of strings.
+        """
+        output_kwargs = self._merge_kwargs(
+            DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        # Start from an empty mapping so that an images-only call still has somewhere to store `pixel_values`.
+        data = {}
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Expand every image placeholder to the number of image tokens the model expects per image.
+            one_img_tokens = self.image_token * self.num_image_tokens
+            prompt_strings = [prompt.replace(self.image_token, one_img_tokens) for prompt in text]
+
+            data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+
+        # Process images when they are provided.
+        if images is not None:
+            images = make_flat_list_of_images(images)
+            data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+
+        return BatchFeature(data=data)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image-processor input names, duplicates removed, order preserved."""
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+__all__ = [
+ "DeepseekVLConfig",
+ "DeepseekVLPreTrainedModel",
+ "DeepseekVLModel",
+ "DeepseekVLForConditionalGeneration",
+ "DeepseekVLImageProcessor",
+ "DeepseekVLProcessor",
+]
diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py
new file mode 100644
index 0000000000..244e642d7c
--- /dev/null
+++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py
@@ -0,0 +1,157 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput, make_flat_list_of_images
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+# Keyword-argument schema for `DeepseekVLProcessor.__call__`, passed to `_merge_kwargs` there.
+class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False):
+ # Defaults applied when the caller does not override them: no padding for text, and `"pt"` tensors as
+ # `return_tensors` (declared under `common_kwargs`, presumably shared by tokenizer and image processor).
+ _defaults = {
+ "text_kwargs": {"padding": False},
+ "common_kwargs": {"return_tensors": "pt"},
+ }
+
+
+class DeepseekVLProcessor(ProcessorMixin):
+    r"""
+    Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor.
+
+    [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`DeepseekVLImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input. It must expose an `image_token` attribute.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        num_image_tokens (`int`, *optional*, defaults to 576):
+            The number of special image tokens used as placeholders for visual content in text sequences.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template", "num_image_tokens"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        num_image_tokens=576,
+    ):
+        # The placeholder string is read from the tokenizer; `__call__` repeats it `num_image_tokens` times.
+        self.image_token = tokenizer.image_token
+        self.num_image_tokens = num_image_tokens
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        **kwargs: Unpack[DeepseekVLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence must be a string; every occurrence
+                of the image token in a sequence is expanded to `num_image_tokens` copies before tokenization.
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+
+        Raises:
+            ValueError: If both `text` and `images` are `None`, or if `text` is neither a string nor a list/tuple
+                of strings.
+        """
+        output_kwargs = self._merge_kwargs(
+            DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        # Start from an empty mapping so that an images-only call still has somewhere to store `pixel_values`.
+        data = {}
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Expand every image placeholder to the number of image tokens the model expects per image.
+            one_img_tokens = self.image_token * self.num_image_tokens
+            prompt_strings = [prompt.replace(self.image_token, one_img_tokens) for prompt in text]
+
+            data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+
+        # Process images when they are provided.
+        if images is not None:
+            images = make_flat_list_of_images(images)
+            data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+
+        return BatchFeature(data=data)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image-processor input names, duplicates removed, order preserved."""
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+__all__ = ["DeepseekVLProcessor"]
diff --git a/src/transformers/models/deepseek_vl_hybrid/__init__.py b/src/transformers/models/deepseek_vl_hybrid/__init__.py
new file mode 100644
index 0000000000..1836d196ac
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# The star imports below are only evaluated by static type checkers (`TYPE_CHECKING` is false at runtime).
+if TYPE_CHECKING:
+ from .configuration_deepseek_vl_hybrid import *
+ # NOTE(review): this module name reads `..._vl_fast_hybrid`; confirm it matches the file name of the fast
+ # image processor in this package (a mismatch would only surface under type checking).
+ from .image_processing_deepseek_vl_fast_hybrid import *
+ from .image_processing_deepseek_vl_hybrid import *
+ from .modeling_deepseek_vl_hybrid import *
+ from .processing_deepseek_vl_hybrid import *
+else:
+ import sys
+
+ # At runtime this package's entry in `sys.modules` is replaced by a `_LazyModule` whose import structure is
+ # derived from this file's path via `define_import_structure`.
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..c3a5aa5260
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py
@@ -0,0 +1,108 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLHybridConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DeepseekVLHybridModel`]. It is used to instantiate a
+ DeepseekVLHybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the DeepseekVLHybrid
+ [deepseek-community/deepseek-vl-7b-chat](https://huggingface.co/deepseek-community/deepseek-vl-7b-chat) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+ The config object or dictionary of the text backbone.
+ vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
+ The config object or dictionary of the vision backbone.
+ high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`):
+ The config object or dictionary of the high resolution vision backbone.
+ image_token_id (`int`, *optional*, defaults to 100015):
+ The index representing image tokens in the model's token vocabulary.
+
+ Example:
+
+ ```python
+ >>> from transformers import DeepseekVLHybridConfig, DeepseekVLHybridModel
+
+ >>> # Initializing a DeepseekVLHybrid deepseek-community/deepseek-vl-7b-chat style configuration
+ >>> configuration = DeepseekVLHybridConfig()
+
+ >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-7b-chat style configuration
+ >>> model = DeepseekVLHybridModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deepseek_vl_hybrid"
+ # Each of the three sub-configurations is declared with `AutoConfig` as its class.
+ sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig}
+
+ def __init__(
+ self,
+ text_config: AutoConfig = None,
+ vision_config: AutoConfig = None,
+ high_res_vision_config: AutoConfig = None,
+ image_token_id: int = 100015,
+ **kwargs,
+ ):
+ # Remaining keyword arguments are handled by `PretrainedConfig`.
+ super().__init__(**kwargs)
+
+ # A missing sub-config becomes an empty dict, which the dict branches below turn into a default config.
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+ logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
+
+ # Dict inputs are converted to config objects looked up in `CONFIG_MAPPING` by `model_type`
+ # ("llama" when the key is absent). Note that `model_type` is written into the dict that was passed in.
+ if isinstance(text_config, dict):
+ text_config["model_type"] = text_config.get("model_type", "llama")
+ text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+ # Same conversion for the vision backbone, defaulting to "siglip_vision_model".
+ if isinstance(vision_config, dict):
+ vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
+ vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+
+ self.text_config = text_config
+ self.vision_config = vision_config
+ self.image_token_id = image_token_id
+
+ # The high-resolution vision backbone follows the same pattern, defaulting to "sam_vision_model".
+ if high_res_vision_config is None:
+ high_res_vision_config = {}
+ logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.")
+
+ if isinstance(high_res_vision_config, dict):
+ high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model")
+ high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config)
+
+ self.high_res_vision_config = high_res_vision_config
+
+
+__all__ = ["DeepseekVLHybridConfig"]
diff --git a/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py b/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py
new file mode 100644
index 0000000000..9f377a53c8
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py
@@ -0,0 +1,394 @@
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import gc
+import json
+import os
+from typing import Optional
+
+import regex as re
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
+from huggingface_hub.errors import HFValidationError
+from safetensors.torch import load_file
+
+from transformers import (
+ AutoTokenizer,
+ DeepseekVLHybridConfig,
+ DeepseekVLHybridForConditionalGeneration,
+ DeepseekVLHybridImageProcessor,
+ DeepseekVLHybridProcessor,
+)
+from transformers.image_utils import (
+ IMAGENET_STANDARD_MEAN,
+ IMAGENET_STANDARD_STD,
+ OPENAI_CLIP_MEAN,
+ OPENAI_CLIP_STD,
+ PILImageResampling,
+)
+
+
+# Regex pattern -> replacement table used by `convert_old_keys_to_new_keys`: every pattern is applied in
+# order with `re.sub` to the newline-joined list of original checkpoint keys.
+# A replacement may contain a literal `(a|b|c)` placeholder (see the Siglip `attn.qkv` entry); such keys are
+# expanded into one key per alternative by `get_qkv_state_dict`.
+# NOTE(review): the `.` in the patterns is not escaped, so it matches any character, not only a literal dot.
+# fmt: off
+ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
+ # Sam (High Resolution)
+ r"vision_model.vision_tower_high.vision_tower.pos_embed": r"model.high_res_vision_model.vision_encoder.pos_embed",
+ r"vision_model.vision_tower_high.vision_tower.patch_embed.proj.(weight|bias)": r"model.high_res_vision_model.vision_encoder.patch_embed.projection.\1",
+ r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.layer_norm\2.\3",
+ r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.rel_pos_(h|w)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.rel_pos_\2",
+ r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.qkv.\2",
+ r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.proj.\2",
+ r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).mlp.lin(\d+).(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.mlp.lin\2.\3",
+ r"vision_model.vision_tower_high.vision_tower.neck.0.weight": r"model.high_res_vision_model.vision_encoder.neck.conv1.weight",
+ r"vision_model.vision_tower_high.vision_tower.neck.1.(weight|bias)": r"model.high_res_vision_model.vision_encoder.neck.layer_norm1.\1",
+ r"vision_model.vision_tower_high.vision_tower.neck.2.weight": r"model.high_res_vision_model.vision_encoder.neck.conv2.weight",
+ r"vision_model.vision_tower_high.vision_tower.neck.3.(weight|bias)": r"model.high_res_vision_model.vision_encoder.neck.layer_norm2.\1",
+ r"vision_model.vision_tower_high.vision_tower.neck_hd.0.weight": r"model.high_res_vision_neck.conv1.weight",
+ r"vision_model.vision_tower_high.vision_tower.neck_hd.1.(weight|bias)": r"model.high_res_vision_neck.layer_norm1.\1",
+ r"vision_model.vision_tower_high.vision_tower.neck_hd.2.weight": r"model.high_res_vision_neck.conv2.weight",
+ r"vision_model.vision_tower_high.vision_tower.neck_hd.3.(weight|bias)": r"model.high_res_vision_neck.layer_norm2.\1",
+ r"vision_model.vision_tower_high.vision_tower.downsamples.0.weight": r"model.high_res_vision_proj.conv1.weight",
+ r"vision_model.vision_tower_high.vision_tower.downsamples.1.weight": r"model.high_res_vision_proj.conv2.weight",
+ r"vision_model.vision_tower_high.vision_tower.hd_alpha_downsamples": r"model.high_res_vision_alpha",
+
+ # Siglip (Low Resolution)
+ r"vision_model.vision_tower_low.vision_tower.pos_embed": r"model.vision_model.vision_model.embeddings.position_embedding.weight",
+ r"vision_model.vision_tower_low.vision_tower.patch_embed.proj.(weight|bias)": r"model.vision_model.vision_model.embeddings.patch_embedding.\1",
+ # The fused qkv parameter keeps a `(q|k|v)` placeholder that is split into three keys later.
+ r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.(q|k|v)_proj.\2",
+ r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.out_proj.\2",
+ r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.layer_norm\2.\3",
+ r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.mlp.fc\2.\3",
+ r"vision_model.vision_tower_low.vision_tower.norm.(weight|bias)": r"model.vision_model.vision_model.post_layernorm.\1",
+ r"vision_model.vision_tower_low.vision_tower.attn_pool.latent": r"model.vision_model.vision_model.head.probe",
+ r"vision_model.vision_tower_low.vision_tower.attn_pool.proj.(weight|bias)": r"model.vision_model.vision_model.head.attention.out_proj.\1",
+ r"vision_model.vision_tower_low.vision_tower.attn_pool.norm.(weight|bias)": r"model.vision_model.vision_model.head.layernorm.\1",
+ r"vision_model.vision_tower_low.vision_tower.attn_pool.mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.head.mlp.fc\1.\2",
+
+ # Vision Projection
+ r"aligner.layers.1.(weight|bias)": r"model.aligner.proj.\1",
+ r"aligner.low_up_proj.(weight|bias)": r"model.aligner.vision_proj.\1",
+ r"aligner.high_up_proj.(weight|bias)": r"model.aligner.high_res_vision_proj.\1",
+
+ # Llama (Text Model)
+ r"language_model.model.(\w+)": r"model.language_model.\1",
+ r"language_model.lm_head.(weight|bias)": r"lm_head.\1",
+}
+# fmt: on
+
+# Adopted from https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/utils/conversation.py#L80-L91
+# Jinja chat template stored on the processor. It renders the default system prompt followed by
+# "User: ..." / "Assistant: ..." turns; every image entry is rendered as the `<image_placeholder>` token.
+CHAT_TEMPLATE = (
+    # Define separators and initialize counter
+    "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}"
+    "{% set i = 0 %}"
+    # Start with default system prompt
+    "You are a helpful language and vision assistant. "
+    "You are able to understand the visual content that the user provides, "
+    "and assist the user with a variety of tasks using natural language.\n\n"
+    # Iterate through messages
+    "{% for message in messages %}"
+    # Identify user or assistant role
+    "{% if message['role']|lower == 'user' %}"
+    "User: "
+    "{% elif message['role']|lower == 'assistant' %}"
+    "Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}"
+    "{% else %}"
+    "{{ message['role'].capitalize() }}: "
+    "{% endif %}"
+    # Iterate through message content (text/images)
+    "{% for content in message['content'] %}"
+    # If content is an image, replace with placeholder.
+    # An empty string here would drop the image from the prompt entirely, so the token must be spelled out.
+    "{% if content['type'] == 'image' %}"
+    "<image_placeholder>"
+    # If content is text, handle formatting
+    "{% elif content['type'] == 'text' %}"
+    "{% set text = content['text'] %}"
+    # Strip whitespace for first and last text blocks
+    "{% if loop.first %}{% set text = text.lstrip() %}{% endif %}"
+    "{% if loop.last %}{% set text = text.rstrip() %}{% endif %}"
+    # If previous content was text, add space
+    "{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}"
+    "{{ ' ' + text }}"
+    "{% else %}"
+    "{{ text }}"
+    "{% endif %}"
+    "{% endif %}"
+    "{% endfor %}"  # End message content loop
+    # Add separators between messages
+    "{% if not loop.last or add_generation_prompt %}"
+    "{% if message['role']|lower == 'user' %}"
+    "{{ seps[0] }}"
+    "{% else %}"
+    "{{ seps[1] }}"
+    "{% endif %}"
+    "{% endif %}"
+    "{% endfor %}"  # End messages loop
+    # Add final Assistant prompt if required
+    "{% if add_generation_prompt %}Assistant:{% endif %}"
+)
+
+
+def convert_old_keys_to_new_keys(state_dict_keys: list[str]):
+    """
+    Map every original checkpoint key to its converted name.
+
+    All keys are joined with newlines into a single text, each pattern of `ORIGINAL_TO_CONVERTED_KEY_MAPPING`
+    is applied to that text in order with `re.sub`, and the result is split back into one key per line.
+
+    Args:
+        state_dict_keys: the original parameter names (any iterable of strings; callers pass a list).
+
+    Returns:
+        `dict` mapping each original key to its converted key. An empty input yields an empty dict.
+    """
+    # Without this guard "".split("\n") would produce a bogus {"": ""} entry for an empty input.
+    if not state_dict_keys:
+        return {}
+
+    old_text = "\n".join(state_dict_keys)
+    new_text = old_text
+    for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
+        # A `None` replacement blanks the matched text (the key's line becomes shorter or empty).
+        new_text = re.sub(pattern, "" if replacement is None else replacement, new_text)
+
+    # Substitutions never add or remove newlines as long as keys contain none, so both sides line up.
+    return dict(zip(old_text.split("\n"), new_text.split("\n")))
+
+
+def get_qkv_state_dict(key, parameter):
+    """
+    Split a fused parameter into one entry per alternative of the placeholder in `key`.
+
+    A converted key which looks like this
+        xxxx.(q|k|v)_proj.xxxx      (m, n)
+
+    is converted to
+        xxxx.q_proj.xxxx            (m//3, n)
+        xxxx.k_proj.xxxx            (m//3, n)
+        xxxx.v_proj.xxxx            (m//3, n)
+
+    Args:
+        key: converted key containing exactly the `(a|b|...)` placeholder to expand (first match is used).
+        parameter: tensor that is split in equal parts along dimension 0, one part per alternative.
+
+    Returns:
+        `dict` mapping each expanded key to its slice of `parameter`, in placeholder order.
+
+    Raises:
+        ValueError: if `key` has no placeholder, or if dimension 0 of `parameter` is not divisible by the
+            number of alternatives (an uneven split would otherwise silently drop the trailing rows).
+    """
+    match = re.search(r"(\(.*?\))", key)
+    if match is None:
+        raise ValueError(f"Expected a placeholder such as '(q|k|v)' in key {key!r}, but found none.")
+
+    placeholder = match.group(1)  # e.g. "(q|k|v)"
+    replacements_keys = placeholder[1:-1].split("|")  # e.g. ["q", "k", "v"]
+
+    num_parts = len(replacements_keys)
+    if parameter.size(0) % num_parts != 0:
+        raise ValueError(
+            f"Cannot split parameter for key {key!r}: dimension 0 has size {parameter.size(0)}, "
+            f"which is not divisible by {num_parts}."
+        )
+
+    replacements_vals = torch.split(parameter, split_size_or_sections=parameter.size(0) // num_parts, dim=0)
+    return {
+        key.replace(placeholder, replacement_key): replacement_val
+        for replacement_key, replacement_val in zip(replacements_keys, replacements_vals)
+    }
+
+
+def update_state_dict(old_state_dict):
+    """
+    Build the converted state dict from the original checkpoint's state dict.
+
+    Every key is renamed through `convert_old_keys_to_new_keys`. Entries are popped from `old_state_dict`
+    as they are processed, so the input dict is empty afterwards.
+    """
+    original_keys = list(old_state_dict.keys())
+    renamed_keys = convert_old_keys_to_new_keys(original_keys)
+
+    converted = {}
+    for old_key in original_keys:
+        target_key = renamed_keys[old_key]
+        tensor = old_state_dict.pop(old_key)
+        # Only keys outside the high resolution tower get the reshaping treatment below.
+        outside_high_res_tower = "vision_tower_high" not in old_key
+
+        if outside_high_res_tower and "qkv" in old_key:
+            # Fused qkv parameter: expand the placeholder in `target_key` into one entry per projection.
+            converted.update(get_qkv_state_dict(target_key, tensor))
+        elif outside_high_res_tower and "pos_embed" in old_key:
+            # timm implementation of siglip creates this param of size [1, 576, 1024]
+            # transformers implementation of siglip creates this param of size [576, 1024]
+            converted[target_key] = tensor.squeeze(0)
+        else:
+            converted[target_key] = tensor
+
+    return converted
+
+
+def load_model_state_dict(input_path: str) -> dict:
+    """
+    Load model state dict, handling both single and sharded files.
+
+    Args:
+        input_path: directory that contains either `model.safetensors.index.json` together with the shard
+            files it lists, or a single `model.safetensors` file. The index file takes precedence.
+
+    Returns:
+        `dict` with every tensor of the checkpoint, loaded on CPU.
+
+    Raises:
+        ValueError: if neither the index file nor the single file exists in `input_path`.
+    """
+    index_path = os.path.join(input_path, "model.safetensors.index.json")
+    single_file_path = os.path.join(input_path, "model.safetensors")
+
+    # Check if we have a sharded model
+    if os.path.exists(index_path):
+        print("Loading sharded model...")
+        # JSON is UTF-8; do not depend on the platform's locale encoding.
+        with open(index_path, "r", encoding="utf-8") as f:
+            index = json.load(f)
+
+        # Get unique shard files and load each one only once
+        state_dict = {}
+        for shard_file in sorted(set(index["weight_map"].values())):
+            print(f"Loading shard {shard_file}...")
+            shard_path = os.path.join(input_path, shard_file)
+            # Same explicit device as the single file branch below.
+            state_dict.update(load_file(shard_path, device="cpu"))
+
+        return state_dict
+
+    # Single file model
+    if os.path.exists(single_file_path):
+        print("Loading single file model...")
+        return load_file(single_file_path, device="cpu")
+
+    raise ValueError(f"No model files found in {input_path}")
+
+
+def convert_model(
+    hf_repo_id: str,
+    output_dir: Optional[str] = None,
+    output_hub_path: Optional[str] = None,
+    safe_serialization: bool = True,
+):
+    """
+    Convert an original checkpoint into a `DeepseekVLHybridForConditionalGeneration` model plus processor.
+
+    Args:
+        hf_repo_id: repository id handed to `snapshot_download`. If it is rejected with `HFValidationError`,
+            the value is used as a local directory instead.
+        output_dir: when set, the directory is created and the config, processor and model are saved there;
+            the saved model is reloaded at the end as a sanity check.
+        output_hub_path: when set, the processor and the model are pushed to this hub repository.
+        safe_serialization: forwarded to `save_pretrained` / `push_to_hub` of the model.
+
+    Raises:
+        ValueError: if the converted state dict leaves model parameters without a value, or (from
+            `load_model_state_dict`) if no checkpoint files are found.
+    """
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    try:
+        input_path = snapshot_download(hf_repo_id)
+    except HFValidationError:
+        # If the input path is not a HF repo ID, assume it's a local path
+        input_path = hf_repo_id
+
+    # ------------------------------------------------------------
+    # Create and save config
+    # ------------------------------------------------------------
+
+    config = DeepseekVLHybridConfig(
+        text_config={
+            "hidden_size": 4096,
+            "intermediate_size": 11008,
+            "max_position_embeddings": 16384,
+            "num_attention_heads": 32,
+            "num_hidden_layers": 30,
+            "vocab_size": 102400,
+        },
+        vision_config={
+            "hidden_size": 1024,
+            "intermediate_size": 4096,
+            "image_size": 384,
+            "patch_size": 16,
+            "hidden_act": "gelu",
+            "vision_use_head": False,
+            "num_attention_heads": 16,
+            "num_hidden_layers": 24,
+        },
+        high_res_vision_config={
+            "hidden_size": 768,
+            "intermediate_size": 3072,
+            "image_size": 1024,
+            "patch_size": 16,
+            "num_attention_heads": 12,
+            "num_hidden_layers": 12,
+        },
+    )
+
+    # save config
+    if output_dir:
+        config.save_pretrained(output_dir)
+        print("Model config saved successfully...")
+
+    # ------------------------------------------------------------
+    # Convert processor
+    # ------------------------------------------------------------
+
+    image_processor = DeepseekVLHybridImageProcessor(
+        image_mean=IMAGENET_STANDARD_MEAN,
+        image_std=IMAGENET_STANDARD_STD,
+        high_res_image_mean=OPENAI_CLIP_MEAN,
+        high_res_image_std=OPENAI_CLIP_STD,
+        resample=PILImageResampling.BILINEAR,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        input_path,
+        extra_special_tokens={
+            # Written with the same escapes as the end-of-sentence separator in `CHAT_TEMPLATE`
+            # (fullwidth bars U+FF5C and U+2581 word separators), so the two cannot drift apart.
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            # Must be the non-empty placeholder that the chat template emits for image entries.
+            "image_token": "<image_placeholder>",
+        },
+    )
+
+    processor = DeepseekVLHybridProcessor(
+        image_processor=image_processor,
+        tokenizer=tokenizer,
+        chat_template=CHAT_TEMPLATE,
+    )
+
+    if output_dir:
+        print(f"Saving processor to {output_dir}...")
+        processor.save_pretrained(output_dir)
+    if output_hub_path:
+        print(f"Pushing processor to hub at {output_hub_path}...")
+        processor.push_to_hub(output_hub_path)
+
+    # ------------------------------------------------------------
+    # Convert weights
+    # ------------------------------------------------------------
+
+    # The model is created without allocating weights; they are assigned from the state dict below.
+    print("Creating empty model...")
+    with init_empty_weights():
+        model = DeepseekVLHybridForConditionalGeneration(config)
+
+    # Load and convert state dict
+    print("Loading state dict...")
+    state_dict = load_model_state_dict(input_path)
+    state_dict = update_state_dict(state_dict)
+
+    # Load converted state dict
+    # `strict=False` tolerates unexpected keys; missing keys are still treated as a hard error.
+    print("Loading converted weights into model...")
+    info = model.load_state_dict(state_dict, strict=False, assign=True)
+    if len(info.missing_keys) > 0:
+        raise ValueError(f"Missing keys: {info.missing_keys}")
+
+    # Tie weights before any device mapping
+    print("Tying weights...")
+    model.tie_weights()
+
+    # Save the model
+    if output_dir:
+        print(f"Saving model to {output_dir}...")
+        model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+    if output_hub_path:
+        print(f"Pushing model to hub at {output_hub_path}...")
+        model.push_to_hub(output_hub_path, safe_serialization=safe_serialization)
+
+    # Free the converted weights before the validation reload below.
+    del state_dict, model
+    gc.collect()
+
+    # Validate the saved model if saved locally
+    if output_dir:
+        print("Reloading the local model to check if it's saved correctly...")
+        DeepseekVLHybridForConditionalGeneration.from_pretrained(output_dir, device_map="auto")
+        print("Local model reloaded successfully.")
+
+
+def _str_to_bool(value):
+    """
+    Parse a command line boolean such as "true"/"false", "yes"/"no" or "1"/"0" (case-insensitive).
+
+    `argparse` with `type=bool` calls `bool()` on the raw string, so every non-empty value, including
+    "False", becomes `True`. This parser gives the flag the meaning users expect.
+
+    Raises:
+        argparse.ArgumentTypeError: if the value is not a recognized boolean spelling.
+    """
+    if isinstance(value, bool):
+        return value
+
+    normalized = value.strip().lower()
+    if normalized in ("true", "t", "yes", "y", "1"):
+        return True
+    # The empty string is kept as false: it was the only falsy spelling under the previous `type=bool`.
+    if normalized in ("false", "f", "no", "n", "0", ""):
+        return False
+    raise argparse.ArgumentTypeError(f"Expected a boolean value (e.g. 'true' or 'false'), got {value!r}.")
+
+
+def main():
+    """Parse the command line arguments and run `convert_model` with them."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--hf_repo_id",
+        default="deepseek-ai/deepseek-vl-7b-chat",
+        help="Location of official weights from DeepseekAI on HF",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        help="Location to write the converted model and processor",
+    )
+    parser.add_argument(
+        "--output_hub_path",
+        default=None,
+        help="Repository ID to push model to hub (e.g. 'username/model-name')",
+    )
+    parser.add_argument(
+        "--safe_serialization",
+        default=True,
+        type=_str_to_bool,
+        help="Whether or not to save using `safetensors`.",
+    )
+    args = parser.parse_args()
+
+    convert_model(
+        hf_repo_id=args.hf_repo_id,
+        output_dir=args.output_dir,
+        output_hub_path=args.output_hub_path,
+        safe_serialization=args.safe_serialization,
+    )
+
+
+# Run the conversion only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+ main()
diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..d42cfbe38b
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py
@@ -0,0 +1,483 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor
+from ...image_processing_utils_fast import BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+ OPENAI_CLIP_MEAN,
+ OPENAI_CLIP_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ make_list_of_images,
+ to_numpy_array,
+ valid_images,
+ validate_preprocess_arguments,
+)
+from ...utils import (
+ TensorType,
+ filter_out_non_signature_kwargs,
+ is_vision_available,
+ logging,
+)
+
+
+if is_vision_available():
+ import PIL
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLHybridImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a DEEPSEEK_VL_HYBRID image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+ high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
+ Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
+ method.
+ min_size (`int`, *optional*, defaults to 14):
+ The minimum allowed size for the resized image. Ensures that neither the height nor width
+ falls below this value after resizing.
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `resample` parameter in the `preprocess` method.
+ high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `high_res_resample` parameter in the `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
+ high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """
+
+ model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        high_res_size: Optional[dict[str, int]] = None,
+        min_size: int = 14,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        high_res_resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        high_res_image_mean: Optional[Union[float, list[float]]] = None,
+        high_res_image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        **kwargs,
+    ) -> None:
+        # Stores the preprocessing defaults on the instance; the arguments are described in the class docstring.
+        super().__init__(**kwargs)
+
+        # High resolution branch: 1024x1024 and the OPENAI_CLIP statistics unless given explicitly.
+        high_res_size = high_res_size if high_res_size is not None else {"height": 1024, "width": 1024}
+        high_res_size = get_size_dict(high_res_size, default_to_square=True)
+
+        self.high_res_size = high_res_size
+        self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else OPENAI_CLIP_MEAN
+        self.high_res_image_std = high_res_image_std if high_res_image_std is not None else OPENAI_CLIP_STD
+
+        self.resample = resample
+        self.high_res_resample = high_res_resample
+
+        # Low resolution branch: 384x384 and the OPENAI_CLIP statistics unless given explicitly.
+        size = size if size is not None else {"height": 384, "width": 384}
+        size = get_size_dict(size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+
+        self.min_size = min_size
+
+        # Colour used by `resize` when padding to a square. It depends on `high_res_image_mean` only:
+        # a colour derived from `image_mean` was always overwritten here, so it is no longer computed
+        # (which also stops a scalar `image_mean` from raising for a value that was never used).
+        if high_res_image_mean is None:
+            self.background_color = (127, 127, 127)
+        else:
+            self.background_color = tuple(int(x * 255) for x in high_res_image_mean)
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Union[dict[str, int], int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to dynamically calculated size.
+
+        The longest side is scaled to the target size and the other side follows the aspect ratio (neither
+        side is allowed to fall below `self.min_size`). The result is then passed to `self.pad_to_square`
+        with `self.background_color`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]` or `int`):
+                Target size, normalized with `get_size_dict(size, default_to_square=True)`. The resulting
+                height and width must be equal.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `None`: will be inferred from input
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+
+        Raises:
+            ValueError: If the requested height and width differ.
+        """
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+        max_size = max(height, width)
+
+        size = get_size_dict(size, default_to_square=True)
+        if size["height"] != size["width"]:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size["height"]
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = [
+            max(int(height * delta), self.min_size),
+            max(int(width * delta), self.min_size),
+        ]
+
+        image = resize(
+            image,
+            size=output_size_nonpadded,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        # The resized image is laid out in `data_format` when one was requested, so that (and not the
+        # original input format) is the layout the padding step has to be told about.
+        resized_data_format = data_format if data_format is not None else input_data_format
+        # Expand and pad the images to obtain a square image of dimensions `size x size`
+        image = self.pad_to_square(
+            image=image,
+            background_color=self.background_color,
+            input_data_format=resized_data_format,
+        )
+        return image
+
+ @filter_out_non_signature_kwargs()
+ def preprocess(
+ self,
+ images: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ high_res_size: Optional[dict[str, int]] = None,
+ resample: PILImageResampling = None,
+ high_res_resample: PILImageResampling = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ high_res_image_mean: Optional[Union[float, list[float]]] = None,
+ high_res_image_std: Optional[Union[float, list[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ ) -> PIL.Image.Image:
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ images (`ImageInput`):
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+ resizing.
+ high_res_size (`Dict[str, int]`, *optional*, defaults to `self.high_res_size`):
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the high resolution output image after
+ resizing.
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+ an effect if `do_resize` is set to `True`.
+ high_res_resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has
+ an effect if `do_resize` is set to `True`.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean to use if `do_normalize` is set to `True`.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation to use if `do_normalize` is set to `True`.
+ high_res_image_mean (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_mean`):
+ Image mean to use if `do_normalize` is set to `True`.
+ high_res_image_std (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_std`):
+ Image standard deviation to use if `do_normalize` is set to `True`.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+ Whether to convert the image to RGB.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ resample = resample if resample is not None else self.resample
+ high_res_resample = high_res_resample if high_res_resample is not None else self.high_res_resample
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else self.high_res_image_mean
+ high_res_image_std = high_res_image_std if high_res_image_std is not None else self.high_res_image_std
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+ size = size if size is not None else self.size
+ size_dict = get_size_dict(size)
+ high_res_size = high_res_size if high_res_size is not None else self.high_res_size
+ high_res_size_dict = get_size_dict(high_res_size)
+
+ images = make_list_of_images(images)
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ if do_convert_rgb:
+ images = [convert_to_rgb(image) for image in images]
+
+ # All transformations expect numpy arrays.
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ all_images = []
+ all_high_res_images = []
+ for image in images:
+ # high_res_image: resize (high) -> rescale -> normalize (high)
+ # low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low)
+ high_res_image = image
+
+ if do_resize:
+ high_res_image = self.resize(
+ image=high_res_image,
+ size=high_res_size_dict,
+ resample=high_res_resample,
+ input_data_format=input_data_format,
+ )
+ image = self.resize(
+ image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format
+ )
+
+ if do_rescale:
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+ high_res_image = self.rescale(
+ image=high_res_image, scale=rescale_factor, input_data_format=input_data_format
+ )
+
+ if do_normalize:
+ image = self.normalize(
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+ )
+ high_res_image = self.normalize(
+ image=high_res_image,
+ mean=high_res_image_mean,
+ std=high_res_image_std,
+ input_data_format=input_data_format,
+ )
+
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ high_res_image = to_channel_dimension_format(
+ high_res_image, data_format, input_channel_dim=input_data_format
+ )
+
+ all_images.append(image)
+ all_high_res_images.append(high_res_image)
+
+ data = {"pixel_values": all_images, "high_res_pixel_values": all_high_res_images}
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def pad_to_square(
+        self,
+        image: np.ndarray,
+        background_color: Union[int, tuple[int, int, int]] = 0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pads an image to a square based on the longest edge, keeping the original content centered.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers for multi-channel images. If passed as an integer
+                in multi-channel mode, it is used for the first channel and the remaining channels are padded
+                with `0`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, it is inferred from `image`.
+
+        Returns:
+            `np.ndarray`: The padded image.
+
+        Raises:
+            ValueError: If `background_color` is a sequence whose length differs from the number of channels.
+        """
+        if input_data_format is None:
+            # Resolve the layout once so that the channel axis and the spatial axes are read consistently.
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+        channels_first = input_data_format == ChannelDimension.FIRST
+        num_channels = image.shape[0] if channels_first else image.shape[-1]
+
+        if height == width:
+            padded = image
+        else:
+            max_dim = max(height, width)
+
+            # Ensure background_color is the correct shape
+            if isinstance(background_color, int):
+                background_color = [background_color]
+            elif len(background_color) != num_channels:
+                raise ValueError(
+                    f"background_color must have exactly {num_channels} elements to match the number of channels"
+                )
+
+            shape = (num_channels, max_dim, max_dim) if channels_first else (max_dim, max_dim, num_channels)
+            padded = np.zeros(shape, dtype=image.dtype)
+            for channel, color in enumerate(background_color):
+                if channels_first:
+                    padded[channel, :, :] = color
+                else:
+                    padded[:, :, channel] = color
+
+            # The longer edge already spans `max_dim`, so its offset is 0; only the shorter edge is centered.
+            top = (max_dim - height) // 2
+            left = (max_dim - width) // 2
+            if channels_first:
+                padded[:, top : top + height, left : left + width] = image
+            else:
+                padded[top : top + height, left : left + width, :] = image
+
+        # Apply the requested output layout for square and non-square inputs alike.
+        if data_format is not None:
+            padded = to_channel_dimension_format(padded, data_format, input_data_format)
+        return padded
+
+    def postprocess(self):
+        """Post-processing is not available for DeepseekVLHybrid; calling this always raises `AttributeError`."""
+        message = "Not needed for DeepseekVLHybrid"
+        raise AttributeError(message)
+
+
+# Names exported by `from <module> import *`.
+__all__ = ["DeepseekVLHybridImageProcessor"]
diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..67b67371f9
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
@@ -0,0 +1,491 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+ TransformersKwargs,
+ auto_docstring,
+ can_return_tuple,
+)
+from ..auto import AutoModel
+from .configuration_deepseek_vl_hybrid import DeepseekVLHybridConfig
+
+
+@dataclass
+@auto_docstring(
+ custom_intro="""
+ Base class for DeepseekVLHybrid model's outputs that may also contain a past key/values (to speed up sequential decoding).
+ """
+)
+class DeepseekVLHybridBaseModelOutputWithPast(ModelOutput):
+ r"""
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+
+ If `past_key_values` is used, only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
+ `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
+ encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+ input) to speed up sequential decoding.
+ image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+ Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
+ sequence_length, hidden_size)`.
+
+ Image embeddings computed from the vision encoder outputs; `None` when no image input was given.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+ image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+ custom_intro="""
+ Base class for DeepseekVLHybrid causal language model (or autoregressive) outputs.
+ """
+)
+class DeepseekVLHybridCausalLMOutputWithPast(ModelOutput):
+ r"""
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss (for next-token prediction).
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+ Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
+ sequence_length, hidden_size)`.
+
+ Image embeddings computed from the vision encoder outputs; `None` when no image input was given.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[list[torch.FloatTensor]] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+ image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class DeepseekVLHybridLayerNorm(nn.Module):
+    r"""Layer normalization over the channel axis for `channels_last` (default) or `channels_first` inputs.
+
+    `channels_last` inputs have shape (batch_size, height, width, channels); `channels_first` inputs have shape
+    (batch_size, channels, height, width).
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        if data_format not in ("channels_last", "channels_first"):
+            raise NotImplementedError(f"Unsupported data format: {data_format}")
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        self.normalized_shape = (normalized_shape,)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.data_format == "channels_last":
+            return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+
+        if self.data_format == "channels_first":
+            # Statistics are computed in float32 over dim 1, then cast back before the affine transform.
+            original_dtype = x.dtype
+            x = x.float()
+            mean = x.mean(1, keepdim=True)
+            variance = (x - mean).pow(2).mean(1, keepdim=True)
+            normalized = ((x - mean) / torch.sqrt(variance + self.eps)).to(dtype=original_dtype)
+            return self.weight[:, None, None] * normalized + self.bias[:, None, None]
+
+        return x
+
+
+class DeepseekVLSamVisionNeck(nn.Module):
+    """Two bias-free convolutions (1x1, then 3x3), each followed by a channels-first layer norm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        channels = config.output_channels
+        self.conv1 = nn.Conv2d(config.hidden_size, channels, kernel_size=1, bias=False)
+        self.layer_norm1 = DeepseekVLHybridLayerNorm(channels, data_format="channels_first")
+        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
+        self.layer_norm2 = DeepseekVLHybridLayerNorm(channels, data_format="channels_first")
+
+    def forward(self, hidden_states):
+        # Move the last axis to position 1 so the convolutions see it as the channel axis.
+        features = hidden_states.permute(0, 3, 1, 2)
+        features = self.layer_norm1(self.conv1(features))
+        return self.layer_norm2(self.conv2(features))
+
+
+class DeepseekVLSamVisionProj(nn.Module):
+    """Resizes a feature map to `4 * output_size` per side, then applies two stride-2 3x3 convolutions."""
+
+    def __init__(self, config, output_size: int = 24):
+        super().__init__()
+        self.config = config
+        self.output_size = output_size
+
+        base_channels = config.output_channels
+        conv_kwargs = {"kernel_size": 3, "stride": 2, "padding": 1, "bias": False}
+        # Each convolution doubles the channel count.
+        self.conv1 = nn.Conv2d(base_channels, base_channels * 2, **conv_kwargs)
+        self.conv2 = nn.Conv2d(base_channels * 2, base_channels * 4, **conv_kwargs)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        # interpolate Sam encodings to match Siglip encodings
+        side = 4 * self.output_size
+        resized = torch.nn.functional.interpolate(
+            features, size=(side, side), mode="bilinear", align_corners=False
+        )
+        return self.conv2(self.conv1(resized))
+
+
+class DeepseekVLHybridAligner(nn.Module):
+    """Projects both vision encodings to half the text hidden size each and fuses them with a GELU + linear layer."""
+
+    def __init__(self, config: DeepseekVLHybridConfig):
+        super().__init__()
+
+        text_hidden_size = config.text_config.hidden_size
+        half_hidden_size = text_hidden_size // 2
+
+        self.vision_proj = nn.Linear(config.vision_config.hidden_size, half_hidden_size)
+        self.high_res_vision_proj = nn.Linear(config.high_res_vision_config.output_channels * 4, half_hidden_size)
+
+        self.act = nn.GELU()
+        self.proj = nn.Linear(text_hidden_size, text_hidden_size)
+
+    def forward(
+        self,
+        vision_encodings: torch.Tensor,
+        high_res_vision_encodings: torch.Tensor,
+    ) -> torch.Tensor:
+        low_res = self.vision_proj(vision_encodings)
+        high_res = self.high_res_vision_proj(high_res_vision_encodings)
+
+        # High-resolution features come first along the last dimension.
+        fused = torch.concat([high_res, low_res], dim=-1)
+        return self.proj(self.act(fused))
+
+
+@auto_docstring
+class DeepseekVLHybridPreTrainedModel(PreTrainedModel):
+    config: DeepseekVLHybridConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _supports_static_cache = True
+    _supports_param_buffer_assignment = False
+
+    def _init_weights(self, module):
+        """Initialize the parameters of `module` according to its type."""
+        if isinstance(module, nn.Linear):
+            # Linear layers use the text backbone's initializer range.
+            std = self.config.text_config.initializer_range
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Conv2d):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, DeepseekVLHybridLayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+            return
+
+        if isinstance(module, DeepseekVLHybridModel):
+            module.high_res_vision_alpha.data.zero_()
+
+
+# Extra argument documentation handed to `auto_docstring(custom_args=...)` on the `forward` methods.
+DEEPSEEK_VL_COMMON_CUSTOM_ARGS = r"""
+    high_res_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
+        The tensors corresponding to the input images. Pixel values can be obtained using
+        [`AutoImageProcessor`].
+"""
+
+
+@auto_docstring
+class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # Number of patches per side of the low-resolution vision input; sizes the high-resolution projector.
+        self.output_size = config.vision_config.image_size // config.vision_config.patch_size
+        # Only the first configured global-attention layer of the high-resolution encoder is tapped.
+        self.global_attn_index = config.high_res_vision_config.global_attn_indexes[0]
+
+        self.high_res_vision_model = AutoModel.from_config(config.high_res_vision_config)
+        self.high_res_vision_neck = DeepseekVLSamVisionNeck(config.high_res_vision_config)
+        self.high_res_vision_proj = DeepseekVLSamVisionProj(
+            config.high_res_vision_config, output_size=self.output_size
+        )
+        # Scale of the intermediate-layer branch in `get_high_res_image_features`; starts at zero.
+        self.high_res_vision_alpha = nn.Parameter(torch.zeros(1))
+        self.config = config
+
+        self.vision_model = AutoModel.from_config(config.vision_config)
+        self.aligner = DeepseekVLHybridAligner(config)
+
+        self.language_model = AutoModel.from_config(config=config.text_config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the language model."""
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the language model."""
+        self.language_model.set_input_embeddings(value)
+
+    def get_image_features(self, pixel_values, high_res_pixel_values):
+        """Encode both image inputs and fuse them with the aligner."""
+        vision_encodings = self.get_low_res_image_features(pixel_values)
+        high_res_vision_encodings = self.get_high_res_image_features(high_res_pixel_values)
+        images_embeds = self.aligner(vision_encodings, high_res_vision_encodings)
+        return images_embeds
+
+    @can_return_tuple
+    @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        high_res_pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ):
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        # The two image inputs are only meaningful together, so reject either one given without the other
+        # instead of silently ignoring a lone `high_res_pixel_values`.
+        if (pixel_values is None) != (high_res_pixel_values is None):
+            raise ValueError("Both pixel_values and high_res_pixel_values should be specified at the same time")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        image_embeds = None
+        if pixel_values is not None:
+            if input_ids is None:
+                # Without token ids, image positions are the rows equal to the image token's embedding.
+                image_attention_mask = inputs_embeds == self.get_input_embeddings()(
+                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+                image_attention_mask = image_attention_mask.all(-1)
+            else:
+                image_attention_mask = input_ids == self.config.image_token_id
+
+            image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+            image_embeds = self.get_image_features(pixel_values, high_res_pixel_values)
+            image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            # Overwrite the image-token positions with the image features.
+            inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
+
+        lm_output = self.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        return DeepseekVLHybridBaseModelOutputWithPast(
+            last_hidden_state=lm_output.last_hidden_state,
+            past_key_values=lm_output.past_key_values,
+            hidden_states=lm_output.hidden_states,
+            attentions=lm_output.attentions,
+            image_hidden_states=image_embeds,
+        )
+
+    def get_low_res_image_features(self, pixel_values):
+        """Return the first output of the low-resolution vision model for `pixel_values`."""
+        output = self.vision_model(pixel_values)
+        output = output[0]
+        return output
+
+    def get_high_res_image_features(self, pixel_values):
+        """Encode `pixel_values` with the high-resolution vision model and flatten the result to a sequence.
+
+        The projected final output is summed with a projected intermediate hidden state that is scaled by
+        `high_res_vision_alpha`.
+        """
+        output = self.high_res_vision_model(
+            pixel_values=pixel_values,
+            output_hidden_states=True,
+            return_dict=True,
+        )
+        last_hidden_state = output.last_hidden_state
+        last_hidden_state = self.high_res_vision_proj(last_hidden_state)
+
+        hidden_states = output.hidden_states
+        global_hidden_state = hidden_states[self.global_attn_index + 1]  # +1 for embedding layer
+        global_hidden_state = self.high_res_vision_neck(global_hidden_state)
+        global_hidden_state = self.high_res_vision_proj(global_hidden_state)
+
+        output = last_hidden_state + global_hidden_state * self.high_res_vision_alpha
+
+        # batch_size, hidden_size, height, width -> batch_size, seq_len, hidden_size
+        output = output.permute(0, 2, 3, 1)
+        output = output.reshape(output.shape[0], -1, output.shape[-1])
+
+        return output
+
+
+class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
+    _supports_static_cache = True
+
+    def __init__(self, config: DeepseekVLHybridConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = DeepseekVLHybridModel(config)
+        text_config = config.text_config
+        self.lm_head = nn.Linear(text_config.hidden_size, text_config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped language model."""
+        return self.model.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped language model."""
+        self.model.language_model.set_input_embeddings(value)
+
+    def prepare_embeddings_for_image_generation(self) -> torch.Tensor:
+        """Not available for DeepseekVLHybrid; always raises `AttributeError`."""
+        raise AttributeError("Not needed for DeepseekVLHybrid")
+
+    def set_decoder(self, decoder):
+        """Replace the wrapped base model."""
+        self.model = decoder
+
+    def get_decoder(self):
+        """Return the wrapped base model."""
+        return self.model
+
+    @can_return_tuple
+    @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        high_res_pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        model_outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            high_res_pixel_values=high_res_pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        if isinstance(logits_to_keep, int):
+            kept_positions = slice(-logits_to_keep, None)
+        else:
+            kept_positions = logits_to_keep
+        logits = self.lm_head(model_outputs.last_hidden_state[:, kept_positions, :])
+
+        if labels is None:
+            loss = None
+        else:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return DeepseekVLHybridCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=model_outputs.past_key_values,
+            hidden_states=model_outputs.hidden_states,
+            attentions=model_outputs.attentions,
+            image_hidden_states=model_outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        high_res_pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        """Build the model inputs for one generation step, attaching the image tensors only on the first step."""
+        generation_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model
+        if cache_position[0] == 0:
+            generation_inputs.update(pixel_values=pixel_values, high_res_pixel_values=high_res_pixel_values)
+
+        return generation_inputs
+
+
+# Names exported by `from <module> import *`.
+__all__ = ["DeepseekVLHybridPreTrainedModel", "DeepseekVLHybridModel", "DeepseekVLHybridForConditionalGeneration"]
diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..aa0a4f87ba
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -0,0 +1,777 @@
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...image_processing_utils_fast import (
+ BatchFeature,
+ get_size_dict,
+)
+from ...image_transforms import convert_to_rgb, to_channel_dimension_format
+from ...image_utils import (
+ OPENAI_CLIP_MEAN,
+ OPENAI_CLIP_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ make_flat_list_of_images,
+ make_list_of_images,
+ to_numpy_array,
+ valid_images,
+ validate_preprocess_arguments,
+)
+from ...processing_utils import Unpack
+from ...tokenization_utils_base import (
+ PreTokenizedInput,
+ TextInput,
+)
+from ...utils import (
+ TensorType,
+ TransformersKwargs,
+ auto_docstring,
+ can_return_tuple,
+ filter_out_non_signature_kwargs,
+ logging,
+)
+from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
+from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig
+from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor
+from ..deepseek_vl.modeling_deepseek_vl import (
+ DeepseekVLForConditionalGeneration,
+ DeepseekVLModel,
+ DeepseekVLPreTrainedModel,
+)
+from ..deepseek_vl.processing_deepseek_vl import DeepseekVLProcessor, DeepseekVLProcessorKwargs
+from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
+from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck
+
+
+logger = logging.get_logger(__name__)
+
+
+# Extra argument documentation intended for `auto_docstring(custom_args=...)` on the `forward` methods.
+DEEPSEEK_VL_COMMON_CUSTOM_ARGS = r"""
+    high_res_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
+        The tensors corresponding to the input images. Pixel values can be obtained using
+        [`AutoImageProcessor`].
+"""
+
+
+class DeepseekVLHybridConfig(DeepseekVLConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekVLHybridModel`]. It is used to instantiate a
+    DeepseekVLHybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DeepseekVLHybrid
+    [deepseek-community/deepseek-vl-7b-chat](https://huggingface.co/deepseek-community/deepseek-vl-7b-chat) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`):
+            The config object or dictionary of the high resolution vision backbone.
+        image_token_id (`int`, *optional*, defaults to 100015):
+            The index representing image tokens in the model's token vocabulary.
+
+    Example:
+
+    ```python
+    >>> from transformers import DeepseekVLHybridConfig, DeepseekVLHybridModel
+
+    >>> # Initializing a DeepseekVLHybrid deepseek-community/deepseek-vl-7b-chat style configuration
+    >>> configuration = DeepseekVLHybridConfig()
+
+    >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-7b-chat style configuration
+    >>> model = DeepseekVLHybridModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_vl_hybrid"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        text_config: AutoConfig = None,
+        vision_config: AutoConfig = None,
+        high_res_vision_config: AutoConfig = None,
+        image_token_id: int = 100015,
+        **kwargs,
+    ):
+        super().__init__(
+            text_config=text_config,
+            vision_config=vision_config,
+            image_token_id=image_token_id,
+            **kwargs,
+        )
+
+        if high_res_vision_config is None:
+            high_res_vision_config = {}
+            logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.")
+
+        if isinstance(high_res_vision_config, dict):
+            # Work on a copy so that filling in the default `model_type` does not modify the caller's dictionary.
+            high_res_vision_config = dict(high_res_vision_config)
+            high_res_vision_config.setdefault("model_type", "sam_vision_model")
+            high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config)
+
+        self.high_res_vision_config = high_res_vision_config
+
+
+class DeepseekVLHybridBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast):
+ # No overrides: the Idefics output container is reused unchanged under the DeepseekVLHybrid name.
+ pass
+
+
+class DeepseekVLHybridCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast):
+ # No overrides: the Idefics causal-LM output container is reused unchanged under the DeepseekVLHybrid name.
+ pass
+
+
+class DeepseekVLHybridLayerNorm(SamLayerNorm):
+ # No overrides: the SAM layer norm is reused unchanged under the DeepseekVLHybrid name.
+ pass
+
+
+class DeepseekVLSamVisionNeck(SamVisionNeck):
+ # Reuses the SAM vision neck; the constructor only forwards `config` to the parent class.
+ def __init__(self, config):
+ super().__init__(config)
+
+
+class DeepseekVLSamVisionProj(nn.Module):
+    """Resizes a feature map to `4 * output_size` per side, then applies two stride-2 3x3 convolutions."""
+
+    def __init__(self, config, output_size: int = 24):
+        super().__init__()
+        self.config = config
+        self.output_size = output_size
+
+        base_channels = config.output_channels
+        conv_kwargs = {"kernel_size": 3, "stride": 2, "padding": 1, "bias": False}
+        # Each convolution doubles the channel count.
+        self.conv1 = nn.Conv2d(base_channels, base_channels * 2, **conv_kwargs)
+        self.conv2 = nn.Conv2d(base_channels * 2, base_channels * 4, **conv_kwargs)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        # interpolate Sam encodings to match Siglip encodings
+        side = 4 * self.output_size
+        resized = torch.nn.functional.interpolate(
+            features, size=(side, side), mode="bilinear", align_corners=False
+        )
+        return self.conv2(self.conv1(resized))
+
+
+class DeepseekVLHybridAligner(nn.Module):
+    """Projects both vision encodings to half the text hidden size each and fuses them with a GELU + linear layer."""
+
+    def __init__(self, config: DeepseekVLHybridConfig):
+        super().__init__()
+
+        text_hidden_size = config.text_config.hidden_size
+        half_hidden_size = text_hidden_size // 2
+
+        self.vision_proj = nn.Linear(config.vision_config.hidden_size, half_hidden_size)
+        self.high_res_vision_proj = nn.Linear(config.high_res_vision_config.output_channels * 4, half_hidden_size)
+
+        self.act = nn.GELU()
+        self.proj = nn.Linear(text_hidden_size, text_hidden_size)
+
+    def forward(
+        self,
+        vision_encodings: torch.Tensor,
+        high_res_vision_encodings: torch.Tensor,
+    ) -> torch.Tensor:
+        low_res = self.vision_proj(vision_encodings)
+        high_res = self.high_res_vision_proj(high_res_vision_encodings)
+
+        # High-resolution features come first along the last dimension.
+        fused = torch.concat([high_res, low_res], dim=-1)
+        return self.proj(self.act(fused))
+
+
+class DeepseekVLHybridPreTrainedModel(DeepseekVLPreTrainedModel):
+    def _init_weights(self, module):
+        """Initialize the weights of `module` in place, dispatching on its type."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            if isinstance(module, nn.Linear):
+                module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range)
+            else:
+                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            # Linear and Conv2d share the same bias reset.
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, DeepseekVLHybridLayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, DeepseekVLHybridModel):
+            module.high_res_vision_alpha.data.zero_()
+
+
+class DeepseekVLHybridModel(DeepseekVLModel):
+    def __init__(self, config):
+        # NOTE(review): submodules are assigned before `super().__init__(config)`; presumably this only works
+        # once the modular converter inlines the parent constructor -- confirm before reordering anything.
+        self.output_size = config.vision_config.image_size // config.vision_config.patch_size
+        self.global_attn_index = config.high_res_vision_config.global_attn_indexes[0]
+
+        self.high_res_vision_model = AutoModel.from_config(config.high_res_vision_config)
+        self.high_res_vision_neck = DeepseekVLSamVisionNeck(config.high_res_vision_config)
+        self.high_res_vision_proj = DeepseekVLSamVisionProj(
+            config.high_res_vision_config, output_size=self.output_size
+        )
+        self.high_res_vision_alpha = nn.Parameter(torch.zeros(1))
+
+        super().__init__(config)
+
+    def get_low_res_image_features(self, pixel_values):
+        """Return the first output of `self.vision_model` for `pixel_values`."""
+        return self.vision_model(pixel_values)[0]
+
+    def get_high_res_image_features(self, pixel_values):
+        """Encode `pixel_values` with the high-res vision model and flatten the result into a sequence."""
+        output = self.high_res_vision_model(
+            pixel_values=pixel_values,
+            output_hidden_states=True,
+            return_dict=True,
+        )
+        last_hidden_state = self.high_res_vision_proj(output.last_hidden_state)
+        # Second branch: the hidden state picked by `global_attn_indexes[0]`, sent through neck and projection.
+        global_hidden_state = output.hidden_states[self.global_attn_index + 1]  # +1 for embedding layer
+        global_hidden_state = self.high_res_vision_neck(global_hidden_state)
+        global_hidden_state = self.high_res_vision_proj(global_hidden_state)
+
+        # `high_res_vision_alpha` is a learnable scale applied to the second branch.
+        output = last_hidden_state + global_hidden_state * self.high_res_vision_alpha
+
+        # batch_size, hidden_size, height, width -> batch_size, seq_len, hidden_size
+        output = output.permute(0, 2, 3, 1)
+        return output.reshape(output.shape[0], -1, output.shape[-1])
+
+    def get_image_features(self, pixel_values, high_res_pixel_values):
+        """Encode both image resolutions and merge them with `self.aligner`."""
+        vision_encodings = self.get_low_res_image_features(pixel_values)
+        high_res_vision_encodings = self.get_high_res_image_features(high_res_pixel_values)
+        return self.aligner(vision_encodings, high_res_vision_encodings)
+
+    @can_return_tuple
+    @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        high_res_pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ):
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        # The two image inputs are only usable as a pair, so reject either one being passed without the other.
+        if (pixel_values is None) != (high_res_pixel_values is None):
+            raise ValueError("Both pixel_values and high_res_pixel_values should be specified at the same time")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            if input_ids is None:
+                image_attention_mask = inputs_embeds == self.get_input_embeddings()(
+                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+                image_attention_mask = image_attention_mask.all(-1)
+            else:
+                image_attention_mask = input_ids == self.config.image_token_id
+
+            image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+            image_embeds = self.get_image_features(pixel_values, high_res_pixel_values)
+            image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
+
+        lm_output = self.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        return DeepseekVLHybridBaseModelOutputWithPast(
+            last_hidden_state=lm_output.last_hidden_state,
+            past_key_values=lm_output.past_key_values,
+            hidden_states=lm_output.hidden_states,
+            attentions=lm_output.attentions,
+            image_hidden_states=image_embeds if pixel_values is not None else None,
+        )
+
+
+class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration):
+    @can_return_tuple
+    @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        high_res_pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        model_outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            high_res_pixel_values=high_res_pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        kept_positions = logits_to_keep if not isinstance(logits_to_keep, int) else slice(-logits_to_keep, None)
+        logits = self.lm_head(model_outputs.last_hidden_state[:, kept_positions, :])
+
+        if labels is None:
+            loss = None
+        else:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return DeepseekVLHybridCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=model_outputs.past_key_values,
+            hidden_states=model_outputs.hidden_states,
+            attentions=model_outputs.attentions,
+            image_hidden_states=model_outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        high_res_pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        """Build the generation inputs via the parent class, adding both image tensors only when `cache_position[0] == 0`."""
+        prepared = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            prepared.update(pixel_values=pixel_values, high_res_pixel_values=high_res_pixel_values)
+
+        return prepared
+
+
+class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
+ r"""
+ Constructs a DEEPSEEK_VL_HYBRID image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+ high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
+ Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
+ method.
+ min_size (`int`, *optional*, defaults to 14):
+ The minimum allowed size for the resized image. Ensures that neither the height nor width
+ falls below this value after resizing.
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `resample` parameter in the `preprocess` method.
+ high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+ Resampling filter to use if resizing the high resolution image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `high_res_resample` parameter in the `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the
+ `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the
+ `preprocess` method.
+ high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
+ high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ high_res_size: Optional[dict[str, int]] = None,
+ min_size: int = 14,
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ high_res_resample: PILImageResampling = PILImageResampling.BICUBIC,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ high_res_image_mean: Optional[Union[float, list[float]]] = None,
+ high_res_image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ **kwargs,
+ ) -> None:
+ high_res_size = high_res_size if high_res_size is not None else {"height": 1024, "width": 1024}
+ high_res_size = get_size_dict(high_res_size, default_to_square=True)
+
+ self.high_res_size = high_res_size
+ self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else OPENAI_CLIP_MEAN
+ self.high_res_image_std = high_res_image_std if high_res_image_std is not None else OPENAI_CLIP_STD
+
+ self.resample = resample
+ self.high_res_resample = high_res_resample
+
+ super().__init__(
+ do_resize=do_resize,
+ size=size,
+ min_size=min_size,
+ resample=resample,
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_convert_rgb=do_convert_rgb,
+ **kwargs,
+ )
+
+ if high_res_image_mean is None:
+ self.background_color = (127, 127, 127)  # fallback used when no `high_res_image_mean` is passed
+ else:
+ self.background_color = tuple([int(x * 255) for x in high_res_image_mean])  # each component scaled by 255, truncated to int
+
+ @filter_out_non_signature_kwargs()
+ def preprocess(
+ self,
+ images: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ high_res_size: Optional[dict[str, int]] = None,
+ resample: PILImageResampling = None,
+ high_res_resample: PILImageResampling = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ high_res_image_mean: Optional[Union[float, list[float]]] = None,
+ high_res_image_std: Optional[Union[float, list[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ ):
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ images (`ImageInput`):
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+ resizing.
+ high_res_size (`Dict[str, int]`, *optional*, defaults to `self.high_res_size`):
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the high resolution output image after
+ resizing.
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+ an effect if `do_resize` is set to `True`.
+ high_res_resample (`PILImageResampling` filter, *optional*, defaults to `self.high_res_resample`):
+ `PILImageResampling` filter to use if resizing the high resolution image e.g. `PILImageResampling.BICUBIC`. Only has
+ an effect if `do_resize` is set to `True`.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean to use if `do_normalize` is set to `True`.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation to use if `do_normalize` is set to `True`.
+ high_res_image_mean (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_mean`):
+ High resolution image mean to use if `do_normalize` is set to `True`.
+ high_res_image_std (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_std`):
+ High resolution image standard deviation to use if `do_normalize` is set to `True`.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+ Whether to convert the image to RGB.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ resample = resample if resample is not None else self.resample
+ high_res_resample = high_res_resample if high_res_resample is not None else self.high_res_resample
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else self.high_res_image_mean
+ high_res_image_std = high_res_image_std if high_res_image_std is not None else self.high_res_image_std
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+ size = size if size is not None else self.size
+ size_dict = get_size_dict(size)
+ high_res_size = high_res_size if high_res_size is not None else self.high_res_size
+ high_res_size_dict = get_size_dict(high_res_size)
+
+ images = make_list_of_images(images)
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+ validate_preprocess_arguments(  # NOTE: only the low-res settings are passed for validation
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ if do_convert_rgb:
+ images = [convert_to_rgb(image) for image in images]
+
+ # All transformations expect numpy arrays.
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ all_images = []
+ all_high_res_images = []
+ for image in images:
+ # high_res_image: resize (high) -> rescale -> normalize (high)
+ # low_res_image: resize (high) -> resize (low) -> rescale -> normalize (low)
+ high_res_image = image
+
+ if do_resize:
+ high_res_image = self.resize(
+ image=high_res_image,
+ size=high_res_size_dict,
+ resample=high_res_resample,
+ input_data_format=input_data_format,
+ )
+ image = self.resize(
+ image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format
+ )
+
+ if do_rescale:
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+ high_res_image = self.rescale(
+ image=high_res_image, scale=rescale_factor, input_data_format=input_data_format
+ )
+
+ if do_normalize:
+ image = self.normalize(
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+ )
+ high_res_image = self.normalize(
+ image=high_res_image,
+ mean=high_res_image_mean,
+ std=high_res_image_std,
+ input_data_format=input_data_format,
+ )
+
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ high_res_image = to_channel_dimension_format(
+ high_res_image, data_format, input_channel_dim=input_data_format
+ )
+
+ all_images.append(image)
+ all_high_res_images.append(high_res_image)
+
+ data = {"pixel_values": all_images, "high_res_pixel_values": all_high_res_images}
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs):
+ pass  # no overrides: the DeepseekVL processor kwargs are reused as-is under a DeepseekVLHybrid name
+
+
+class DeepseekVLHybridProcessor(DeepseekVLProcessor):
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        **kwargs: Unpack[DeepseekVLHybridProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **high_res_pixel_values** -- High resolution pixel values. Returned when `images` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            DeepseekVLHybridProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        data = {}  # stays a plain dict for images-only calls, where nothing is tokenized
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Repeat every image placeholder `num_image_tokens` times before tokenizing.
+            one_img_tokens = self.image_token * self.num_image_tokens
+            prompt_strings = [prompt.replace(self.image_token, one_img_tokens) for prompt in text]
+
+            data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+
+        # process images if pixel_values are provided
+        if images is not None:
+            images = make_flat_list_of_images(images)
+            inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            data["pixel_values"] = inputs["pixel_values"]
+            data["high_res_pixel_values"] = inputs["high_res_pixel_values"]
+
+        return BatchFeature(data=data)
+
+
+__all__ = [  # names this module exports
+ "DeepseekVLHybridConfig",
+ "DeepseekVLHybridPreTrainedModel",
+ "DeepseekVLHybridModel",
+ "DeepseekVLHybridForConditionalGeneration",
+ "DeepseekVLHybridImageProcessor",
+ "DeepseekVLHybridProcessor",
+]
diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..4fb765c797
--- /dev/null
+++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py
@@ -0,0 +1,159 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+
+from ...image_processing_utils_fast import BatchFeature
+from ...image_utils import ImageInput, make_flat_list_of_images
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False):
+ _defaults = {  # defaults applied when the caller does not override them
+ "text_kwargs": {"padding": False},  # text is not padded by default
+ "common_kwargs": {"return_tensors": "pt"},  # "pt" is the default tensor type
+ }
+
+
+class DeepseekVLHybridProcessor(ProcessorMixin):
+    r"""
+    Constructs a DeepseekVLHybrid processor which wraps a DeepseekVLHybrid Image Processor and a Llama tokenizer into a single processor.
+
+    [`DeepseekVLHybridProcessor`] offers all the functionalities of [`DeepseekVLHybridImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~DeepseekVLHybridProcessor.__call__`] and [`~DeepseekVLHybridProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`DeepseekVLHybridImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        num_image_tokens (`int`, *optional*, defaults to 576):
+            The number of special image tokens used as placeholders for visual content in text sequences.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template", "num_image_tokens"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        num_image_tokens=576,
+    ):
+        self.image_token = tokenizer.image_token
+        self.num_image_tokens = num_image_tokens
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        **kwargs: Unpack[DeepseekVLHybridProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **high_res_pixel_values** -- High resolution pixel values. Returned when `images` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            DeepseekVLHybridProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        data = {}  # stays a plain dict for images-only calls, where nothing is tokenized
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Repeat every image placeholder `num_image_tokens` times before tokenizing.
+            one_img_tokens = self.image_token * self.num_image_tokens
+            prompt_strings = [prompt.replace(self.image_token, one_img_tokens) for prompt in text]
+
+            data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+
+        # process images if pixel_values are provided
+        if images is not None:
+            images = make_flat_list_of_images(images)
+            inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            data["pixel_values"] = inputs["pixel_values"]
+            data["high_res_pixel_values"] = inputs["high_res_pixel_values"]
+
+        return BatchFeature(data=data)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+__all__ = ["DeepseekVLHybridProcessor"]  # the only name this module exports
diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py
index b93e1a8b67..ebdc2f23ea 100644
--- a/src/transformers/models/janus/modeling_janus.py
+++ b/src/transformers/models/janus/modeling_janus.py
@@ -1147,7 +1147,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
- **kwargs,
+ **kwargs: Unpack[TransformersKwargs],
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1173,7 +1173,9 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
loss = None
if labels is not None:
- loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+ loss = self.loss_function(
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+ )
return JanusCausalLMOutputWithPast(
loss=loss,
diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py
index 29accd88e5..11b0848620 100644
--- a/src/transformers/models/janus/modular_janus.py
+++ b/src/transformers/models/janus/modular_janus.py
@@ -1007,7 +1007,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
- **kwargs,
+ **kwargs: Unpack[TransformersKwargs],
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1033,7 +1033,9 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
loss = None
if labels is not None:
- loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+ loss = self.loss_function(
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+ )
return JanusCausalLMOutputWithPast(
loss=loss,
diff --git a/tests/models/deepseek_vl/__init__.py b/tests/models/deepseek_vl/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py b/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py
new file mode 100644
index 0000000000..c1092f05d3
--- /dev/null
+++ b/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py
@@ -0,0 +1,119 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+
+if is_vision_available():
+ from transformers import DeepseekVLImageProcessor
+
+
+# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL
+class DeepseekVLImageProcessingTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=7,
+ num_channels=3,
+ image_size=18,
+ min_resolution=30,
+ max_resolution=400,
+ do_resize=True,
+ size=None,
+ do_normalize=True,
+ image_mean=[0.5, 0.5, 0.5],
+ image_std=[0.5, 0.5, 0.5],
+ ):
+ size = size if size is not None else {"height": 18, "width": 18}
+ self.parent = parent
+ self.batch_size = batch_size
+ self.num_channels = num_channels
+ self.image_size = image_size
+ self.min_resolution = min_resolution
+ self.max_resolution = max_resolution
+ self.do_resize = do_resize
+ self.size = size
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean
+ self.image_std = image_std
+
+ def prepare_image_processor_dict(self):
+ return {
+ "image_mean": self.image_mean,
+ "image_std": self.image_std,
+ "do_normalize": self.do_normalize,
+ "do_resize": self.do_resize,
+ "size": self.size,
+ }
+
+ # Ignore copy
+ def expected_output_image_shape(self, images):
+ max_size = max(self.size["height"], self.size["width"])
+ return self.num_channels, max_size, max_size
+
+ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+ return prepare_image_inputs(
+ batch_size=self.batch_size,
+ num_channels=self.num_channels,
+ min_resolution=self.min_resolution,
+ max_resolution=self.max_resolution,
+ equal_resolution=equal_resolution,
+ numpify=numpify,
+ torchify=torchify,
+ )
+
+
+@require_torch
+@require_vision
+# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest with ViT->DeepseekVL
+class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+ # Ignore copy
+ image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None
+
+ def setUp(self):
+ super().setUp()
+ self.image_processor_tester = DeepseekVLImageProcessingTester(self)
+
+ @property
+ def image_processor_dict(self):
+ return self.image_processor_tester.prepare_image_processor_dict()
+
+ def test_image_processor_properties(self):
+ for image_processing_class in self.image_processor_list:
+ image_processing = image_processing_class(**self.image_processor_dict)
+ self.assertTrue(hasattr(image_processing, "image_mean"))
+ self.assertTrue(hasattr(image_processing, "image_std"))
+ self.assertTrue(hasattr(image_processing, "do_normalize"))
+ self.assertTrue(hasattr(image_processing, "do_resize"))
+ self.assertTrue(hasattr(image_processing, "size"))
+
+ def test_image_processor_from_dict_with_kwargs(self):
+ for image_processing_class in self.image_processor_list:
+ image_processor = image_processing_class.from_dict(self.image_processor_dict)
+ self.assertEqual(image_processor.size, {"height": 18, "width": 18})
+
+ image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
+ self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+
+ # Ignore copy
+ @unittest.skip(reason="Not supported")
+ def test_call_numpy_4_channels(self):
+ pass
diff --git a/tests/models/deepseek_vl/test_modeling_deepseek_vl.py b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py
new file mode 100644
index 0000000000..bff23e9dd5
--- /dev/null
+++ b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py
@@ -0,0 +1,359 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch DeepseekVL model."""
+
+import re
+import tempfile
+import unittest
+
+from transformers import (
+ AutoProcessor,
+ DeepseekVLConfig,
+ DeepseekVLForConditionalGeneration,
+ DeepseekVLModel,
+ is_torch_available,
+)
+from transformers.testing_utils import (
+ require_torch,
+ require_torch_accelerator,
+ require_torch_sdpa,
+ slow,
+ torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+
+
+if is_torch_available():
+ import torch
+
+
+class DeepseekVLModelTester:
+    """Holds the tiny configuration values and builds the random inputs used by the DeepseekVL model tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=25,
+        num_channels=3,
+        initializer_range=0.02,
+        is_training=True,
+        use_cache=False,
+        text_config={
+            "num_hidden_layers": 2,
+            "vocab_size": 99,
+            "hidden_size": 16,
+            "intermediate_size": 37,
+            "max_position_embeddings": 512,
+            "num_attention_heads": 4,
+            "pad_token_id": 1,
+        },
+        vision_config={
+            "num_hidden_layers": 1,
+            "hidden_size": 16,
+            "intermediate_size": 37,
+            "image_size": 32,
+            "patch_size": 8,
+            "hidden_act": "gelu",
+            "vision_use_head": False,
+            "num_attention_heads": 4,
+        },
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.num_channels = num_channels
+        self.initializer_range = initializer_range
+        self.is_training = is_training
+        self.use_cache = use_cache
+
+        # Store copies of the config dicts: the defaults are shared by every instantiation and `vision_config`
+        # is written to just below, so modifying the argument itself would leak `num_channels` into later
+        # testers (and into any dict a caller passed in).
+        self.text_config = dict(text_config)
+        self.vision_config = dict(vision_config)
+        self.vision_config["num_channels"] = self.num_channels
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.image_size = vision_config["image_size"]
+        self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"]
+        self.pad_token_id = text_config["pad_token_id"]
+        # The last vocabulary id is reserved as the image placeholder token.
+        self.image_token_id = self.vocab_size - 1
+
+    def get_config(self):
+        """Return a `DeepseekVLConfig` built from the stored text/vision dicts and the image token id."""
+        return DeepseekVLConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            image_token_id=self.image_token_id,
+        )
+
+    def prepare_config_and_inputs(self):
+        """Return `(config, input_ids, attention_mask, pixel_values)` with random contents."""
+        config = self.get_config()
+
+        # create text and vision inputs
+        # NOTE(review): assuming `ids_tensor` samples from [0, n), the `- 2` / `+ 1` keeps the random ids in
+        # [1, vocab_size - 2], i.e. away from the image token id (vocab_size - 1) -- confirm.
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
+        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.num_channels,
+                self.image_size,
+                self.image_size,
+            ]
+        )
+        # fill image_tokens: the first `num_image_tokens` positions of every row become image placeholders
+        input_ids[:, : self.num_image_tokens] = self.image_token_id
+
+        return config, input_ids, attention_mask, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)`, with the inputs packed as keyword arguments for the model."""
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, pixel_values = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class DeepseekVLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+ all_model_classes = (DeepseekVLModel, DeepseekVLForConditionalGeneration) if is_torch_available() else ()
+ pipeline_model_mapping = (
+ {
+ "feature-extraction": DeepseekVLModel,
+ "image-text-to-text": DeepseekVLForConditionalGeneration,
+ }
+ if is_torch_available()
+ else {}
+ )
+ _is_composite = True
+ test_pruning = False
+ test_head_masking = False
+
+ def setUp(self):
+ self.model_tester = DeepseekVLModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=DeepseekVLConfig, has_text_modality=False)
+
+ # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
+    def test_inputs_embeds(self):
+        """Every model class must run a forward pass from `inputs_embeds` once ids and pixel values are removed."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device).eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            # Take the ids out of the kwargs and drop the pixel values, leaving only the remaining inputs.
+            token_ids = inputs.pop("input_ids")
+            inputs.pop("pixel_values")
+
+            embed_layer = model.get_input_embeddings()
+            inputs["inputs_embeds"] = embed_layer(token_ids)
+
+            with torch.no_grad():
+                model(**inputs)
+
+ # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs.
+    def test_inputs_embeds_matches_input_ids(self):
+        """The first model output must be the same whether the text is given as ids or as their embeddings."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device).eval()
+
+            shared_inputs = self._prepare_for_class(inputs_dict, model_class)
+            # Both forward passes below receive the same remaining kwargs, without ids or pixel values.
+            token_ids = shared_inputs.pop("input_ids")
+            shared_inputs.pop("pixel_values")
+
+            token_embeds = model.get_input_embeddings()(token_ids)
+
+            with torch.no_grad():
+                from_ids = model(input_ids=token_ids, **shared_inputs)[0]
+                from_embeds = model(inputs_embeds=token_embeds, **shared_inputs)[0]
+            torch.testing.assert_close(from_embeds, from_ids)
+
+ @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation")
+ # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
+ def test_initialization(self):
+ pass
+
+ @require_torch_sdpa
+ # Copied from tests.models.janus.test_modeling_janus.JanusVisionText2TextModelTest.test_sdpa_can_dispatch_composite_models
+ def test_sdpa_can_dispatch_composite_models(self):
+ for model_class in self.all_model_classes:
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ model = model_class(config)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname)
+
+ # Load the model with SDPA
+ model_sdpa = model_class.from_pretrained(tmpdirname)
+ model_sdpa = model_sdpa.eval().to(torch_device)
+
+ # Load model with eager attention
+ model_eager = model_class.from_pretrained(
+ tmpdirname,
+ attn_implementation="eager",
+ )
+ model_eager = model_eager.eval().to(torch_device)
+
+ # SigLip has one shared cls attr for all models, so we assign both submodels heer
+ vision_attn = language_attn = "sdpa" if model._supports_sdpa else "eager"
+
+ if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "language_model"):
+ self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
+ self.assertTrue(model_sdpa.language_model.config._attn_implementation == language_attn)
+ self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
+ self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
+
+ self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+ self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+ for name, submodule in model_eager.named_modules():
+ class_name = submodule.__class__.__name__
+ if any(re.finditer(r"Attention(?!Pool)", class_name)):
+ self.assertTrue(submodule.config._attn_implementation == "eager")
+
+ for name, submodule in model_sdpa.named_modules():
+ class_name = submodule.__class__.__name__
+ if any(re.finditer(r"Attention(?!Pool)", class_name)):
+ self.assertTrue(submodule.config._attn_implementation == "sdpa")
+
+
+@require_torch
+@require_torch_accelerator
+@slow
+class DeepseekVLIntegrationTest(unittest.TestCase):
+    """Slow generation checks against the `deepseek-community/deepseek-vl-1.3b-chat` checkpoint."""
+
+    def setUp(self):
+        self.model_id = "deepseek-community/deepseek-vl-1.3b-chat"
+
+    def _load_model_and_processor(self):
+        """Load the checkpoint, move it to `torch_device` in eval mode, and load its processor."""
+        model = DeepseekVLForConditionalGeneration.from_pretrained(
+            self.model_id, torch_dtype="auto", device_map="auto"
+        )
+        model.to(torch_device)
+        model.eval()
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        return model, processor
+
+    def _generate_greedy(self, model, processor, conversations, **template_kwargs):
+        """
+        Tokenize `conversations` with the chat template and return the raw output of a greedy `generate` call
+        limited to 20 new tokens. Extra keyword arguments are passed on to `apply_chat_template`.
+        """
+        inputs = processor.apply_chat_template(
+            conversations,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            **template_kwargs,
+        )
+        inputs = inputs.to(model.device, dtype=model.dtype)
+        return model.generate(**inputs, max_new_tokens=20, do_sample=False)
+
+    def test_model_text_generation(self):
+        model, processor = self._load_model_and_processor()
+
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        expected_text = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard' # fmt: skip
+
+        generated = self._generate_greedy(model, processor, conversation)
+        decoded = processor.decode(generated[0], skip_special_tokens=True)
+
+        self.assertEqual(decoded, expected_text)
+
+    def test_model_text_generation_batched(self):
+        model, processor = self._load_model_and_processor()
+
+        # Two single-turn conversations about the same picture; padding is needed to batch them.
+        conversations = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                        },
+                        {"type": "text", "text": "Describe this image."},
+                    ],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                        },
+                        {"type": "text", "text": "What animal do you see in the image?"},
+                    ],
+                }
+            ],
+        ]
+        expected_texts = [
+            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard", # fmt: skip
+            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a bear in the image.What is the significance of the color red in the", # fmt: skip
+        ]
+
+        generated = self._generate_greedy(model, processor, conversations, padding=True)
+        decoded = processor.batch_decode(generated, skip_special_tokens=True)
+
+        self.assertEqual(expected_texts, decoded)
+
+    def test_model_text_generation_with_multi_image(self):
+        model, processor = self._load_model_and_processor()
+
+        # One user turn that interleaves two images with text.
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's the difference between"},
+                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+                    {"type": "text", "text": " and "},
+                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+                ],
+            }
+        ]
+        expected_text = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image is a photograph featuring two cats lying on a pink blanket. The cat on the left is" # fmt: skip
+
+        generated = self._generate_greedy(model, processor, conversation)
+        decoded = processor.decode(generated[0], skip_special_tokens=True)
+
+        self.assertEqual(decoded, expected_text)
diff --git a/tests/models/deepseek_vl/test_processor_deepseek_vl.py b/tests/models/deepseek_vl/test_processor_deepseek_vl.py
new file mode 100644
index 0000000000..3c61f377e2
--- /dev/null
+++ b/tests/models/deepseek_vl/test_processor_deepseek_vl.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import tempfile
+import unittest
+
+from transformers import DeepseekVLProcessor, LlamaTokenizer
+from transformers.models.deepseek_vl.convert_deepseek_vl_weights_to_hf import CHAT_TEMPLATE
+from transformers.testing_utils import get_tests_dir
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+ from transformers import DeepseekVLImageProcessor
+
+
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
+
+
+class DeepseekVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    """Runs the shared processor tests against `DeepseekVLProcessor`."""
+
+    processor_class = DeepseekVLProcessor
+
+    def setUp(self):
+        """Save a processor built from the sample sentencepiece vocab into a per-test scratch directory."""
+        import shutil
+
+        self.tmpdirname = tempfile.mkdtemp()
+        # `mkdtemp` never deletes what it creates, so register the removal up front. `ignore_errors` keeps the
+        # cleanup harmless if the directory has already been removed by something else.
+        self.addCleanup(shutil.rmtree, self.tmpdirname, ignore_errors=True)
+
+        image_processor = DeepseekVLImageProcessor()
+        tokenizer = LlamaTokenizer(
+            vocab_file=SAMPLE_VOCAB,
+            extra_special_tokens={
+                "pad_token": "<|end▁of▁sentence|>",
+                # NOTE(review): an empty string cannot serve as an image placeholder; this assumes the
+                # checkpoints' "<image_placeholder>" token -- confirm.
+                "image_token": "<image_placeholder>",
+            },
+        )
+        processor = self.processor_class(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            chat_template=CHAT_TEMPLATE,
+        )
+        processor.save_pretrained(self.tmpdirname)
+
+    def prepare_processor_dict(self):
+        """Return the processor keyword arguments used by this test class."""
+        return {"chat_template": CHAT_TEMPLATE, "num_image_tokens": 576}
diff --git a/tests/models/deepseek_vl_hybrid/__init__.py b/tests/models/deepseek_vl_hybrid/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..b7eaefd71a
--- /dev/null
+++ b/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py
@@ -0,0 +1,218 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+
+if is_torch_available():
+ import torch
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import DeepseekVLHybridImageProcessor
+
+
+class DeepseekVLHybridImageProcessingTester:
+    """Stores image-processor settings for the tests and produces matching random images and expected shapes."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        high_res_size=None,
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+        high_res_image_mean=[0.5, 0.5, 0.5],
+        high_res_image_std=[0.5, 0.5, 0.5],
+    ):
+        # Fall back to small square targets when no explicit sizes are given.
+        if size is None:
+            size = {"height": 18, "width": 18}
+        if high_res_size is None:
+            high_res_size = {"height": 36, "width": 36}
+
+        self.parent = parent
+
+        # Settings for the random input images.
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+
+        # Settings handed to the image processor.
+        self.do_resize = do_resize
+        self.size = size
+        self.high_res_size = high_res_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.high_res_image_mean = high_res_image_mean
+        self.high_res_image_std = high_res_image_std
+
+    def prepare_image_processor_dict(self):
+        """Return the keyword arguments used to instantiate the image processor under test."""
+        processor_keys = (
+            "image_mean",
+            "image_std",
+            "high_res_image_mean",
+            "high_res_image_std",
+            "do_normalize",
+            "do_resize",
+            "size",
+            "high_res_size",
+        )
+        return {key: getattr(self, key) for key in processor_keys}
+
+    def expected_output_image_shape(self, images):
+        """Return `(num_channels, side, side)` where `side` is the larger of the configured height and width."""
+        longest_side = max(self.size["height"], self.size["width"])
+        return (self.num_channels, longest_side, longest_side)
+
+    def expected_output_high_res_image_shape(self, images):
+        """Same as `expected_output_image_shape`, but computed from `high_res_size`."""
+        longest_side = max(self.high_res_size["height"], self.high_res_size["width"])
+        return (self.num_channels, longest_side, longest_side)
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        """Forward the stored batch/channel/resolution settings to the shared `prepare_image_inputs` helper."""
+        generation_kwargs = {
+            "batch_size": self.batch_size,
+            "num_channels": self.num_channels,
+            "min_resolution": self.min_resolution,
+            "max_resolution": self.max_resolution,
+            "equal_resolution": equal_resolution,
+            "numpify": numpify,
+            "torchify": torchify,
+        }
+        return prepare_image_inputs(**generation_kwargs)
+
+
+@require_torch
+@require_vision
+class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    """Shared image-processor tests plus shape checks for the additional `high_res_pixel_values` output."""
+
+    image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None
+
+    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest.setUp with ViT->DeepseekVLHybrid
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = DeepseekVLHybridImageProcessingTester(self)
+
+    @property
+    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest.image_processor_dict with ViT->DeepseekVLHybrid
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest.test_image_processor_from_dict_with_kwargs
+    def test_image_processor_from_dict_with_kwargs(self):
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
+            self.assertEqual(image_processor.size, {"height": 18, "width": 18})
+
+            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
+            self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+
+    def test_image_processor_properties(self):
+        """Every configured setting, including the high-resolution ones, is exposed as an attribute."""
+        expected_attributes = (
+            "image_mean",
+            "image_std",
+            "high_res_image_mean",
+            "high_res_image_std",
+            "do_normalize",
+            "do_resize",
+            "size",
+            "high_res_size",
+        )
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            for attribute in expected_attributes:
+                self.assertTrue(hasattr(image_processing, attribute))
+
+    def _check_high_res_output_shapes(self, expected_input_type, **input_kwargs):
+        """
+        Run one image and then the whole batch through every image processor class and check the shape of
+        `high_res_pixel_values`.
+
+        Args:
+            expected_input_type: type that every generated input image must be an instance of.
+            **input_kwargs: forwarded to the tester's `prepare_image_inputs` (e.g. `numpify=True`).
+        """
+        tester = self.image_processor_tester
+        for image_processing_class in self.image_processor_list:
+            # Initialize image_processing
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random inputs of varying resolution and make sure they have the requested type
+            image_inputs = tester.prepare_image_inputs(equal_resolution=False, **input_kwargs)
+            for image in image_inputs:
+                self.assertIsInstance(image, expected_input_type)
+
+            # Test not batched input: the output is expected to carry a leading batch axis of 1
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values
+            expected_output_image_shape = tester.expected_output_high_res_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values
+            expected_output_image_shape = tester.expected_output_high_res_image_shape(image_inputs)
+            self.assertEqual(tuple(encoded_images.shape), (tester.batch_size, *expected_output_image_shape))
+
+    def test_call_pil_high_res(self):
+        self._check_high_res_output_shapes(Image.Image)
+
+    def test_call_numpy_high_res(self):
+        self._check_high_res_output_shapes(np.ndarray, numpify=True)
+
+    def test_call_pytorch_high_res(self):
+        self._check_high_res_output_shapes(torch.Tensor, torchify=True)
+
+    @unittest.skip(reason="Not supported")
+    def test_call_numpy_4_channels(self):
+        pass
diff --git a/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..8e68ee19a1
--- /dev/null
+++ b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py
@@ -0,0 +1,403 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch DeepseekVLHybrid model."""
+
+import re
+import tempfile
+import unittest
+
+from transformers import (
+ AutoProcessor,
+ DeepseekVLHybridConfig,
+ DeepseekVLHybridForConditionalGeneration,
+ DeepseekVLHybridModel,
+ is_torch_available,
+)
+from transformers.testing_utils import (
+ require_torch,
+ require_torch_accelerator,
+ require_torch_sdpa,
+ slow,
+ torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+
+
+if is_torch_available():
+ import torch
+
+
+class DeepseekVLHybridModelTester:
+    """Build a tiny `DeepseekVLHybridConfig` and matching random inputs for the model tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=25,
+        num_channels=3,
+        initializer_range=0.02,
+        is_training=True,
+        use_cache=False,
+        text_config={
+            "num_hidden_layers": 2,
+            "vocab_size": 99,
+            "hidden_size": 16,
+            "intermediate_size": 37,
+            "max_position_embeddings": 512,
+            "num_attention_heads": 4,
+            "pad_token_id": 1,
+        },
+        vision_config={
+            "num_hidden_layers": 1,
+            "hidden_size": 16,
+            "intermediate_size": 37,
+            "image_size": 32,
+            "patch_size": 8,
+            "hidden_act": "gelu",
+            "vision_use_head": False,
+            "num_attention_heads": 4,
+        },
+        high_res_vision_config={
+            "num_hidden_layers": 2,
+            "global_attn_indexes": [0],
+            "hidden_size": 16,
+            "intermediate_size": 37,
+            "mlp_dim": 24,
+            "output_channels": 4,
+            "image_size": 128,
+            "patch_size": 32,
+            "num_attention_heads": 4,
+        },
+    ):
+        """Store the test dimensions and the three sub-config dictionaries.
+
+        Args:
+            parent: The `unittest.TestCase` that owns this tester.
+            batch_size: Number of samples in every generated input.
+            seq_length: Length of the generated `input_ids`.
+            num_channels: Channel count written into both vision configs and used for pixel inputs.
+            initializer_range: Stored on the tester as-is.
+            is_training: Stored on the tester as-is.
+            use_cache: Stored on the tester as-is.
+            text_config: Keyword arguments for the text sub-config.
+            vision_config: Keyword arguments for the low-resolution vision sub-config.
+            high_res_vision_config: Keyword arguments for the high-resolution vision sub-config.
+        """
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.num_channels = num_channels
+        self.initializer_range = initializer_range
+        self.is_training = is_training
+        self.use_cache = use_cache
+
+        # Work on copies: the default dictionaries are created once and shared by every call,
+        # and the two vision configs are extended below. Writing into the originals would leak
+        # state between tester instances and silently modify dictionaries owned by the caller.
+        self.text_config = dict(text_config)
+        self.vision_config = dict(vision_config)
+        self.high_res_vision_config = dict(high_res_vision_config)
+        self.vision_config["num_channels"] = self.num_channels
+        self.high_res_vision_config["num_channels"] = self.num_channels
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.high_res_image_size = high_res_vision_config["image_size"]
+        self.image_size = vision_config["image_size"]
+        self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"]
+        self.pad_token_id = text_config["pad_token_id"]
+        # The last vocabulary id is reserved as the image placeholder token.
+        self.image_token_id = self.vocab_size - 1
+
+    def get_config(self):
+        """Return a `DeepseekVLHybridConfig` assembled from the stored sub-config dictionaries."""
+        return DeepseekVLHybridConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            high_res_vision_config=self.high_res_vision_config,
+            image_token_id=self.image_token_id,
+        )
+
+    def prepare_config_and_inputs(self):
+        """Create a config plus random text, mask and pixel inputs.
+
+        Returns:
+            A tuple `(config, input_ids, attention_mask, pixel_values, high_res_pixel_values)`.
+        """
+        config = self.get_config()
+
+        # create text and vision inputs
+        # NOTE(review): the `vocab_size - 2` bound and the `+ 1` shift presumably keep the random
+        # ids away from the image token id (`vocab_size - 1`) - confirm against `ids_tensor`.
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
+        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.num_channels,
+                self.image_size,
+                self.image_size,
+            ]
+        )
+        high_res_pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.num_channels,
+                self.high_res_image_size,
+                self.high_res_image_size,
+            ]
+        )
+        # fill image_tokens: the first `num_image_tokens` positions of every row
+        input_ids[:, : self.num_image_tokens] = self.image_token_id
+
+        return config, input_ids, attention_mask, pixel_values, high_res_pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` with the inputs keyed by their forward-argument names."""
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, pixel_values, high_res_pixel_values = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": pixel_values,
+            "high_res_pixel_values": high_res_pixel_values,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class DeepseekVLHybridModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    """Common modeling and generation tests for the DeepseekVLHybrid model classes."""
+
+    all_model_classes = (
+        (DeepseekVLHybridModel, DeepseekVLHybridForConditionalGeneration) if is_torch_available() else ()
+    )
+    pipeline_model_mapping = (
+        {
+            "feature-extraction": DeepseekVLHybridModel,
+            "image-text-to-text": DeepseekVLHybridForConditionalGeneration,
+        }
+        if is_torch_available()
+        else {}
+    )
+    _is_composite = True
+    test_pruning = False
+    test_head_masking = False
+
+    def setUp(self):
+        """Create the model tester and the config tester used by the inherited tests."""
+        self.model_tester = DeepseekVLHybridModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=DeepseekVLHybridConfig, has_text_modality=False)
+
+    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
+    def test_inputs_embeds(self):
+        """Run a forward pass from `inputs_embeds` only, without ids or pixel inputs."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            input_ids = inputs["input_ids"]
+            del inputs["input_ids"]
+            del inputs["pixel_values"]
+            del inputs["high_res_pixel_values"]
+
+            wte = model.get_input_embeddings()
+            inputs["inputs_embeds"] = wte(input_ids)
+
+            with torch.no_grad():
+                model(**inputs)
+
+    # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs.
+    def test_inputs_embeds_matches_input_ids(self):
+        """Check that passing `inputs_embeds` gives the same first output as passing `input_ids`."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            input_ids = inputs["input_ids"]
+            del inputs["input_ids"]
+            del inputs["pixel_values"]
+            del inputs["high_res_pixel_values"]
+
+            inputs_embeds = model.get_input_embeddings()(input_ids)
+
+            with torch.no_grad():
+                out_ids = model(input_ids=input_ids, **inputs)[0]
+                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
+            torch.testing.assert_close(out_embeds, out_ids)
+
+    @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation")
+    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
+    def test_initialization(self):
+        pass
+
+    @require_torch_sdpa
+    def test_sdpa_can_dispatch_composite_models(self):
+        """Check that the requested attention implementation reaches every sub-model after loading.
+
+        Each model class is saved once and reloaded twice, with `attn_implementation="sdpa"` and
+        with `attn_implementation="eager"`. The top-level config, the three sub-model configs and
+        the configs of attention submodules are then inspected.
+        """
+        sub_model_names = ("language_model", "vision_model", "high_res_vision_model")
+
+        for model_class in self.all_model_classes:
+            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                # Load the model with SDPA
+                model_sdpa = model_class.from_pretrained(
+                    tmpdirname,
+                    attn_implementation="sdpa",
+                )
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                # Load model with eager attention
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    attn_implementation="eager",
+                )
+                model_eager = model_eager.eval().to(torch_device)
+
+                self.assertEqual(model_sdpa.config._attn_implementation, "sdpa")
+                self.assertEqual(model_eager.config._attn_implementation, "eager")
+
+                # Check both loaded models for every sub-model. Iterating over one shared list keeps
+                # the sdpa and eager checks symmetric, so no sub-model can be left out on one side.
+                if all(hasattr(model_sdpa, sub_model_name) for sub_model_name in sub_model_names):
+                    for sub_model_name in sub_model_names:
+                        self.assertEqual(
+                            getattr(model_sdpa, sub_model_name).config._attn_implementation,
+                            "sdpa",
+                            msg=f"{sub_model_name} of the sdpa model",
+                        )
+                        self.assertEqual(
+                            getattr(model_eager, sub_model_name).config._attn_implementation,
+                            "eager",
+                            msg=f"{sub_model_name} of the eager model",
+                        )
+
+                # The branches below are only entered when an attention submodule reports the
+                # opposite implementation, so the assertion inside always fails and names the module.
+                for name, submodule in model_eager.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if (
+                        any(re.finditer(r"Attention(?!Pool)", class_name))
+                        and getattr(submodule, "config", None)
+                        and submodule.config._attn_implementation == "sdpa"
+                    ):
+                        self.assertEqual(submodule.config._attn_implementation, "eager", msg=name)
+
+                for name, submodule in model_sdpa.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if (
+                        any(re.finditer(r"Attention(?!Pool)", class_name))
+                        and getattr(submodule, "config", None)
+                        and submodule.config._attn_implementation == "eager"
+                    ):
+                        self.assertEqual(submodule.config._attn_implementation, "sdpa", msg=name)
+
+
+@require_torch
+@require_torch_accelerator
+@slow
+class DeepseekVLHybridIntegrationTest(unittest.TestCase):
+    """Greedy-generation checks against the checkpoint named in `setUp`."""
+
+    def setUp(self):
+        """Record the checkpoint id used by every test in this class."""
+        self.model_id = "deepseek-community/deepseek-vl-7b-chat"
+
+    def _load_model_and_processor(self):
+        """Load the generation model in eval mode on `torch_device`, then its processor."""
+        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+            self.model_id, torch_dtype="auto", device_map="auto"
+        )
+        model.to(torch_device)
+        model.eval()
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        return model, processor
+
+    def test_model_text_generation(self):
+        """Generate 20 new tokens for one image prompt and compare the decoded text."""
+        model, processor = self._load_model_and_processor()
+
+        image_part = {
+            "type": "image",
+            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+        }
+        text_part = {"type": "text", "text": "Describe this image."}
+        messages = [{"role": "user", "content": [image_part, text_part]}]
+        EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The'  # fmt: skip
+
+        model_inputs = processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+        )
+        model_inputs = model_inputs.to(model.device, dtype=model.dtype)
+        generated = model.generate(**model_inputs, max_new_tokens=20, do_sample=False)
+        decoded = processor.decode(generated[0], skip_special_tokens=True)
+
+        self.assertEqual(decoded, EXPECTED_TEXT)
+
+    def test_model_text_generation_batched(self):
+        """Generate for two padded conversations at once and compare both decoded texts."""
+        model, processor = self._load_model_and_processor()
+
+        first_conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        second_conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                    },
+                    {"type": "text", "text": "What animal do you see in the image?"},
+                ],
+            }
+        ]
+        messages = [first_conversation, second_conversation]
+        EXPECTED_TEXT = [
+            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The",  # fmt: skip
+            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a large, furry animal that appears to be a type of bear.The ",  # fmt: skip
+        ]
+
+        model_inputs = processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, padding=True, return_dict=True, return_tensors="pt"
+        )
+        model_inputs = model_inputs.to(model.device, dtype=model.dtype)
+        generated = model.generate(**model_inputs, max_new_tokens=20, do_sample=False)
+        decoded = processor.batch_decode(generated, skip_special_tokens=True)
+
+        self.assertEqual(EXPECTED_TEXT, decoded)
+
+    def test_model_text_generation_with_multi_image(self):
+        """Generate for one prompt that interleaves two images with text and compare the result."""
+        model, processor = self._load_model_and_processor()
+
+        user_content = [
+            {"type": "text", "text": "What's the difference between"},
+            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+            {"type": "text", "text": " and "},
+            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+        ]
+        messages = [{"role": "user", "content": user_content}]
+        EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image shows a street scene with a prominent red stop sign in the foreground. The sign has the"  # fmt: skip
+
+        model_inputs = processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+        )
+        model_inputs = model_inputs.to(model.device, dtype=model.dtype)
+        generated = model.generate(**model_inputs, max_new_tokens=20, do_sample=False)
+        decoded = processor.decode(generated[0], skip_special_tokens=True)
+
+        self.assertEqual(decoded, EXPECTED_TEXT)
diff --git a/tests/models/deepseek_vl_hybrid/test_processor_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_processor_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..10608d8bdb
--- /dev/null
+++ b/tests/models/deepseek_vl_hybrid/test_processor_deepseek_vl_hybrid.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import tempfile
+import unittest
+
+from transformers import DeepseekVLHybridProcessor, LlamaTokenizer
+from transformers.models.deepseek_vl.convert_deepseek_vl_weights_to_hf import CHAT_TEMPLATE
+from transformers.testing_utils import get_tests_dir
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+ from transformers import DeepseekVLHybridImageProcessor
+
+
+# Fixture file passed as `vocab_file` when building the tokenizer for the processor tests.
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
+
+
+class DeepseekVLHybridProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    """Run the shared processor tests against `DeepseekVLHybridProcessor`."""
+
+    processor_class = DeepseekVLHybridProcessor
+
+    def setUp(self):
+        """Build a processor from the fixture vocabulary and save it to a fresh temporary directory."""
+        self.tmpdirname = tempfile.mkdtemp()
+        image_processor = DeepseekVLHybridImageProcessor()
+        tokenizer = LlamaTokenizer(
+            vocab_file=SAMPLE_VOCAB,
+            extra_special_tokens={
+                "pad_token": "<|end▁of▁sentence|>",
+                # An empty string cannot mark image positions in a prompt, so a real placeholder
+                # tag is required here.
+                # NOTE(review): `<image_placeholder>` is assumed to be the tag used by the released
+                # checkpoints - confirm against the conversion script.
+                "image_token": "<image_placeholder>",
+            },
+        )
+        processor = self.processor_class(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            chat_template=CHAT_TEMPLATE,
+        )
+        processor.save_pretrained(self.tmpdirname)
+
+    def tearDown(self):
+        """Delete the temporary directory created in `setUp` so test runs do not accumulate files."""
+        import shutil
+
+        shutil.rmtree(self.tmpdirname, ignore_errors=True)
+        super().tearDown()
+
+    def prepare_processor_dict(self):
+        """Return the keyword arguments describing the processor configuration under test."""
+        return {"chat_template": CHAT_TEMPLATE, "num_image_tokens": 576}