diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index e317998a36..a7c79b002b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -725,6 +725,10 @@ title: DAB-DETR - local: model_doc/deepseek_v2 title: DeepSeek-V2 + - local: model_doc/deepseek_vl + title: DeepseekVL + - local: model_doc/deepseek_vl_hybrid + title: DeepseekVLHybrid - local: model_doc/deformable_detr title: Deformable DETR - local: model_doc/deit diff --git a/docs/source/en/model_doc/deepseek_vl.md b/docs/source/en/model_doc/deepseek_vl.md new file mode 100644 index 0000000000..625a2c90b0 --- /dev/null +++ b/docs/source/en/model_doc/deepseek_vl.md @@ -0,0 +1,220 @@ + + +
+
+ PyTorch + FlashAttention + SDPA +
+
+ +# DeepseekVL + +[Deepseek-VL](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding images. + +You can find all the original Deepseek-VL checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization. + +> [!TIP] +> Click on the Deepseek-VL models in the right sidebar for more examples of how to apply Deepseek-VL to different vision and language tasks. + +The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class. + + + + +```py +import torch +from transformers import pipeline + +pipe = pipeline( + task="image-text-to-text", + model="deepseek-community/deepseek-vl-1.3b-chat", + device=0, + torch_dtype=torch.float16 +) + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", + }, + { "type": "text", "text": "Describe this image."}, + ] + } +] + +pipe(text=messages, max_new_tokens=20, return_full_text=False) +``` + + + + +```py +import torch +from transformers import DeepseekVLForConditionalGeneration, AutoProcessor + +model = DeepseekVLForConditionalGeneration.from_pretrained( + "deepseek-community/deepseek-vl-1.3b-chat", + torch_dtype=torch.float16, + device_map="auto", + attn_implementation="sdpa" +) + +processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat") + +messages = [ + { + "role":"user", + "content":[ + { + "type":"image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + }, + { + "type":"text", + "text":"Describe this image." 
+ } + ] + } + +] + +inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt" +).to(model.device, dtype=model.dtype) + +generated_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) +] +output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False +) + +print(output_text) +``` + + + +Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. + +The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4. + +```python +import torch +from transformers import TorchAoConfig, DeepseekVLForConditionalGeneration, AutoProcessor + +quantization_config = TorchAoConfig( + "int4_weight_only", + group_size=128 +) + +model = DeepseekVLForConditionalGeneration.from_pretrained( + "deepseek-community/deepseek-vl-1.3b-chat", + torch_dtype=torch.bfloat16, + device_map="auto", + quantization_config=quantization_config +) +``` +### Notes + +- Do inference with multiple images in a single conversation. 
+ ```py + import torch + from transformers import DeepseekVLForConditionalGeneration, AutoProcessor + + model = DeepseekVLForConditionalGeneration.from_pretrained( + "deepseek-community/deepseek-vl-1.3b-chat", + torch_dtype=torch.float16, + device_map="auto", + attn_implementation="sdpa" + ) + + processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat") + + messages = [ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What’s the difference between"}, + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": " and "}, + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} + ] + } + ], + [ + { + "role": "user", + "content": [ + {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"}, + {"type": "text", "text": "What do you see in this image?"} + ] + } + ] + ] + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + padding=True, + truncation=True, + tokenize=True, + return_dict=True, + return_tensors="pt" + ).to(model.device, dtype=model.dtype) + + generated_ids = model.generate(**inputs, max_new_tokens=128) + generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + print(output_text) + ``` + +## DeepseekVLConfig + +[[autodoc]] DeepseekVLConfig + +## DeepseekVLProcessor + +[[autodoc]] DeepseekVLProcessor + +## DeepseekVLImageProcessor + +[[autodoc]] DeepseekVLImageProcessor + +## DeepseekVLModel + +[[autodoc]] DeepseekVLModel + - forward + +## DeepseekVLForConditionalGeneration + +[[autodoc]] DeepseekVLForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/deepseek_vl_hybrid.md b/docs/source/en/model_doc/deepseek_vl_hybrid.md new file mode 100644 index 
0000000000..86e1672bce --- /dev/null +++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md @@ -0,0 +1,219 @@ + + +
+
+ PyTorch + SDPA +
+
+ +# DeepseekVLHybrid + +[Deepseek-VL-Hybrid](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its language model, while [SigLip](./siglip) is used for encoding low-resolution images and [SAM (Segment Anything Model)](./sam) is incorporated to handle high-resolution image encoding, enhancing the model’s ability to process fine-grained visual details. Deepseek-VL-Hybrid is a variant of Deepseek-VL that uses [SAM (Segment Anything Model)](./sam) to handle high-resolution image encoding. + +You can find all the original Deepseek-VL-Hybrid checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization. + +> [!TIP] +> Click on the Deepseek-VL-Hybrid models in the right sidebar for more examples of how to apply Deepseek-VL-Hybrid to different vision and language tasks. + +The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class. 
+ + + + +```py +import torch +from transformers import pipeline + +pipe = pipeline( + task="image-text-to-text", + model="deepseek-community/deepseek-vl-7b-chat", + device=0, + torch_dtype=torch.float16 +) + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", + }, + { "type": "text", "text": "Describe this image."}, + ] + } +] + +pipe(text=messages, max_new_tokens=20, return_full_text=False) +``` + + + + +```py +import torch +from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor + +model = DeepseekVLHybridForConditionalGeneration.from_pretrained( + "deepseek-community/deepseek-vl-7b-chat", + torch_dtype=torch.float16, + device_map="auto", + attn_implementation="sdpa" +) + +processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat") + +messages = [ + { + "role":"user", + "content":[ + { + "type":"image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + }, + { + "type":"text", + "text":"Describe this image." + } + ] + } + +] + +inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt" +).to(model.device, dtype=model.dtype) + +generated_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) +] +output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False +) + +print(output_text) +``` + + + +Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. 
+ +The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4. + +```python +import torch +from transformers import TorchAoConfig, DeepseekVLHybridForConditionalGeneration, AutoProcessor + +quantization_config = TorchAoConfig( + "int4_weight_only", + group_size=128 +) + +model = DeepseekVLHybridForConditionalGeneration.from_pretrained( + "deepseek-community/deepseek-vl-7b-chat", + torch_dtype=torch.bfloat16, + device_map="auto", + quantization_config=quantization_config +) +``` +### Notes + +- Do inference with multiple images in a single conversation. + ```py + import torch + from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor + + model = DeepseekVLHybridForConditionalGeneration.from_pretrained( + "deepseek-community/deepseek-vl-7b-chat", + torch_dtype=torch.float16, + device_map="auto", + attn_implementation="sdpa" + ) + + processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat") + + messages = [ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What’s the difference between"}, + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": " and "}, + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"} + ] + } + ], + [ + { + "role": "user", + "content": [ + {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"}, + {"type": "text", "text": "What do you see in this image?"} + ] + } + ] + ] + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + padding=True, + truncation=True, + tokenize=True, + return_dict=True, + return_tensors="pt" + ).to(model.device, dtype=model.dtype) + + generated_ids = model.generate(**inputs, max_new_tokens=128) + generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + output_text = processor.batch_decode( + 
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + print(output_text) + ``` + +## DeepseekVLHybridConfig + +[[autodoc]] DeepseekVLHybridConfig + +## DeepseekVLHybridProcessor + +[[autodoc]] DeepseekVLHybridProcessor + +## DeepseekVLHybridImageProcessor + +[[autodoc]] DeepseekVLHybridImageProcessor + +## DeepseekVLHybridModel + +[[autodoc]] DeepseekVLHybridModel + - forward + +## DeepseekVLHybridForConditionalGeneration + +[[autodoc]] DeepseekVLHybridForConditionalGeneration + - forward diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 7b59f958f0..b691cea112 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -84,6 +84,8 @@ if TYPE_CHECKING: from .decision_transformer import * from .deepseek_v2 import * from .deepseek_v3 import * + from .deepseek_vl import * + from .deepseek_vl_hybrid import * from .deformable_detr import * from .deit import * from .deprecated import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4d22bd00ef..eb25e0d025 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -103,6 +103,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str]( ("decision_transformer", "DecisionTransformerConfig"), ("deepseek_v2", "DeepseekV2Config"), ("deepseek_v3", "DeepseekV3Config"), + ("deepseek_vl", "DeepseekVLConfig"), + ("deepseek_vl_hybrid", "DeepseekVLHybridConfig"), ("deformable_detr", "DeformableDetrConfig"), ("deit", "DeiTConfig"), ("depth_anything", "DepthAnythingConfig"), @@ -495,6 +497,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str]( ("decision_transformer", "Decision Transformer"), ("deepseek_v2", "DeepSeek-V2"), ("deepseek_v3", "DeepSeek-V3"), + ("deepseek_vl", "DeepseekVL"), + ("deepseek_vl_hybrid", "DeepseekVLHybrid"), ("deformable_detr", "Deformable DETR"), ("deit", "DeiT"), ("deplot", 
"DePlot"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index cd0473a2d7..0a0cc6a38c 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -77,6 +77,8 @@ else: ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")), + ("deepseek_vl", ("DeepseekVLImageProcessor")), + ("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor")), ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")), ("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")), ("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 9d6622f389..85eb8ff6bb 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -97,6 +97,8 @@ MODEL_MAPPING_NAMES = OrderedDict( ("decision_transformer", "DecisionTransformerModel"), ("deepseek_v2", "DeepseekV2Model"), ("deepseek_v3", "DeepseekV3Model"), + ("deepseek_vl", "DeepseekVLModel"), + ("deepseek_vl_hybrid", "DeepseekVLHybridModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), ("depth_pro", "DepthProModel"), @@ -935,6 +937,8 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( ("blip", "BlipForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"), ("chameleon", "ChameleonForConditionalGeneration"), + ("deepseek_vl", "DeepseekVLForConditionalGeneration"), + ("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"), ("emu3", "Emu3ForConditionalGeneration"), ("evolla", "EvollaForProteinText2Text"), ("fuyu", "FuyuForCausalLM"), diff --git a/src/transformers/models/auto/processing_auto.py 
b/src/transformers/models/auto/processing_auto.py index 31b798c805..cc2be544f4 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -62,6 +62,8 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("clvp", "ClvpProcessor"), ("colpali", "ColPaliProcessor"), ("colqwen2", "ColQwen2Processor"), + ("deepseek_vl", "DeepseekVLProcessor"), + ("deepseek_vl_hybrid", "DeepseekVLHybridProcessor"), ("dia", "DiaProcessor"), ("emu3", "Emu3Processor"), ("evolla", "EvollaProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 6c4e3e98c7..6e5b07dddf 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -193,6 +193,20 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]]( "LlamaTokenizerFast" if is_tokenizers_available() else None, ), ), + ( + "deepseek_vl", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "deepseek_vl_hybrid", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), ("dia", ("DiaTokenizer", None)), ( "diffllama", diff --git a/src/transformers/models/deepseek_vl/__init__.py b/src/transformers/models/deepseek_vl/__init__.py new file mode 100644 index 0000000000..2422b31e31 --- /dev/null +++ b/src/transformers/models/deepseek_vl/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deepseek_vl import * + from .image_processing_deepseek_vl import * + from .image_processing_deepseek_vl_fast import * + from .modeling_deepseek_vl import * + from .processing_deepseek_vl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py new file mode 100644 index 0000000000..af99ac9eeb --- /dev/null +++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py @@ -0,0 +1,96 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class DeepseekVLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a
    DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the DeepseekVL
    [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone. A dictionary without a `"model_type"` entry is
            treated as a `"llama"` config. The dictionary passed by the caller is not modified.
        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
            The config object or dictionary of the vision backbone. A dictionary without a `"model_type"` entry is
            treated as a `"siglip_vision_model"` config. The dictionary passed by the caller is not modified.
        image_token_id (`int`, *optional*, defaults to 100015):
            The index representing image tokens in the model's token vocabulary.

    Example:

    ```python
    >>> from transformers import DeepseekVLConfig, DeepseekVLModel

    >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration
    >>> configuration = DeepseekVLConfig()

    >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration
    >>> model = DeepseekVLModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_vl"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
        self,
        text_config: AutoConfig = None,
        vision_config: AutoConfig = None,
        image_token_id: int = 100015,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # NOTE: the file header states this module is generated from modular_deepseek_vl.py;
        # any change made here has to be mirrored in that modular file.

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")

        if isinstance(text_config, dict):
            # Build a new dict instead of writing "model_type" into the caller's dict, so that a
            # dict reused by the caller is left exactly as it was passed in.
            text_config = {"model_type": "llama", **text_config}
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)

        if isinstance(vision_config, dict):
            # Same non-mutating default as above, with the vision backbone's model type.
            vision_config = {"model_type": "siglip_vision_model", **vision_config}
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)

        self.text_config = text_config
        self.vision_config = vision_config
        self.image_token_id = image_token_id


__all__ = ["DeepseekVLConfig"]
# fmt: off
# Regex pattern (original checkpoint key) -> replacement template (converted key).
# The "(q|k|v)" text in a replacement is a literal placeholder that `get_qkv_state_dict`
# later expands into one key per projection.
ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
    # Siglip (Low Resolution)
    r"vision_model.vision_tower.pos_embed": r"model.vision_model.vision_model.embeddings.position_embedding.weight",
    r"vision_model.vision_tower.patch_embed.proj.(weight|bias)": r"model.vision_model.vision_model.embeddings.patch_embedding.\1",
    r"vision_model.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.(q|k|v)_proj.\2",
    r"vision_model.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.out_proj.\2",
    r"vision_model.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.layer_norm\2.\3",
    r"vision_model.vision_tower.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.mlp.fc\2.\3",
    r"vision_model.vision_tower.norm.(weight|bias)": r"model.vision_model.vision_model.post_layernorm.\1",
    r"vision_model.vision_tower.attn_pool.latent": r"model.vision_model.vision_model.head.probe",
    r"vision_model.vision_tower.attn_pool.proj.(weight|bias)": r"model.vision_model.vision_model.head.attention.out_proj.\1",
    r"vision_model.vision_tower.attn_pool.norm.(weight|bias)": r"model.vision_model.vision_model.head.layernorm.\1",
    r"vision_model.vision_tower.attn_pool.mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.head.mlp.fc\1.\2",

    # Aligner
    r"aligner.layers.0.(weight|bias)": r"model.aligner.linear1.\1",
    r"aligner.layers.2.(weight|bias)": r"model.aligner.linear2.\1",

    # Llama (Text Model)
    r"language_model.model.(\w+)": r"model.language_model.\1",
    r"language_model.lm_head.(weight|bias)": r"lm_head.\1",
}
# fmt: on

# Adopted from https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/utils/conversation.py#L80-L91
CHAT_TEMPLATE = (
    # Define separators and initialize counter
    "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}"
    "{% set i = 0 %}"
    # Start with default system prompt
    "You are a helpful language and vision assistant. "
    "You are able to understand the visual content that the user provides, "
    "and assist the user with a variety of tasks using natural language.\n\n"
    # Iterate through messages
    "{% for message in messages %}"
    # Identify user or assistant role
    "{% if message['role']|lower == 'user' %}"
    "User: "
    "{% elif message['role']|lower == 'assistant' %}"
    "Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}"
    "{% else %}"
    "{{ message['role'].capitalize() }}: "
    "{% endif %}"
    # Iterate through message content (text/images)
    "{% for content in message['content'] %}"
    # If content is an image, replace with placeholder
    "{% if content['type'] == 'image' %}"
    # NOTE(review): the placeholder literal below is empty as it appears in this diff; presumably an
    # angle-bracketed image token was stripped during extraction -- confirm against the upstream script.
    ""
    # If content is text, handle formatting
    "{% elif content['type'] == 'text' %}"
    "{% set text = content['text'] %}"
    # Strip whitespace for first and last text blocks
    "{% if loop.first %}{% set text = text.lstrip() %}{% endif %}"
    "{% if loop.last %}{% set text = text.rstrip() %}{% endif %}"
    # If previous content was text, add space
    "{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}"
    "{{ ' ' + text }}"
    "{% else %}"
    "{{ text }}"
    "{% endif %}"
    "{% endif %}"
    "{% endfor %}"  # End message content loop
    # Add separators between messages
    "{% if not loop.last or add_generation_prompt %}"
    "{% if message['role']|lower == 'user' %}"
    "{{ seps[0] }}"
    "{% else %}"
    "{{ seps[1] }}"
    "{% endif %}"
    "{% endif %}"
    "{% endfor %}"  # End messages loop
    # Add final Assistant prompt if required
    "{% if add_generation_prompt %}Assistant:{% endif %}"
)


def convert_old_keys_to_new_keys(state_dict_keys: list[str]) -> dict[str, str]:
    """
    Map every original checkpoint key to its converted name.

    All keys are joined into one newline-separated text so that each pattern of
    `ORIGINAL_TO_CONVERTED_KEY_MAPPING` is applied with a single `re.sub` call; keys that match
    no pattern map to themselves.

    Args:
        state_dict_keys: The original parameter names (any iterable of strings works).

    Returns:
        A dict from each original key to its converted key. Empty input yields an empty dict.
    """
    old_keys = list(state_dict_keys)
    if not old_keys:
        # "".split("\n") is [""], which would otherwise produce a bogus {"": ""} entry.
        return {}

    old_text = "\n".join(old_keys)
    new_text = old_text
    for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
        # A `None` replacement blanks the matched text, leaving an empty line for that key.
        new_text = re.sub(pattern, "" if replacement is None else replacement, new_text)

    return dict(zip(old_text.split("\n"), new_text.split("\n")))


def get_qkv_state_dict(key, parameter):
    """
    Split a fused projection parameter into one entry per placeholder alternative.

    A new key which looks like this
        xxxx.(q|k|v).xxx    (m, n)

    is converted to
        xxxx.q.xxxx         (m//3, n)
        xxxx.k.xxxx         (m//3, n)
        xxxx.v.xxxx         (m//3, n)

    Args:
        key: Converted key containing a parenthesised, `|`-separated placeholder such as `(q|k|v)`.
        parameter: Tensor whose first dimension holds the fused projections back to back.

    Returns:
        A dict with one key per alternative, each mapped to its equal-sized slice along dim 0.

    Raises:
        ValueError: If `key` has no placeholder, or the first dimension of `parameter` cannot be
            divided evenly between the alternatives.
    """
    match = re.search(r"(\(.*?\))", key)  # finds e.g. "(q|k|v)"
    if match is None:
        raise ValueError(f"Key {key!r} does not contain a '(a|b|c)' placeholder to split on.")
    placeholder = match.group(1)
    replacements_keys = placeholder[1:-1].split("|")  # creates e.g. ['q', 'k', 'v']

    num_parts = len(replacements_keys)
    if parameter.size(0) % num_parts != 0:
        # An uneven split would yield an extra trailing chunk that `zip` silently drops.
        raise ValueError(
            f"Cannot split parameter for {key!r}: first dimension {parameter.size(0)} "
            f"is not divisible by {num_parts}."
        )

    replacements_vals = torch.split(parameter, split_size_or_sections=parameter.size(0) // num_parts, dim=0)
    return {
        key.replace(placeholder, replacement_key): replacement_val
        for replacement_key, replacement_val in zip(replacements_keys, replacements_vals)
    }


def update_state_dict(old_state_dict):
    """
    Build the converted state dict from the original one.

    Entries are popped from `old_state_dict` as they are converted, so the input dict is empty
    when this function returns.

    Args:
        old_state_dict: Mapping from original parameter names to tensors.

    Returns:
        A new dict keyed by the converted parameter names.
    """
    all_keys = list(old_state_dict.keys())
    new_keys = convert_old_keys_to_new_keys(all_keys)

    state_dict = {}
    for key in all_keys:
        new_key = new_keys[key]
        current_parameter = old_state_dict.pop(key)
        is_high_res = "vision_tower_high" in key

        if "qkv" in key and not is_high_res:
            # Fused qkv parameter: expand into separate q/k/v entries.
            state_dict.update(get_qkv_state_dict(new_key, current_parameter))
        elif "pos_embed" in key and not is_high_res:
            # timm implementation of siglip creates this param of size [1, 576, 1024]
            # transformers implementation of siglip creates this param of size [576, 1024]
            state_dict[new_key] = current_parameter.squeeze(0)
        else:
            state_dict[new_key] = current_parameter

    return state_dict


def load_model_state_dict(input_path: str) -> dict:
    """
    Load model state dict, handling both single and sharded files.

    Args:
        input_path: Directory holding either `model.safetensors.index.json` plus its shards,
            or a single `model.safetensors` file.

    Returns:
        The merged state dict.

    Raises:
        ValueError: If neither the index file nor the single weight file exists in `input_path`.
    """
    index_path = os.path.join(input_path, "model.safetensors.index.json")
    single_file_path = os.path.join(input_path, "model.safetensors")

    # Check if we have a sharded model
    if os.path.exists(index_path):
        print("Loading sharded model...")
        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)

        state_dict = {}
        # Get unique shard files and load each one only once
        for shard_file in sorted(set(index["weight_map"].values())):
            print(f"Loading shard {shard_file}...")
            shard_path = os.path.join(input_path, shard_file)
            # Same explicit device as the single-file branch below.
            state_dict.update(load_file(shard_path, device="cpu"))

        return state_dict

    # Single file model
    if os.path.exists(single_file_path):
        print("Loading single file model...")
        return load_file(single_file_path, device="cpu")

    raise ValueError(f"No model files found in {input_path}")


def convert_model(
    hf_repo_id: str,
    output_dir: Optional[str] = None,
    output_hub_path: Optional[str] = None,
    safe_serialization: bool = True,
):
    """
    Convert an original DeepseekVL checkpoint and save and/or push the result.

    The config, processor and weights are each written to `output_dir` and pushed to
    `output_hub_path` when the respective argument is set. The model configuration values are
    hard-coded in this function rather than read from the checkpoint.

    Args:
        hf_repo_id: Hub repository id of the original checkpoint; if it is rejected with
            `HFValidationError` it is used as a local directory path instead.
        output_dir: Local directory for the converted config, processor and model.
        output_hub_path: Hub repository id to push the processor and model to.
        safe_serialization: Forwarded to `save_pretrained` / `push_to_hub` for the model.

    Raises:
        ValueError: If the converted state dict leaves any model parameter missing.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        input_path = snapshot_download(hf_repo_id)
    except HFValidationError:
        # If the input path is not a HF repo ID, assume it's a local path
        input_path = hf_repo_id

    # ------------------------------------------------------------
    # Create and save config
    # ------------------------------------------------------------

    config = DeepseekVLConfig(
        text_config={
            "hidden_size": 2048,
            "intermediate_size": 5632,
            "max_position_embeddings": 16384,
            "num_attention_heads": 16,
            "num_hidden_layers": 24,
            "vocab_size": 102400,
        },
        vision_config={
            "hidden_size": 1024,
            "intermediate_size": 4096,
            "image_size": 384,
            "patch_size": 16,
            "hidden_act": "gelu",
            "vision_use_head": False,
            "num_attention_heads": 16,
            "num_hidden_layers": 24,
        },
    )

    # save config
    if output_dir:
        config.save_pretrained(output_dir)
        print("Model config saved successfully...")

    # ------------------------------------------------------------
    # Convert processor
    # ------------------------------------------------------------

    image_processor = DeepseekVLImageProcessor(
        image_mean=IMAGENET_STANDARD_MEAN,
        image_std=IMAGENET_STANDARD_STD,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        input_path,
        extra_special_tokens={
            "pad_token": "<｜end▁of▁sentence｜>",
            # NOTE(review): this literal is empty as it appears in this diff; presumably an
            # angle-bracketed image token was stripped during extraction -- confirm upstream.
            "image_token": "",
        },
    )

    processor = DeepseekVLProcessor(
        image_processor=image_processor,
        tokenizer=tokenizer,
        chat_template=CHAT_TEMPLATE,
    )

    if output_dir:
        print(f"Saving processor to {output_dir}...")
        processor.save_pretrained(output_dir)
    if output_hub_path:
        print(f"Pushing processor to hub at {output_hub_path}...")
        processor.push_to_hub(output_hub_path)

    # ------------------------------------------------------------
    # Convert weights
    # ------------------------------------------------------------

    print("Creating empty model...")
    with init_empty_weights():
        model = DeepseekVLForConditionalGeneration(config)

    # Load and convert state dict
    print("Loading state dict...")
    state_dict = load_model_state_dict(input_path)
    state_dict = update_state_dict(state_dict)

    # Load converted state dict
    print("Loading converted weights into model...")
    # strict=False tolerates unexpected checkpoint keys; missing keys are rejected just below.
    info = model.load_state_dict(state_dict, strict=False, assign=True)
    if len(info.missing_keys) > 0:
        raise ValueError(f"Missing keys: {info.missing_keys}")

    # Tie weights before any device mapping
    print("Tying weights...")
    model.tie_weights()

    # Save the model
    if output_dir:
        print(f"Saving model to {output_dir}...")
        model.save_pretrained(output_dir, safe_serialization=safe_serialization)
    if output_hub_path:
        print(f"Pushing model to hub at {output_hub_path}...")
        model.push_to_hub(output_hub_path, safe_serialization=safe_serialization)

    del state_dict, model
    gc.collect()

    # Validate the saved model if saved locally
    if output_dir:
        print("Reloading the local model to check if it's saved correctly...")
        DeepseekVLForConditionalGeneration.from_pretrained(output_dir, device_map="auto")
        print("Local model reloaded successfully.")


def _str_to_bool(value):
    """Parse a command-line boolean; `type=bool` would turn any non-empty string into `True`."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1", "on"):
        return True
    if lowered in ("false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


def main():
    """Parse the command-line arguments and run `convert_model` with them."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hf_repo_id",
        default="deepseek-ai/deepseek-vl-1.3b-chat",
        help="Location of official weights from DeepseekAI on HF",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        help="Location to write the converted model and processor",
    )
    parser.add_argument(
        "--output_hub_path",
        default=None,
        help="Repository ID to push model to hub (e.g. 'username/model-name')",
    )
    parser.add_argument(
        "--safe_serialization",
        default=True,
        # Not `type=bool`: bool("False") is True, so the flag could never be switched off.
        type=_str_to_bool,
        help="Whether or not to save using `safetensors`.",
    )
    args = parser.parse_args()

    convert_model(
        hf_repo_id=args.hf_repo_id,
        output_dir=args.output_dir,
        output_hub_path=args.output_hub_path,
        safe_serialization=args.safe_serialization,
    )


if __name__ == "__main__":
    main()
class DeepseekVLImageProcessor(BaseImageProcessor):
    r"""
    Constructs a DEEPSEEK_VL image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. When
            given, it is also used (scaled by 255) as the padding color; otherwise the padding color is
            `(127, 127, 127)`.
        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*):
            Whether to convert the image to RGB. When left unset, no conversion is performed unless `preprocess` is
            called with `do_convert_rgb=True`.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict[str, int]] = None,
        min_size: int = 14,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 384, "width": 384}
        size = get_size_dict(size, default_to_square=True)

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

        self.min_size = min_size
        # The padding color is derived from the *user-supplied* mean only; without one a mid-gray is used.
        if image_mean is None:
            self.background_color = (127, 127, 127)
        else:
            self.background_color = tuple([int(x * 255) for x in image_mean])

    def resize(
        self,
        image: np.ndarray,
        size: Union[dict[str, int], int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image so that its longest side equals `size`, then pad it to a square.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]` or `int`):
                Target size. Must describe a square (`height == width`).
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If `size` does not describe a square.
        """
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

        height, width = get_image_size(image, input_data_format)
        max_size = max(height, width)

        size = get_size_dict(size, default_to_square=True)
        if size["height"] != size["width"]:
            raise ValueError(
                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
            )
        size = size["height"]

        delta = size / max_size

        def _scaled(side: int) -> int:
            # The longest side maps to `size` exactly. Computing it as `int(side * delta)` can land on
            # `size - 1` because of floating-point rounding, which would yield an off-by-one output shape.
            target = size if side == max_size else int(side * delta)
            return max(target, self.min_size)

        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
        output_size_nonpadded = [_scaled(height), _scaled(width)]

        image = resize(
            image,
            size=output_size_nonpadded,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
        # `resize` returns the image in `data_format` when one is given, so the padding step must read it in
        # that layout rather than in the layout of the original input.
        resized_data_format = data_format if data_format is not None else input_data_format
        # Expand and pad the images to obtain a square image of dimensions `size x size`
        image = self.pad_to_square(
            image=image,
            background_color=self.background_color,
            input_data_format=resized_data_format,
        )
        return image

    @filter_out_non_signature_kwargs()
    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. Must describe a square: the longest edge of the image
                is resized to that size whilst preserving the aspect ratio, and the image is then padded to a square.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `BatchFeature`: A batch with the processed images under the `pixel_values` key.

        Raises:
            ValueError: If `images` contains an unsupported image type.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)
        images = make_flat_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        # PIL RGBA images are converted to RGB
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        if do_resize:
            images = [
                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
                for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
                for image in images
            ]

        if do_normalize:
            images = [
                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                for image in images
            ]

        # The output layout is applied last; every step above works in the input layout.
        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)

        return encoded_outputs

    def pad_to_square(
        self,
        image: np.ndarray,
        background_color: Union[int, tuple[int, int, int]] = 0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image. Only applied when the image is already square;
                a padded image is returned in the input layout.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.

        Raises:
            ValueError: If `background_color` is a sequence whose length differs from the number of channels.
        """
        height, width = get_image_size(image, input_data_format)
        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]

        if height == width:
            image = (
                to_channel_dimension_format(image, data_format, input_data_format)
                if data_format is not None
                else image
            )
            return image

        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            background_color = [background_color]
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        # Fill a square canvas with the background color per channel, then paste the image centered
        # along its shorter dimension.
        if input_data_format == ChannelDimension.FIRST:
            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[i, :, :] = color
            if width > height:
                start = (max_dim - height) // 2
                result[:, start : start + height, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, :, start : start + width] = image
        else:
            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[:, :, i] = color
            if width > height:
                start = (max_dim - height) // 2
                result[start : start + height, :, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, start : start + width, :] = image

        return result

    def postprocess(self):
        """Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
        raise AttributeError("Not needed for DeepseekVL")
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for DeepseekVL model's outputs that may also contain a past key/values (to speed up sequential decoding).
    """
)
class DeepseekVLBaseModelOutputWithPast(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`).

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for DeepseekVL causal language model (or autoregressive) outputs.
    """
)
class DeepseekVLCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`).

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None


class DeepseekVLAligner(nn.Module):
    """
    Two-layer MLP (Linear -> GELU -> Linear) that projects vision features from the vision hidden size to the
    text hidden size.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Input width comes from the vision backbone, output width from the text backbone.
        in_features = config.vision_config.hidden_size
        out_features = config.text_config.hidden_size

        self.linear1 = nn.Linear(in_features, out_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(out_features, out_features)

    def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor:
        """Project `vision_encodings` through both linear layers with a GELU in between."""
        x = self.linear1(vision_encodings)
        x = self.activation(x)
        x = self.linear2(x)
        return x


@auto_docstring
class DeepseekVLPreTrainedModel(PreTrainedModel):
    config: DeepseekVLConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
    _supports_flash_attn = True
    _supports_sdpa = True

    _supports_static_cache = True
    _supports_param_buffer_assignment = False

    def _init_weights(self, module):
        """Initialize the weights"""
        # Required only for Linear layer in DeepseekVLAligner
        if isinstance(module, nn.Linear):
            # Normal init with the text config's `initializer_range` as standard deviation; biases start at zero.
            module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()


@auto_docstring
class DeepseekVLModel(DeepseekVLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Vision backbone and the projection that maps its features to the text hidden size.
        self.vision_model = AutoModel.from_config(config.vision_config)
        self.aligner = DeepseekVLAligner(config)

        # Text backbone, built from the text sub-config.
        self.language_model = AutoModel.from_config(config=config.text_config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding module of the language model."""
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Replace the input embedding module of the language model."""
        self.language_model.set_input_embeddings(value)

    def get_image_features(self, pixel_values):
        """Run `pixel_values` through the vision model and project its `last_hidden_state` with the aligner."""
        image_embeds = self.vision_model(pixel_values)
        image_embeds = self.aligner(image_embeds.last_hidden_state)
        return image_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ):
        # Exactly one of `input_ids` / `inputs_embeds` must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            if input_ids is None:
                # Without token ids, image positions are found by comparing every embedding against the
                # embedding of the image token; a position matches only if all of its features are equal.
                image_attention_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                image_attention_mask = image_attention_mask.all(-1)
            else:
                image_attention_mask = input_ids == self.config.image_token_id

            # Broadcast the per-position mask over the feature dimension so it lines up with `inputs_embeds`.
            image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
            image_embeds = self.get_image_features(pixel_values)
            # Flatten the image features to rows of the text hidden size before scattering.
            image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            # Overwrite the masked (image token) positions with the projected image features.
            inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)

        lm_output = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        # `image_embeds` only exists when `pixel_values` was given, hence the guard on the last field.
        return DeepseekVLBaseModelOutputWithPast(
            last_hidden_state=lm_output.last_hidden_state,
            past_key_values=lm_output.past_key_values,
            hidden_states=lm_output.hidden_states,
            attentions=lm_output.attentions,
            image_hidden_states=image_embeds if pixel_values is not None else None,
        )


class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
    _supports_static_cache = True

    def __init__(self, config: DeepseekVLConfig):
        super().__init__(config)
        self.config = config
        self.model = DeepseekVLModel(config)
        # Bias-free projection from the text hidden size to the text vocabulary.
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding module of the underlying language model."""
        return self.model.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Replace the input embedding module of the underlying language model."""
        self.model.language_model.set_input_embeddings(value)

    def prepare_embeddings_for_image_generation(self) -> torch.Tensor:
        # Always raises: this model does not provide this method.
        raise AttributeError("Not needed for DeepseekVL")

    def set_decoder(self, decoder):
        """Replace the wrapped `DeepseekVLModel`."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped `DeepseekVLModel`."""
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        # `logits_to_keep` is not forwarded to the base model; it is applied to the hidden states below.
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        # An int keeps the last `logits_to_keep` positions (`slice(-0, None)` keeps all); a tensor is used as an index.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return DeepseekVLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        pixel_values=None,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- extra custom processing

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
        # Otherwise we need pixel values to be passed to model
        # NOTE(review): this indexes `cache_position` directly, so it assumes the caller always supplies it -- confirm.
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values

        return model_inputs
b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py new file mode 100644 index 0000000000..a5190a280b --- /dev/null +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -0,0 +1,326 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +from ...configuration_utils import PretrainedConfig +from ...image_processing_utils import BatchFeature +from ...image_utils import ( + ImageInput, + make_flat_list_of_images, +) +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + auto_docstring, + is_torch_available, + logging, +) +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel +from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast +from ..janus.image_processing_janus import JanusImageProcessor +from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel + + +if is_torch_available(): + import torch + import torch.nn as nn + +logger = logging.get_logger(__name__) + + +class DeepseekVLConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a + DeepseekVL model according to the specified arguments, defining the model architecture. 
Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVL + [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. + + Example: + + ```python + >>> from transformers import DeepseekVLConfig, DeepseekVLModel + + >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> configuration = DeepseekVLConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> model = DeepseekVLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + text_config: AutoConfig = None, + vision_config: AutoConfig = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. 
Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + +class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): + pass + + +class DeepseekVLCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast): + pass + + +class DeepseekVLAligner(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + in_features = config.vision_config.hidden_size + out_features = config.text_config.hidden_size + + self.linear1 = nn.Linear(in_features, out_features) + self.activation = nn.GELU() + self.linear2 = nn.Linear(out_features, out_features) + + def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: + x = self.linear1(vision_encodings) + x = self.activation(x) + x = self.linear2(x) + return x + + +class DeepseekVLPreTrainedModel(JanusPreTrainedModel): + _no_split_modules = ["LlamaDecoderLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + # Required only for Linear layer in DeepseekVLAligner + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DeepseekVLModel(JanusModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + 
self.gradient_checkpointing = False + # Initialize weights and apply final processing. + self.post_init() + + del self.vqmodel + del self.generation_embeddings + del self.generation_aligner + del self.generation_head + + +class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration): + def prepare_embeddings_for_image_generation(self): + raise AttributeError("Not needed for DeepseekVL") + + def decode_image_tokens(self): + raise AttributeError("Not needed for DeepseekVL") + + def generate(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLImageProcessor(JanusImageProcessor): + def postprocess(self): + raise AttributeError("Not needed for DeepseekVL") + + def unnormalize(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. 
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: ImageInput = None, + **kwargs: Unpack[DeepseekVLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework.
Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + images = make_flat_list_of_images(images) + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = [ + "DeepseekVLConfig", + "DeepseekVLPreTrainedModel", + "DeepseekVLModel", + "DeepseekVLForConditionalGeneration", + "DeepseekVLImageProcessor", + "DeepseekVLProcessor", +] diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py new file mode 100644 index 0000000000..244e642d7c --- /dev/null +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -0,0 +1,157 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput, make_flat_list_of_images +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. 
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: ImageInput = None, + **kwargs: Unpack[DeepseekVLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework.
Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + images = make_flat_list_of_images(images) + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["DeepseekVLProcessor"] diff --git a/src/transformers/models/deepseek_vl_hybrid/__init__.py b/src/transformers/models/deepseek_vl_hybrid/__init__.py new file mode 100644 index 0000000000..1836d196ac --- /dev/null +++ b/src/transformers/models/deepseek_vl_hybrid/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deepseek_vl_hybrid import * + from .image_processing_deepseek_vl_fast_hybrid import * + from .image_processing_deepseek_vl_hybrid import * + from .modeling_deepseek_vl_hybrid import * + from .processing_deepseek_vl_hybrid import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py new file mode 100644 index 0000000000..c3a5aa5260 --- /dev/null +++ b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py @@ -0,0 +1,108 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class DeepseekVLHybridConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLHybridModel`]. It is used to instantiate a + DeepseekVLHybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVLHybrid + [deepseek-community/deepseek-vl-7b-chat](https://huggingface.co/deepseek-community/deepseek-vl-7b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`): + The config object or dictionary of the high resolution vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. 
+ + Example: + + ```python + >>> from transformers import DeepseekVLHybridConfig, DeepseekVLHybridModel + + >>> # Initializing a DeepseekVLHybrid deepseek-community/deepseek-vl-7b-chat style configuration + >>> configuration = DeepseekVLHybridConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-7b-chat style configuration + >>> model = DeepseekVLHybridModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl_hybrid" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig} + + def __init__( + self, + text_config: AutoConfig = None, + vision_config: AutoConfig = None, + high_res_vision_config: AutoConfig = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + if high_res_vision_config is None: + high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. 
Initializing the `SamVisionConfig` with default values.") + + if isinstance(high_res_vision_config, dict): + high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") + high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + + self.high_res_vision_config = high_res_vision_config + + +__all__ = ["DeepseekVLHybridConfig"] diff --git a/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py b/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py new file mode 100644 index 0000000000..9f377a53c8 --- /dev/null +++ b/src/transformers/models/deepseek_vl_hybrid/convert_deepseek_vl_hybrid_weights_to_hf.py @@ -0,0 +1,394 @@ +# coding=utf-8 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import gc +import json +import os +from typing import Optional + +import regex as re +import torch +from accelerate import init_empty_weights +from huggingface_hub import snapshot_download +from huggingface_hub.errors import HFValidationError +from safetensors.torch import load_file + +from transformers import ( + AutoTokenizer, + DeepseekVLHybridConfig, + DeepseekVLHybridForConditionalGeneration, + DeepseekVLHybridImageProcessor, + DeepseekVLHybridProcessor, +) +from transformers.image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + PILImageResampling, +) + + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + # # Sam (High Resolution) + r"vision_model.vision_tower_high.vision_tower.pos_embed": r"model.high_res_vision_model.vision_encoder.pos_embed", + r"vision_model.vision_tower_high.vision_tower.patch_embed.proj.(weight|bias)": r"model.high_res_vision_model.vision_encoder.patch_embed.projection.\1", + r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.layer_norm\2.\3", + r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.rel_pos_(h|w)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.rel_pos_\2", + r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.qkv.\2", + r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.attn.proj.\2", + r"vision_model.vision_tower_high.vision_tower.blocks.(\d+).mlp.lin(\d+).(weight|bias)": r"model.high_res_vision_model.vision_encoder.layers.\1.mlp.lin\2.\3", + r"vision_model.vision_tower_high.vision_tower.neck.0.weight": r"model.high_res_vision_model.vision_encoder.neck.conv1.weight", + r"vision_model.vision_tower_high.vision_tower.neck.1.(weight|bias)": 
r"model.high_res_vision_model.vision_encoder.neck.layer_norm1.\1", + r"vision_model.vision_tower_high.vision_tower.neck.2.weight": r"model.high_res_vision_model.vision_encoder.neck.conv2.weight", + r"vision_model.vision_tower_high.vision_tower.neck.3.(weight|bias)": r"model.high_res_vision_model.vision_encoder.neck.layer_norm2.\1", + r"vision_model.vision_tower_high.vision_tower.neck_hd.0.weight": r"model.high_res_vision_neck.conv1.weight", + r"vision_model.vision_tower_high.vision_tower.neck_hd.1.(weight|bias)": r"model.high_res_vision_neck.layer_norm1.\1", + r"vision_model.vision_tower_high.vision_tower.neck_hd.2.weight": r"model.high_res_vision_neck.conv2.weight", + r"vision_model.vision_tower_high.vision_tower.neck_hd.3.(weight|bias)": r"model.high_res_vision_neck.layer_norm2.\1", + r"vision_model.vision_tower_high.vision_tower.downsamples.0.weight": r"model.high_res_vision_proj.conv1.weight", + r"vision_model.vision_tower_high.vision_tower.downsamples.1.weight": r"model.high_res_vision_proj.conv2.weight", + r"vision_model.vision_tower_high.vision_tower.hd_alpha_downsamples": r"model.high_res_vision_alpha", + + # Siglip (Low Resolution) + r"vision_model.vision_tower_low.vision_tower.pos_embed": r"model.vision_model.vision_model.embeddings.position_embedding.weight", + r"vision_model.vision_tower_low.vision_tower.patch_embed.proj.(weight|bias)": r"model.vision_model.vision_model.embeddings.patch_embedding.\1", + r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).attn.qkv.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.(q|k|v)_proj.\2", + r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).attn.proj.(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.self_attn.out_proj.\2", + r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).norm(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.layer_norm\2.\3", + 
r"vision_model.vision_tower_low.vision_tower.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.encoder.layers.\1.mlp.fc\2.\3", + r"vision_model.vision_tower_low.vision_tower.norm.(weight|bias)": r"model.vision_model.vision_model.post_layernorm.\1", + r"vision_model.vision_tower_low.vision_tower.attn_pool.latent": r"model.vision_model.vision_model.head.probe", + r"vision_model.vision_tower_low.vision_tower.attn_pool.proj.(weight|bias)": r"model.vision_model.vision_model.head.attention.out_proj.\1", + r"vision_model.vision_tower_low.vision_tower.attn_pool.norm.(weight|bias)": r"model.vision_model.vision_model.head.layernorm.\1", + r"vision_model.vision_tower_low.vision_tower.attn_pool.mlp.fc(\d+).(weight|bias)": r"model.vision_model.vision_model.head.mlp.fc\1.\2", + + # Vision Projection + r"aligner.layers.1.(weight|bias)": r"model.aligner.proj.\1", + r"aligner.low_up_proj.(weight|bias)": r"model.aligner.vision_proj.\1", + r"aligner.high_up_proj.(weight|bias)": r"model.aligner.high_res_vision_proj.\1", + + # Llama (Text Model) + r"language_model.model.(\w+)": r"model.language_model.\1", + r"language_model.lm_head.(weight|bias)": r"lm_head.\1", +} +# fmt: on + +# Adopted from https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/utils/conversation.py#L80-L91 +CHAT_TEMPLATE = ( + # Define separators and initialize counter + "{% set seps = ['\n\n', '<\uff5cend\u2581of\u2581sentence\uff5c>'] %}" + "{% set i = 0 %}" + # Start with default system prompt + "You are a helpful language and vision assistant. 
" + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.\n\n" + # Iterate through messages + "{% for message in messages %}" + # Identify user or assistant role + "{% if message['role']|lower == 'user' %}" + "User: " + "{% elif message['role']|lower == 'assistant' %}" + "Assistant:{% if not (loop.last and not add_generation_prompt and message['content'][0]['type']=='text' and message['content'][0]['text']=='') %} {% endif %}" + "{% else %}" + "{{ message['role'].capitalize() }}: " + "{% endif %}" + # Iterate through message content (text/images) + "{% for content in message['content'] %}" + # If content is an image, replace with placeholder + "{% if content['type'] == 'image' %}" + "" + # If content is text, handle formatting + "{% elif content['type'] == 'text' %}" + "{% set text = content['text'] %}" + # Strip whitespace for first and last text blocks + "{% if loop.first %}{% set text = text.lstrip() %}{% endif %}" + "{% if loop.last %}{% set text = text.rstrip() %}{% endif %}" + # If previous content was text, add space + "{% if not loop.first and message['content'][loop.index0-1]['type'] == 'text' %}" + "{{ ' ' + text }}" + "{% else %}" + "{{ text }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" # End message content loop + # Add separators between messages + "{% if not loop.last or add_generation_prompt %}" + "{% if message['role']|lower == 'user' %}" + "{{ seps[0] }}" + "{% else %}" + "{{ seps[1] }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" # End messages loop + # Add final Assistant prompt if required + "{% if add_generation_prompt %}Assistant:{% endif %}" +) + + +def convert_old_keys_to_new_keys(state_dict_keys: dict): + output_dict = {} + + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line 
def convert_old_keys_to_new_keys(state_dict_keys: list[str]) -> dict[str, str]:
    """
    Map every original checkpoint key to its converted name.

    The keys are joined into a single newline-separated string so that each regex of
    `ORIGINAL_TO_CONVERTED_KEY_MAPPING` is applied once to the whole key set.

    Args:
        state_dict_keys: Names of the parameters in the original checkpoint. Any iterable of
            strings works (iterating a dict yields its keys).

    Returns:
        A dict from original key to converted key. Empty input gives an empty dict.
    """
    state_dict_keys = list(state_dict_keys)
    if not state_dict_keys:
        # "\n".join([]) is "", which would otherwise produce a bogus {"": ""} entry.
        return {}

    old_text = "\n".join(state_dict_keys)
    new_text = old_text
    for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
        # A `None` replacement means "delete the matched text" (leaves an empty line).
        new_text = re.sub(pattern, "" if replacement is None else replacement, new_text)

    return dict(zip(old_text.split("\n"), new_text.split("\n")))


def get_qkv_state_dict(key, parameter):
    """
    Split a fused QKV parameter into one entry per projection.

    A converted key that still carries its alternation placeholder, e.g.
        xxxx.(q|k|v)_proj.xxxx    (m, n)
    is expanded into
        xxxx.q_proj.xxxx          (m//3, n)
        xxxx.k_proj.xxxx          (m//3, n)
        xxxx.v_proj.xxxx          (m//3, n)

    Args:
        key: Converted key containing a single `(a|b|c)` placeholder.
        parameter: Fused tensor whose first dimension stacks the projections in placeholder order.

    Returns:
        A dict with one `{expanded_key: tensor_chunk}` entry per name in the placeholder.

    Raises:
        ValueError: If `key` has no placeholder, or the first dimension of `parameter` cannot be
            divided evenly between the placeholder names.
    """
    match = re.search(r"(\(.*?\))", key)
    if match is None:
        raise ValueError(f"Expected a '(a|b|c)' placeholder in key {key!r}")
    placeholder = match.group(1)  # e.g. "(q|k|v)"
    replacement_keys = placeholder[1:-1].split("|")  # e.g. ["q", "k", "v"]

    num_rows = parameter.size(0)
    if num_rows % len(replacement_keys) != 0:
        # torch.split would return an extra, smaller chunk that zip() silently drops.
        raise ValueError(
            f"Cannot split {num_rows} rows of {key!r} evenly into {len(replacement_keys)} projections"
        )
    replacement_vals = torch.split(parameter, num_rows // len(replacement_keys), dim=0)

    return {
        key.replace(placeholder, replacement_key): replacement_val
        for replacement_key, replacement_val in zip(replacement_keys, replacement_vals)
    }


def update_state_dict(old_state_dict):
    """
    Build the converted state dict from the original one.

    Keys are renamed through `convert_old_keys_to_new_keys`. Fused `qkv` tensors of the low resolution
    tower are split per projection, and its position embedding loses the leading singleton dimension.
    Entries are popped from `old_state_dict`, so the input dict is emptied as they are moved over.

    Args:
        old_state_dict: Mapping from original parameter name to tensor. Mutated in place.

    Returns:
        A new dict mapping converted parameter names to tensors.
    """
    all_keys = list(old_state_dict.keys())
    new_keys = convert_old_keys_to_new_keys(all_keys)

    state_dict = {}
    for key in all_keys:
        new_key = new_keys[key]
        current_parameter = old_state_dict.pop(key)
        is_high_res = "vision_tower_high" in key

        if "qkv" in key and not is_high_res:
            state_dict.update(get_qkv_state_dict(new_key, current_parameter))
        elif "pos_embed" in key and not is_high_res:
            # timm implementation of siglip creates this param of size [1, 576, 1024]
            # transformers implementation of siglip creates this param of size [576, 1024]
            state_dict[new_key] = current_parameter.squeeze(0)
        else:
            state_dict[new_key] = current_parameter

    return state_dict


def load_model_state_dict(input_path: str) -> dict:
    """
    Load model state dict, handling both single and sharded files.

    Args:
        input_path: Directory holding either `model.safetensors.index.json` plus its shards, or a
            single `model.safetensors` file.

    Returns:
        The merged state dict, with tensors loaded on CPU.

    Raises:
        ValueError: If neither the shard index nor the single file exists in `input_path`.
    """
    index_path = os.path.join(input_path, "model.safetensors.index.json")
    single_file_path = os.path.join(input_path, "model.safetensors")

    # Check if we have a sharded model
    if os.path.exists(index_path):
        print("Loading sharded model...")
        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)

        # Get unique shard files and load each one only once
        state_dict = {}
        for shard_file in sorted(set(index["weight_map"].values())):
            print(f"Loading shard {shard_file}...")
            shard_path = os.path.join(input_path, shard_file)
            # Same explicit device as the single-file branch below.
            state_dict.update(load_file(shard_path, device="cpu"))
        return state_dict

    # Single file model
    if os.path.exists(single_file_path):
        print("Loading single file model...")
        return load_file(single_file_path, device="cpu")

    raise ValueError(f"No model files found in {input_path}")


def convert_model(
    hf_repo_id: str,
    output_dir: Optional[str] = None,
    output_hub_path: Optional[str] = None,
    safe_serialization: bool = True,
):
    """
    Convert an original DeepSeek-VL (hybrid) checkpoint into the transformers format.

    Builds the config, the processor and the model, loads the renamed weights, and then saves and/or
    pushes the result. When saved locally, the model is reloaded once as a sanity check.

    Args:
        hf_repo_id: Hub repository ID of the original weights, or a local directory.
        output_dir: Local directory to write the config, processor and model to. Skipped when unset.
        output_hub_path: Hub repository ID to push the processor and model to. Skipped when unset.
        safe_serialization: Whether to save using `safetensors`.

    Raises:
        ValueError: If converted weights are missing for any model parameter, or no weight file is found.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        input_path = snapshot_download(hf_repo_id)
    except HFValidationError:
        # If the input path is not a HF repo ID, assume it's a local path
        input_path = hf_repo_id

    # ------------------------------------------------------------
    # Create and save config
    # ------------------------------------------------------------

    config = DeepseekVLHybridConfig(
        text_config={
            "hidden_size": 4096,
            "intermediate_size": 11008,
            "max_position_embeddings": 16384,
            "num_attention_heads": 32,
            "num_hidden_layers": 30,
            "vocab_size": 102400,
        },
        vision_config={
            "hidden_size": 1024,
            "intermediate_size": 4096,
            "image_size": 384,
            "patch_size": 16,
            "hidden_act": "gelu",
            "vision_use_head": False,
            "num_attention_heads": 16,
            "num_hidden_layers": 24,
        },
        high_res_vision_config={
            "hidden_size": 768,
            "intermediate_size": 3072,
            "image_size": 1024,
            "patch_size": 16,
            "num_attention_heads": 12,
            "num_hidden_layers": 12,
        },
    )

    # save config
    if output_dir:
        config.save_pretrained(output_dir)
        print("Model config saved successfully...")

    # ------------------------------------------------------------
    # Convert processor
    # ------------------------------------------------------------

    image_processor = DeepseekVLHybridImageProcessor(
        image_mean=IMAGENET_STANDARD_MEAN,
        image_std=IMAGENET_STANDARD_STD,
        high_res_image_mean=OPENAI_CLIP_MEAN,
        high_res_image_std=OPENAI_CLIP_STD,
        resample=PILImageResampling.BILINEAR,
    )

    # NOTE(review): `image_token` is an empty string here; this looks like the image placeholder tag
    # was lost somewhere upstream of this file - confirm against the original DeepSeek-VL processor.
    tokenizer = AutoTokenizer.from_pretrained(
        input_path,
        extra_special_tokens={
            "pad_token": "<|end▁of▁sentence|>",
            "image_token": "",
        },
    )

    processor = DeepseekVLHybridProcessor(
        image_processor=image_processor,
        tokenizer=tokenizer,
        chat_template=CHAT_TEMPLATE,
    )

    if output_dir:
        print(f"Saving processor to {output_dir}...")
        processor.save_pretrained(output_dir)
    if output_hub_path:
        print(f"Pushing processor to hub at {output_hub_path}...")
        processor.push_to_hub(output_hub_path)

    # ------------------------------------------------------------
    # Convert weights
    # ------------------------------------------------------------

    print("Creating empty model...")
    with init_empty_weights():
        model = DeepseekVLHybridForConditionalGeneration(config)

    # Load and convert state dict
    print("Loading state dict...")
    state_dict = load_model_state_dict(input_path)
    state_dict = update_state_dict(state_dict)

    # Load converted state dict
    print("Loading converted weights into model...")
    info = model.load_state_dict(state_dict, strict=False, assign=True)
    if len(info.missing_keys) > 0:
        raise ValueError(f"Missing keys: {info.missing_keys}")
    if len(info.unexpected_keys) > 0:
        # strict=False drops these silently; surface them so dropped weights are noticed.
        print(f"Unexpected keys (ignored): {info.unexpected_keys}")

    # Tie weights before any device mapping
    print("Tying weights...")
    model.tie_weights()

    # Save the model
    if output_dir:
        print(f"Saving model to {output_dir}...")
        model.save_pretrained(output_dir, safe_serialization=safe_serialization)
    if output_hub_path:
        print(f"Pushing model to hub at {output_hub_path}...")
        model.push_to_hub(output_hub_path, safe_serialization=safe_serialization)

    del state_dict, model
    gc.collect()

    # Validate the saved model if saved locally
    if output_dir:
        print("Reloading the local model to check if it's saved correctly...")
        DeepseekVLHybridForConditionalGeneration.from_pretrained(output_dir, device_map="auto")
        print("Local model reloaded successfully.")


def _str_to_bool(value):
    """Parse a command-line boolean; `type=bool` would treat every non-empty string as `True`."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was with `bool("")`.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def parse_args(argv=None):
    """
    Parse the command-line arguments of the conversion script.

    Args:
        argv: Argument list to parse; `None` makes argparse read `sys.argv[1:]`.

    Returns:
        The parsed `argparse.Namespace`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hf_repo_id",
        default="deepseek-ai/deepseek-vl-7b-chat",
        help="Location of official weights from DeepseekAI on HF",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        help="Location to write the converted model and processor",
    )
    parser.add_argument(
        "--output_hub_path",
        default=None,
        help="Repository ID to push model to hub (e.g. 'username/model-name')",
    )
    parser.add_argument(
        "--safe_serialization",
        default=True,
        type=_str_to_bool,
        help="Whether or not to save using `safetensors`.",
    )
    return parser.parse_args(argv)


def main():
    """Entry point: parse the command line and run the conversion."""
    args = parse_args()

    convert_model(
        hf_repo_id=args.hf_repo_id,
        output_dir=args.output_dir,
        output_hub_path=args.output_hub_path,
        safe_serialization=args.safe_serialization,
    )


if __name__ == "__main__":
    main()
class DeepseekVLHybridImageProcessor(BaseImageProcessor):
    r"""
    Constructs a DEEPSEEK_VL_HYBRID image processor.

    Each input image yields two outputs: a low resolution view (`pixel_values`) and a high resolution view
    (`high_res_pixel_values`), each with its own target size, resampling filter and normalization statistics.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
            Size of the high resolution output image after resizing. Can be overridden by the `high_res_size`
            parameter in the `preprocess` method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the high resolution image. Only has an effect if `do_resize` is set
            to `True`. Can be overridden by the `high_res_resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `high_res_image_mean` parameter in the
            `preprocess` method.
        high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the
            length of the number of channels in the image. Can be overridden by the `high_res_image_std` parameter in
            the `preprocess` method.
        do_convert_rgb (`bool`, *optional*):
            Whether to convert the image to RGB. When left unset, images are not converted unless `preprocess` is
            called with `do_convert_rgb=True`.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict[str, int]] = None,
        high_res_size: Optional[dict[str, int]] = None,
        min_size: int = 14,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        high_res_resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        high_res_image_mean: Optional[Union[float, list[float]]] = None,
        high_res_image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        size = size if size is not None else {"height": 384, "width": 384}
        high_res_size = high_res_size if high_res_size is not None else {"height": 1024, "width": 1024}

        self.do_resize = do_resize
        self.size = get_size_dict(size, default_to_square=True)
        self.high_res_size = get_size_dict(high_res_size, default_to_square=True)
        self.min_size = min_size
        self.resample = resample
        self.high_res_resample = high_res_resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else OPENAI_CLIP_MEAN
        self.high_res_image_std = high_res_image_std if high_res_image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

        # Padding colors are derived from the *explicitly passed* means. They are kept in two separate
        # attributes so the high resolution value no longer overwrites the low resolution one.
        self.background_color = self._mean_to_background_color(image_mean)
        self.high_res_background_color = self._mean_to_background_color(high_res_image_mean)

    @staticmethod
    def _mean_to_background_color(mean):
        """Return the 0-255 padding color for a per-channel mean, or mid-gray when no mean was given."""
        if mean is None:
            return (127, 127, 127)
        return tuple(int(x * 255) for x in mean)

    def resize(
        self,
        image: np.ndarray,
        size: Union[dict[str, int], int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        background_color: Optional[Union[int, tuple[int, int, int]]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image so its longest side matches `size`, then pad it to a square.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]` or `int`):
                Target size. Height and width must be equal.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            background_color (`int` or `tuple[int, int, int]`, *optional*):
                Color used to pad the resized image to a square. Defaults to `self.background_color`.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If the requested height and width differ.
        """
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

        height, width = get_image_size(image, input_data_format)
        max_size = max(height, width)

        size = get_size_dict(size, default_to_square=True)
        if size["height"] != size["width"]:
            raise ValueError(
                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
            )
        size = size["height"]

        delta = size / max_size
        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
        output_size_nonpadded = [
            max(int(height * delta), self.min_size),
            max(int(width * delta), self.min_size),
        ]

        image = resize(
            image,
            size=output_size_nonpadded,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
        # Expand and pad the images to obtain a square image of dimensions `size x size`
        if background_color is None:
            background_color = self.background_color
        image = self.pad_to_square(
            image=image,
            background_color=background_color,
            input_data_format=input_data_format,
        )
        return image

    @filter_out_non_signature_kwargs()
    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        high_res_size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        high_res_resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        high_res_image_mean: Optional[Union[float, list[float]]] = None,
        high_res_image_std: Optional[Union[float, list[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        do_convert_rgb: Optional[bool] = None,
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                resizing.
            high_res_size (`dict[str, int]`, *optional*, defaults to `self.high_res_size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the high resolution output
                image after resizing.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
                an effect if `do_resize` is set to `True`.
            high_res_resample (`PILImageResampling` filter, *optional*, defaults to `self.high_res_resample`):
                `PILImageResampling` filter to use if resizing the high resolution image e.g.
                `PILImageResampling.BICUBIC`. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `self.high_res_image_mean`):
                High resolution image mean to use if `do_normalize` is set to `True`.
            high_res_image_std (`float` or `list[float]`, *optional*, defaults to `self.high_res_image_std`):
                High resolution image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.

        Returns:
            `BatchFeature`: A batch with the keys `pixel_values` (low resolution images) and
            `high_res_pixel_values` (high resolution images).

        Raises:
            ValueError: If `images` contains an unsupported image type.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        high_res_resample = high_res_resample if high_res_resample is not None else self.high_res_resample
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else self.high_res_image_mean
        high_res_image_std = high_res_image_std if high_res_image_std is not None else self.high_res_image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        size = size if size is not None else self.size
        size_dict = get_size_dict(size)
        high_res_size = high_res_size if high_res_size is not None else self.high_res_size
        high_res_size_dict = get_size_dict(high_res_size)

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        all_images = []
        all_high_res_images = []
        for image in images:
            # high_res_image: resize (high) -> rescale -> normalize (high)
            # low_res_image:  resize (high) -> rescale -> resize (low) -> normalize (low)
            high_res_image = image

            if do_resize:
                high_res_image = self.resize(
                    image=high_res_image,
                    size=high_res_size_dict,
                    resample=high_res_resample,
                    input_data_format=input_data_format,
                    background_color=self.high_res_background_color,
                )
                # The low resolution view is derived from the already square high resolution image.
                image = self.resize(
                    image=high_res_image,
                    size=size_dict,
                    resample=resample,
                    input_data_format=input_data_format,
                    background_color=self.background_color,
                )

            if do_rescale:
                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
                high_res_image = self.rescale(
                    image=high_res_image, scale=rescale_factor, input_data_format=input_data_format
                )

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )
                high_res_image = self.normalize(
                    image=high_res_image,
                    mean=high_res_image_mean,
                    std=high_res_image_std,
                    input_data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            high_res_image = to_channel_dimension_format(
                high_res_image, data_format, input_channel_dim=input_data_format
            )

            all_images.append(image)
            all_high_res_images.append(high_res_image)

        data = {"pixel_values": all_images, "high_res_pixel_values": all_high_res_images}
        return BatchFeature(data=data, tensor_type=return_tensors)

    def pad_to_square(
        self,
        image: np.ndarray,
        background_color: Union[int, tuple[int, int, int]] = 0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers for multi-channel images. If passed as an integer
                in multi-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image. Only applied when the image is already square.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.

        Raises:
            ValueError: If a non-integer `background_color` does not have one entry per channel.
        """
        height, width = get_image_size(image, input_data_format)
        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]

        if height == width:
            # Nothing to pad; only honor a requested channel layout change.
            if data_format is not None:
                image = to_channel_dimension_format(image, data_format, input_data_format)
            return image

        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            # Only the first channel is filled; the remaining channels keep the zero initialization.
            background_color = [background_color]
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        # The shorter side is centered on the padded canvas.
        if input_data_format == ChannelDimension.FIRST:
            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[i, :, :] = color
            if width > height:
                start = (max_dim - height) // 2
                result[:, start : start + height, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, :, start : start + width] = image
        else:
            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[:, :, i] = color
            if width > height:
                start = (max_dim - height) // 2
                result[start : start + height, :, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, start : start + width, :] = image

        return result

    def postprocess(self):
        """Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
        raise AttributeError("Not needed for DeepseekVLHybrid")
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for DeepseekVLHybrid model's outputs that may also contain a past key/values (to speed up sequential decoding).
    """
)
class DeepseekVLHybridBaseModelOutputWithPast(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
        num_images, sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    # NOTE(review): the docstring types this field as `Cache` while the annotation is a nested tuple - confirm
    # which one callers actually receive.
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    # `hidden_states` and `attentions` are not described in the docstring above.
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for DeepseekVLHybrid causal language model (or autoregressive) outputs.
    """
)
class DeepseekVLHybridCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
        num_images, sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    # NOTE(review): annotated as a list here but as a nested tuple on the base-model output, and documented as
    # `Cache` in both docstrings - confirm the intended type.
    past_key_values: Optional[list[torch.FloatTensor]] = None
    # `hidden_states` and `attentions` are not described in the docstring above.
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {self.data_format}") + self.normalized_shape = (normalized_shape,) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.data_format == "channels_last": + x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + input_dtype = x.dtype + x = x.float() + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = x.to(dtype=input_dtype) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class DeepseekVLSamVisionNeck(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False) + self.layer_norm1 = DeepseekVLHybridLayerNorm(config.output_channels, data_format="channels_first") + self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) + self.layer_norm2 = DeepseekVLHybridLayerNorm(config.output_channels, data_format="channels_first") + + def forward(self, hidden_states): + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.conv1(hidden_states) + hidden_states = self.layer_norm1(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) + return hidden_states + + +class DeepseekVLSamVisionProj(nn.Module): + def __init__(self, config, output_size: int = 24): + super().__init__() + self.config = config + self.output_size = output_size + + self.conv1 = 
nn.Conv2d( + config.output_channels, config.output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv2 = nn.Conv2d( + config.output_channels * 2, config.output_channels * 4, kernel_size=3, stride=2, padding=1, bias=False + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + # interpolate Sam encodings to match Siglip encodings + features = torch.nn.functional.interpolate( + features, + size=(4 * self.output_size, 4 * self.output_size), + mode="bilinear", + align_corners=False, + ) + features = self.conv1(features) + features = self.conv2(features) + return features + + +class DeepseekVLHybridAligner(nn.Module): + def __init__(self, config: DeepseekVLHybridConfig): + super().__init__() + + in_channels = config.vision_config.hidden_size + high_res_in_channels = config.high_res_vision_config.output_channels * 4 + out_channels = config.text_config.hidden_size + + self.vision_proj = nn.Linear(in_channels, out_channels // 2) + self.high_res_vision_proj = nn.Linear(high_res_in_channels, out_channels // 2) + + self.act = nn.GELU() + self.proj = nn.Linear(out_channels, out_channels) + + def forward( + self, + vision_encodings: torch.Tensor, + high_res_vision_encodings: torch.Tensor, + ) -> torch.Tensor: + vision_encodings = self.vision_proj(vision_encodings) + high_res_vision_encodings = self.high_res_vision_proj(high_res_vision_encodings) + + encodings = torch.concat([high_res_vision_encodings, vision_encodings], dim=-1) + encodings = self.act(encodings) + encodings = self.proj(encodings) + + return encodings + + +@auto_docstring +class DeepseekVLHybridPreTrainedModel(PreTrainedModel): + config: DeepseekVLHybridConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values", "causal_mask"] + _supports_flash_attn = True + _supports_sdpa = True + + _supports_static_cache = True + _supports_param_buffer_assignment = False + + 
def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridLayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridModel): + module.high_res_vision_alpha.data.zero_() + + +DEEPSEEK_VL_COMMON_CUSTOM_ARGS = r""" + high_res_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. +""" + + +@auto_docstring +class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.output_size = config.vision_config.image_size // config.vision_config.patch_size + self.global_attn_index = config.high_res_vision_config.global_attn_indexes[0] + + self.high_res_vision_model = AutoModel.from_config(config.high_res_vision_config) + self.high_res_vision_neck = DeepseekVLSamVisionNeck(config.high_res_vision_config) + self.high_res_vision_proj = DeepseekVLSamVisionProj( + config.high_res_vision_config, output_size=self.output_size + ) + self.high_res_vision_alpha = nn.Parameter(torch.zeros(1)) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLHybridAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing. 
+ self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_image_features(self, pixel_values, high_res_pixel_values): + vision_encodings = self.get_low_res_image_features(pixel_values) + high_res_vision_encodings = self.get_high_res_image_features(high_res_pixel_values) + images_embeds = self.aligner(vision_encodings, high_res_vision_encodings) + return images_embeds + + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + high_res_pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and high_res_pixel_values is None: + raise ValueError("Both pixel_values and high_res_pixel_values should be specified at the same time") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + if input_ids is None: + image_attention_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + image_attention_mask = image_attention_mask.all(-1) + else: + image_attention_mask = input_ids == self.config.image_token_id + + image_attention_mask = 
image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + image_embeds = self.get_image_features(pixel_values, high_res_pixel_values) + image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features) + + lm_output = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + return DeepseekVLHybridBaseModelOutputWithPast( + last_hidden_state=lm_output.last_hidden_state, + past_key_values=lm_output.past_key_values, + hidden_states=lm_output.hidden_states, + attentions=lm_output.attentions, + image_hidden_states=image_embeds if pixel_values is not None else None, + ) + + def get_low_res_image_features(self, pixel_values): + output = self.vision_model(pixel_values) + output = output[0] + return output + + def get_high_res_image_features(self, pixel_values): + output = self.high_res_vision_model( + pixel_values=pixel_values, + output_hidden_states=True, + return_dict=True, + ) + last_hidden_state = output.last_hidden_state + last_hidden_state = self.high_res_vision_proj(last_hidden_state) + + hidden_states = output.hidden_states + global_hidden_state = hidden_states[self.global_attn_index + 1] # +1 for embedding layer + global_hidden_state = self.high_res_vision_neck(global_hidden_state) + global_hidden_state = self.high_res_vision_proj(global_hidden_state) + + output = last_hidden_state + global_hidden_state * self.high_res_vision_alpha + + # batch_size, hidden_size, height, width -> batch_size, seq_len, hidden_size + output = output.permute(0, 2, 3, 1) + output = output.reshape(output.shape[0], -1, output.shape[-1]) + + return output + + +class 
DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"] + _supports_static_cache = True + + def __init__(self, config: DeepseekVLHybridConfig): + super().__init__(config) + self.config = config + self.model = DeepseekVLHybridModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + # Initialize weights and apply final processing. + self.post_init() + + def get_input_embeddings(self): + return self.model.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.language_model.set_input_embeddings(value) + + def prepare_embeddings_for_image_generation(self) -> torch.Tensor: + raise AttributeError("Not needed for DeepseekVLHybrid") + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + high_res_pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + high_res_pixel_values=high_res_pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return DeepseekVLHybridCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + high_res_pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["high_res_pixel_values"] = 
high_res_pixel_values + + return model_inputs + + +__all__ = ["DeepseekVLHybridPreTrainedModel", "DeepseekVLHybridModel", "DeepseekVLHybridForConditionalGeneration"] diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py new file mode 100644 index 0000000000..aa0a4f87ba --- /dev/null +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -0,0 +1,777 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...cache_utils import Cache +from ...image_processing_utils_fast import ( + BatchFeature, + get_size_dict, +) +from ...image_transforms import convert_to_rgb, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...processing_utils import Unpack +from ...tokenization_utils_base import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + TensorType, + TransformersKwargs, + auto_docstring, + can_return_tuple, + filter_out_non_signature_kwargs, + logging, +) +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel +from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig +from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor +from ..deepseek_vl.modeling_deepseek_vl import ( + DeepseekVLForConditionalGeneration, + DeepseekVLModel, + DeepseekVLPreTrainedModel, +) +from ..deepseek_vl.processing_deepseek_vl import DeepseekVLProcessor, DeepseekVLProcessorKwargs +from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast +from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck + + +logger = logging.get_logger(__name__) + + +DEEPSEEK_VL_COMMON_CUSTOM_ARGS = r""" + high_res_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. +""" + + +class DeepseekVLHybridConfig(DeepseekVLConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLHybridModel`]. 
It is used to instantiate a + DeepseekVLHybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVLHybrid + [deepseek-community/deepseek-vl-7b-chat](https://huggingface.co/deepseek-community/deepseek-vl-7b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`): + The config object or dictionary of the high resolution vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. 
+ + Example: + + ```python + >>> from transformers import DeepseekVLHybridConfig, DeepseekVLHybridModel + + >>> # Initializing a DeepseekVLHybrid deepseek-community/deepseek-vl-7b-chat style configuration + >>> configuration = DeepseekVLHybridConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-7b-chat style configuration + >>> model = DeepseekVLHybridModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl_hybrid" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig} + + def __init__( + self, + text_config: AutoConfig = None, + vision_config: AutoConfig = None, + high_res_vision_config: AutoConfig = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__( + text_config=text_config, + vision_config=vision_config, + image_token_id=image_token_id, + **kwargs, + ) + + if high_res_vision_config is None: + high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. 
Initializing the `SamVisionConfig` with default values.") + + if isinstance(high_res_vision_config, dict): + high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") + high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + + self.high_res_vision_config = high_res_vision_config + + +class DeepseekVLHybridBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): + pass + + +class DeepseekVLHybridCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast): + pass + + +class DeepseekVLHybridLayerNorm(SamLayerNorm): + pass + + +class DeepseekVLSamVisionNeck(SamVisionNeck): + def __init__(self, config): + super().__init__(config) + + +class DeepseekVLSamVisionProj(nn.Module): + def __init__(self, config, output_size: int = 24): + super().__init__() + self.config = config + self.output_size = output_size + + self.conv1 = nn.Conv2d( + config.output_channels, config.output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv2 = nn.Conv2d( + config.output_channels * 2, config.output_channels * 4, kernel_size=3, stride=2, padding=1, bias=False + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + # interpolate Sam encodings to match Siglip encodings + features = torch.nn.functional.interpolate( + features, + size=(4 * self.output_size, 4 * self.output_size), + mode="bilinear", + align_corners=False, + ) + features = self.conv1(features) + features = self.conv2(features) + return features + + +class DeepseekVLHybridAligner(nn.Module): + def __init__(self, config: DeepseekVLHybridConfig): + super().__init__() + + in_channels = config.vision_config.hidden_size + high_res_in_channels = config.high_res_vision_config.output_channels * 4 + out_channels = config.text_config.hidden_size + + self.vision_proj = nn.Linear(in_channels, out_channels // 2) + self.high_res_vision_proj = nn.Linear(high_res_in_channels, out_channels // 2) + + self.act = nn.GELU() + 
self.proj = nn.Linear(out_channels, out_channels) + + def forward( + self, + vision_encodings: torch.Tensor, + high_res_vision_encodings: torch.Tensor, + ) -> torch.Tensor: + vision_encodings = self.vision_proj(vision_encodings) + high_res_vision_encodings = self.high_res_vision_proj(high_res_vision_encodings) + + encodings = torch.concat([high_res_vision_encodings, vision_encodings], dim=-1) + encodings = self.act(encodings) + encodings = self.proj(encodings) + + return encodings + + +class DeepseekVLHybridPreTrainedModel(DeepseekVLPreTrainedModel): + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridLayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridModel): + module.high_res_vision_alpha.data.zero_() + + +class DeepseekVLHybridModel(DeepseekVLModel): + def __init__(self, config): + self.output_size = config.vision_config.image_size // config.vision_config.patch_size + self.global_attn_index = config.high_res_vision_config.global_attn_indexes[0] + + self.high_res_vision_model = AutoModel.from_config(config.high_res_vision_config) + self.high_res_vision_neck = DeepseekVLSamVisionNeck(config.high_res_vision_config) + self.high_res_vision_proj = DeepseekVLSamVisionProj( + config.high_res_vision_config, output_size=self.output_size + ) + self.high_res_vision_alpha = nn.Parameter(torch.zeros(1)) + + super().__init__(config) + + def get_low_res_image_features(self, pixel_values): + output = self.vision_model(pixel_values) + output = output[0] + return output + + def get_high_res_image_features(self, 
pixel_values): + output = self.high_res_vision_model( + pixel_values=pixel_values, + output_hidden_states=True, + return_dict=True, + ) + last_hidden_state = output.last_hidden_state + last_hidden_state = self.high_res_vision_proj(last_hidden_state) + + hidden_states = output.hidden_states + global_hidden_state = hidden_states[self.global_attn_index + 1] # +1 for embedding layer + global_hidden_state = self.high_res_vision_neck(global_hidden_state) + global_hidden_state = self.high_res_vision_proj(global_hidden_state) + + output = last_hidden_state + global_hidden_state * self.high_res_vision_alpha + + # batch_size, hidden_size, height, width -> batch_size, seq_len, hidden_size + output = output.permute(0, 2, 3, 1) + output = output.reshape(output.shape[0], -1, output.shape[-1]) + + return output + + def get_image_features(self, pixel_values, high_res_pixel_values): + vision_encodings = self.get_low_res_image_features(pixel_values) + high_res_vision_encodings = self.get_high_res_image_features(high_res_pixel_values) + images_embeds = self.aligner(vision_encodings, high_res_vision_encodings) + return images_embeds + + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + high_res_pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and high_res_pixel_values is None: + raise ValueError("Both 
pixel_values and high_res_pixel_values should be specified at the same time") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + if input_ids is None: + image_attention_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + image_attention_mask = image_attention_mask.all(-1) + else: + image_attention_mask = input_ids == self.config.image_token_id + + image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + image_embeds = self.get_image_features(pixel_values, high_res_pixel_values) + image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features) + + lm_output = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + return DeepseekVLHybridBaseModelOutputWithPast( + last_hidden_state=lm_output.last_hidden_state, + past_key_values=lm_output.past_key_values, + hidden_states=lm_output.hidden_states, + attentions=lm_output.attentions, + image_hidden_states=image_embeds if pixel_values is not None else None, + ) + + +class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration): + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + high_res_pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: 
Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + high_res_pixel_values=high_res_pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return DeepseekVLHybridCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + high_res_pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + model_inputs = super().prepare_inputs_for_generation( 
+ input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["high_res_pixel_values"] = high_res_pixel_values + + return model_inputs + + +class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): + r""" + Constructs a DEEPSEEK_VL_HYBRID image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. 
Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. 
+ high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + high_res_size: Optional[dict[str, int]] = None, + min_size: int = 14, + resample: PILImageResampling = PILImageResampling.BICUBIC, + high_res_resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + **kwargs, + ) -> None: + high_res_size = high_res_size if high_res_size is not None else {"height": 1024, "width": 1024} + high_res_size = get_size_dict(high_res_size, default_to_square=True) + + self.high_res_size = high_res_size + self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else OPENAI_CLIP_MEAN + self.high_res_image_std = high_res_image_std if high_res_image_std is not None else OPENAI_CLIP_STD + + self.resample = resample + self.high_res_resample = high_res_resample + + super().__init__( + do_resize=do_resize, + size=size, + min_size=min_size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_convert_rgb=do_convert_rgb, + **kwargs, + ) + + if high_res_image_mean is None: + self.background_color = (127, 127, 
127) + else: + self.background_color = tuple([int(x * 255) for x in high_res_image_mean]) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + high_res_size: Optional[dict[str, int]] = None, + resample: PILImageResampling = None, + high_res_resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: Optional[bool] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + high_res_size (`Dict[str, int]`, *optional*, defaults to `self.high_res_size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the high resolution output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. 
+ high_res_resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + high_res_image_mean (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_mean`): + Image mean to use if `do_normalize` is set to `True`. + high_res_image_std (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + high_res_resample = high_res_resample if high_res_resample is not None else self.high_res_resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else self.high_res_image_mean + high_res_image_std = high_res_image_std if high_res_image_std is not None else self.high_res_image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + high_res_size = high_res_size if high_res_size is not None else self.high_res_size + high_res_size_dict = get_size_dict(high_res_size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + 
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + all_high_res_images = [] + for image in images: + # high_res_image: resize (high) -> rescale -> normalize (high) + # low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low) + high_res_image = image + + if do_resize: + high_res_image = self.resize( + image=high_res_image, + size=high_res_size_dict, + resample=high_res_resample, + input_data_format=input_data_format, + ) + image = self.resize( + image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + high_res_image = self.rescale( + image=high_res_image, scale=rescale_factor, input_data_format=input_data_format + ) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + high_res_image = self.normalize( + image=high_res_image, + mean=high_res_image_mean, + std=high_res_image_std, + 
input_data_format=input_data_format, + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + high_res_image = to_channel_dimension_format( + high_res_image, data_format, input_channel_dim=input_data_format + ) + + all_images.append(image) + all_high_res_images.append(high_res_image) + + data = {"pixel_values": all_images, "high_res_pixel_values": all_high_res_images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs): + pass + + +class DeepseekVLHybridProcessor(DeepseekVLProcessor): + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: ImageInput = None, + **kwargs: Unpack[DeepseekVLHybridProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. 
+ return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLHybridProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + images = make_flat_list_of_images(images) + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = inputs["pixel_values"] + data["high_res_pixel_values"] = inputs["high_res_pixel_values"] + + return BatchFeature(data=data) + + +__all__ = [ + "DeepseekVLHybridConfig", + "DeepseekVLHybridPreTrainedModel", + "DeepseekVLHybridModel", + "DeepseekVLHybridForConditionalGeneration", + "DeepseekVLHybridImageProcessor", + "DeepseekVLHybridProcessor", +] diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py new file mode 100644 index 0000000000..4fb765c797 --- /dev/null +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -0,0 +1,159 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +from ...image_processing_utils_fast import BatchFeature +from ...image_utils import ImageInput, make_flat_list_of_images +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLHybridProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVLHybrid processor which wraps a DeepseekVLHybrid Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLHybridProcessor`] offers all the functionalities of [`DeepseekVLHybridImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLHybridProcessor.__call__`] and [`~DeepseekVLHybridProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLHybridImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. 
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: ImageInput = None, + **kwargs: Unpack[DeepseekVLHybridProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. 
Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLHybridProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + images = make_flat_list_of_images(images) + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = inputs["pixel_values"] + data["high_res_pixel_values"] = inputs["high_res_pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. 
Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["DeepseekVLHybridProcessor"] diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index b93e1a8b67..ebdc2f23ea 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -1147,7 +1147,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ): r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1173,7 +1173,9 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): loss = None if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) return JanusCausalLMOutputWithPast( loss=loss, diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 29accd88e5..11b0848620 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1007,7 +1007,7 @@ class 
JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ): r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1033,7 +1033,9 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): loss = None if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) return JanusCausalLMOutputWithPast( loss=loss, diff --git a/tests/models/deepseek_vl/__init__.py b/tests/models/deepseek_vl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py b/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py new file mode 100644 index 0000000000..c1092f05d3 --- /dev/null +++ b/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py @@ -0,0 +1,119 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DeepseekVLImageProcessor + + +# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL +class DeepseekVLImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + # Ignore copy + def expected_output_image_shape(self, images): + max_size = max(self.size["height"], self.size["width"]) + return self.num_channels, max_size, max_size + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest with ViT->DeepseekVL +class 
DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + # Ignore copy + image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DeepseekVLImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + # Ignore copy + @unittest.skip(reason="Not supported") + def test_call_numpy_4_channels(self): + pass diff --git a/tests/models/deepseek_vl/test_modeling_deepseek_vl.py b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py new file mode 100644 index 0000000000..bff23e9dd5 --- /dev/null +++ b/tests/models/deepseek_vl/test_modeling_deepseek_vl.py @@ -0,0 +1,359 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DeepseekVL model.""" + +import re +import tempfile +import unittest + +from transformers import ( + AutoProcessor, + DeepseekVLConfig, + DeepseekVLForConditionalGeneration, + DeepseekVLModel, + is_torch_available, +) +from transformers.testing_utils import ( + require_torch, + require_torch_accelerator, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + +class DeepseekVLModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=25, + num_channels=3, + initializer_range=0.02, + is_training=True, + use_cache=False, + text_config={ + "num_hidden_layers": 2, + "vocab_size": 99, + "hidden_size": 16, + "intermediate_size": 37, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "pad_token_id": 1, + }, + vision_config={ + "num_hidden_layers": 1, + "hidden_size": 16, + "intermediate_size": 37, + "image_size": 32, + "patch_size": 8, + "hidden_act": "gelu", + "vision_use_head": False, + "num_attention_heads": 4, + }, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.num_channels = num_channels + self.initializer_range = initializer_range + self.is_training = is_training + self.use_cache = use_cache + + self.text_config = text_config + self.vision_config = vision_config + 
self.vision_config["num_channels"] = self.num_channels + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.image_size = vision_config["image_size"] + self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"] + self.pad_token_id = text_config["pad_token_id"] + self.image_token_id = self.vocab_size - 1 + + def get_config(self): + return DeepseekVLConfig( + text_config=self.text_config, + vision_config=self.vision_config, + image_token_id=self.image_token_id, + ) + + def prepare_config_and_inputs(self): + config = self.get_config() + + # create text and vision inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1 + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + pixel_values = floats_tensor( + [ + self.batch_size, + self.num_channels, + self.image_size, + self.image_size, + ] + ) + # fill image_tokens + input_ids[:, : self.num_image_tokens] = self.image_token_id + + return config, input_ids, attention_mask, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DeepseekVLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (DeepseekVLModel, DeepseekVLForConditionalGeneration) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": DeepseekVLModel, + "image-text-to-text": DeepseekVLForConditionalGeneration, + } + if is_torch_available() + else {} + ) + _is_composite = True + test_pruning = False + test_head_masking = False + + def 
setUp(self): + self.model_tester = DeepseekVLModelTester(self) + self.config_tester = ConfigTester(self, config_class=DeepseekVLConfig, has_text_modality=False) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs. + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation") + # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization + def test_initialization(self): + pass + + @require_torch_sdpa + # Copied from tests.models.janus.test_modeling_janus.JanusVisionText2TextModelTest.test_sdpa_can_dispatch_composite_models + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + # SigLip has one shared cls attr for all models, so we assign both submodels heer + vision_attn = language_attn = "sdpa" if model._supports_sdpa else "eager" + + if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "language_model"): + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model_sdpa.language_model.config._attn_implementation == language_attn) + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if any(re.finditer(r"Attention(?!Pool)", class_name)): + self.assertTrue(submodule.config._attn_implementation == "eager") + + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if any(re.finditer(r"Attention(?!Pool)", class_name)): + self.assertTrue(submodule.config._attn_implementation == "sdpa") + + +@require_torch +@require_torch_accelerator +@slow +class DeepseekVLIntegrationTest(unittest.TestCase): + def setUp(self): + self.model_id = "deepseek-community/deepseek-vl-1.3b-chat" + + def test_model_text_generation(self): + model = DeepseekVLForConditionalGeneration.from_pretrained( + self.model_id, 
torch_dtype="auto", device_map="auto" + ) + model.to(torch_device) + model.eval() + processor = AutoProcessor.from_pretrained(self.model_id) + + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", + }, + {"type": "text", "text": "Describe this image."}, + ], + } + ] + EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard' # fmt: skip + + inputs = processor.apply_chat_template( + messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt" + ) + inputs = inputs.to(model.device, dtype=model.dtype) + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + text = processor.decode(output[0], skip_special_tokens=True) + + self.assertEqual( + text, + EXPECTED_TEXT, + ) + + def test_model_text_generation_batched(self): + model = DeepseekVLForConditionalGeneration.from_pretrained( + self.model_id, torch_dtype="auto", device_map="auto" + ) + model.to(torch_device) + model.eval() + processor = AutoProcessor.from_pretrained(self.model_id) + + messages = [ + [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", + }, + {"type": "text", "text": "Describe this image."}, + ], + } + ], + [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", + }, + {"type": "text", "text": "What animal do you see in the image?"}, + ], + } + ], + ] + EXPECTED_TEXT = [ + "You are a helpful language and 
vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard", # fmt: skip + "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a bear in the image.What is the significance of the color red in the", # fmt: skip + ] + + inputs = processor.apply_chat_template( + messages, add_generation_prompt=True, tokenize=True, padding=True, return_dict=True, return_tensors="pt" + ) + inputs = inputs.to(model.device, dtype=model.dtype) + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + text = processor.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(EXPECTED_TEXT, text) + + def test_model_text_generation_with_multi_image(self): + model = DeepseekVLForConditionalGeneration.from_pretrained( + self.model_id, torch_dtype="auto", device_map="auto" + ) + model.to(torch_device) + model.eval() + processor = AutoProcessor.from_pretrained(self.model_id) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's the difference between"}, + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": " and "}, + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + ], + } + ] + EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image is a photograph featuring two cats lying on a pink blanket. 
The cat on the left is" # fmt: skip + + inputs = processor.apply_chat_template( + messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt" + ) + inputs = inputs.to(model.device, dtype=model.dtype) + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + text = processor.decode(output[0], skip_special_tokens=True) + + self.assertEqual( + text, + EXPECTED_TEXT, + ) diff --git a/tests/models/deepseek_vl/test_processor_deepseek_vl.py b/tests/models/deepseek_vl/test_processor_deepseek_vl.py new file mode 100644 index 0000000000..3c61f377e2 --- /dev/null +++ b/tests/models/deepseek_vl/test_processor_deepseek_vl.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import tempfile +import unittest + +from transformers import DeepseekVLProcessor, LlamaTokenizer +from transformers.models.deepseek_vl.convert_deepseek_vl_weights_to_hf import CHAT_TEMPLATE +from transformers.testing_utils import get_tests_dir +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import DeepseekVLImageProcessor + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +class DeepseekVLProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = DeepseekVLProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + image_processor = DeepseekVLImageProcessor() + tokenizer = LlamaTokenizer( + vocab_file=SAMPLE_VOCAB, + extra_special_tokens={ + "pad_token": "<|end▁of▁sentence|>", + "image_token": "", + }, + ) + processor = self.processor_class( + image_processor=image_processor, + tokenizer=tokenizer, + chat_template=CHAT_TEMPLATE, + ) + processor.save_pretrained(self.tmpdirname) + + def prepare_processor_dict(self): + return {"chat_template": CHAT_TEMPLATE, "num_image_tokens": 576} diff --git a/tests/models/deepseek_vl_hybrid/__init__.py b/tests/models/deepseek_vl_hybrid/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py new file mode 100644 index 0000000000..b7eaefd71a --- /dev/null +++ b/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py @@ -0,0 +1,218 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DeepseekVLHybridImageProcessor + + +class DeepseekVLHybridImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + high_res_size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + high_res_image_mean=[0.5, 0.5, 0.5], + high_res_image_std=[0.5, 0.5, 0.5], + ): + size = size if size is not None else {"height": 18, "width": 18} + high_res_size = high_res_size if high_res_size is not None else {"height": 36, "width": 36} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.high_res_size = high_res_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.high_res_image_mean = high_res_image_mean + self.high_res_image_std = high_res_image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": 
self.image_std, + "high_res_image_mean": self.high_res_image_mean, + "high_res_image_std": self.high_res_image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + "high_res_size": self.high_res_size, + } + + def expected_output_image_shape(self, images): + max_size = max(self.size["height"], self.size["width"]) + return self.num_channels, max_size, max_size + + def expected_output_high_res_image_shape(self, images): + max_size = max(self.high_res_size["height"], self.high_res_size["width"]) + return self.num_channels, max_size, max_size + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None + + # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.setUp with ViT->DeepseekVLHybrid + def setUp(self): + super().setUp() + self.image_processor_tester = DeepseekVLHybridImageProcessingTester(self) + + @property + # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.image_processor_dict with ViT->DeepseekVLHybrid + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.test_image_processor_from_dict_with_kwargs + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, 
{"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "high_res_image_mean")) + self.assertTrue(hasattr(image_processing, "high_res_image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "high_res_size")) + + def test_call_pil_high_res(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape( + [image_inputs[0]] + ) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape( + image_inputs + ) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_numpy_high_res(self): + for 
image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape( + [image_inputs[0]] + ) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape( + image_inputs + ) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pytorch_high_res(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape( + [image_inputs[0]] + ) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + expected_output_image_shape = self.image_processor_tester.expected_output_high_res_image_shape( + image_inputs + ) + encoded_images = image_processing(image_inputs, 
return_tensors="pt").high_res_pixel_values + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) + + @unittest.skip(reason="Not supported") + def test_call_numpy_4_channels(self): + pass diff --git a/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py new file mode 100644 index 0000000000..8e68ee19a1 --- /dev/null +++ b/tests/models/deepseek_vl_hybrid/test_modeling_deepseek_vl_hybrid.py @@ -0,0 +1,403 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DeepseekVLHybrid model.""" + +import re +import tempfile +import unittest + +from transformers import ( + AutoProcessor, + DeepseekVLHybridConfig, + DeepseekVLHybridForConditionalGeneration, + DeepseekVLHybridModel, + is_torch_available, +) +from transformers.testing_utils import ( + require_torch, + require_torch_accelerator, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + +class DeepseekVLHybridModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=25, + num_channels=3, + initializer_range=0.02, + is_training=True, + use_cache=False, + text_config={ + "num_hidden_layers": 2, + "vocab_size": 99, + "hidden_size": 16, + "intermediate_size": 37, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "pad_token_id": 1, + }, + vision_config={ + "num_hidden_layers": 1, + "hidden_size": 16, + "intermediate_size": 37, + "image_size": 32, + "patch_size": 8, + "hidden_act": "gelu", + "vision_use_head": False, + "num_attention_heads": 4, + }, + high_res_vision_config={ + "num_hidden_layers": 2, + "global_attn_indexes": [0], + "hidden_size": 16, + "intermediate_size": 37, + "mlp_dim": 24, + "output_channels": 4, + "image_size": 128, + "patch_size": 32, + "num_attention_heads": 4, + }, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.num_channels = num_channels + self.initializer_range = initializer_range + self.is_training = is_training + self.use_cache = use_cache + + self.text_config = text_config + self.vision_config = vision_config + self.high_res_vision_config = high_res_vision_config + self.vision_config["num_channels"] = self.num_channels + self.high_res_vision_config["num_channels"] = 
self.num_channels + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.high_res_image_size = high_res_vision_config["image_size"] + self.image_size = vision_config["image_size"] + self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"] + self.pad_token_id = text_config["pad_token_id"] + self.image_token_id = self.vocab_size - 1 + + def get_config(self): + return DeepseekVLHybridConfig( + text_config=self.text_config, + vision_config=self.vision_config, + high_res_vision_config=self.high_res_vision_config, + image_token_id=self.image_token_id, + ) + + def prepare_config_and_inputs(self): + config = self.get_config() + + # create text and vision inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1 + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + pixel_values = floats_tensor( + [ + self.batch_size, + self.num_channels, + self.image_size, + self.image_size, + ] + ) + high_res_pixel_values = floats_tensor( + [ + self.batch_size, + self.num_channels, + self.high_res_image_size, + self.high_res_image_size, + ] + ) + # fill image_tokens + input_ids[:, : self.num_image_tokens] = self.image_token_id + + return config, input_ids, attention_mask, pixel_values, high_res_pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values, high_res_pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "high_res_pixel_values": high_res_pixel_values, + } + return config, inputs_dict + + +@require_torch +class DeepseekVLHybridModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + 
all_model_classes = ( + (DeepseekVLHybridModel, DeepseekVLHybridForConditionalGeneration) if is_torch_available() else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": DeepseekVLHybridModel, + "image-text-to-text": DeepseekVLHybridForConditionalGeneration, + } + if is_torch_available() + else {} + ) + _is_composite = True + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = DeepseekVLHybridModelTester(self) + self.config_tester = ConfigTester(self, config_class=DeepseekVLHybridConfig, has_text_modality=False) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + del inputs["high_res_pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs. 
+    def test_inputs_embeds_matches_input_ids(self):
+        """Check that `inputs_embeds` and `input_ids` give the same first model output.
+
+        Both image tensors are removed from the inputs, so the two forward passes being compared are text-only.
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            input_ids = inputs["input_ids"]
+            del inputs["input_ids"]
+            del inputs["pixel_values"]
+            del inputs["high_res_pixel_values"]
+
+            # Embed the ids with the model's own embedding layer so both passes start from the same values.
+            inputs_embeds = model.get_input_embeddings()(input_ids)
+
+            with torch.no_grad():
+                out_ids = model(input_ids=input_ids, **inputs)[0]
+                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
+            torch.testing.assert_close(out_embeds, out_ids)
+
+    @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation")
+    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
+    def test_initialization(self):
+        pass
+
+    @require_torch_sdpa
+    def test_sdpa_can_dispatch_composite_models(self):
+        """Check that the requested `attn_implementation` reaches every part of the composite model.
+
+        Each model class is saved once and reloaded twice, with `"sdpa"` and with `"eager"`. The top-level
+        config, the config of each sub-model (when the model exposes all three) and the config of every
+        attention module must agree with the implementation requested at load time.
+        """
+        # One shared list for both models, so the SDPA and eager checks cannot drift apart.
+        sub_model_names = ("language_model", "vision_model", "high_res_vision_model")
+
+        for model_class in self.all_model_classes:
+            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                # Load the model with SDPA
+                model_sdpa = model_class.from_pretrained(
+                    tmpdirname,
+                    attn_implementation="sdpa",
+                )
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                # Load model with eager attention
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    attn_implementation="eager",
+                )
+                model_eager = model_eager.eval().to(torch_device)
+
+                self.assertEqual(model_sdpa.config._attn_implementation, "sdpa")
+                self.assertEqual(model_eager.config._attn_implementation, "eager")
+
+                if all(hasattr(model_sdpa, sub_model_name) for sub_model_name in sub_model_names):
+                    for sub_model_name in sub_model_names:
+                        sdpa_config = getattr(model_sdpa, sub_model_name).config
+                        eager_config = getattr(model_eager, sub_model_name).config
+                        self.assertEqual(sdpa_config._attn_implementation, "sdpa", sub_model_name)
+                        self.assertEqual(eager_config._attn_implementation, "eager", sub_model_name)
+
+                # No attention module may report the implementation that was *not* requested. The pattern
+                # matches class names containing "Attention" that is not immediately followed by "Pool".
+                for loaded_model, unexpected in ((model_eager, "sdpa"), (model_sdpa, "eager")):
+                    for name, submodule in loaded_model.named_modules():
+                        is_attention = re.search(r"Attention(?!Pool)", submodule.__class__.__name__) is not None
+                        if is_attention and getattr(submodule, "config", None):
+                            self.assertNotEqual(
+                                submodule.config._attn_implementation,
+                                unexpected,
+                                f"`{name}` reports the `{unexpected}` attention implementation",
+                            )
+
+
+@require_torch
+@require_torch_accelerator
+@slow
+class DeepseekVLHybridIntegrationTest(unittest.TestCase):
+    """Slow generation tests that compare greedy-decoded text with recorded outputs."""
+
+    def setUp(self):
+        """Select the checkpoint shared by every test in this class."""
+        self.model_id = "deepseek-community/deepseek-vl-7b-chat"
+
+    def test_model_text_generation(self):
+        """Greedy-decode 20 new tokens for a single image prompt and compare with the recorded text."""
+        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+            self.model_id, torch_dtype="auto", device_map="auto"
+        )
+        model.to(torch_device)
+        model.eval()
+        processor = AutoProcessor.from_pretrained(self.model_id)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The'  # fmt: skip
+
+        inputs = processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+        )
+        inputs = inputs.to(model.device, dtype=model.dtype)
+        # `do_sample=False` keeps decoding greedy so the output can be compared verbatim.
+        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        text = processor.decode(output[0], skip_special_tokens=True)
+
+        self.assertEqual(
+            text,
+            EXPECTED_TEXT,
+        )
+
+    def test_model_text_generation_batched(self):
+        """Greedy-decode a padded batch of two image prompts and compare both decoded texts."""
+        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+            self.model_id, torch_dtype="auto", device_map="auto"
+        )
+        model.to(torch_device)
+        model.eval()
+        processor = AutoProcessor.from_pretrained(self.model_id)
+
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                        },
+                        {"type": "text", "text": "Describe this image."},
+                    ],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                        },
+                        {"type": "text", "text": "What animal do you see in the image?"},
+                    ],
+                }
+            ],
+        ]
+        EXPECTED_TEXT = [
+            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The",  # fmt: skip
+            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a large, furry animal that appears to be a type of bear.The ",  # fmt: skip
+        ]
+
+        # `padding=True` is what distinguishes this call from the single-prompt tests.
+        inputs = processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, padding=True, return_dict=True, return_tensors="pt"
+        )
+        inputs = inputs.to(model.device, dtype=model.dtype)
+        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        text = processor.batch_decode(output, skip_special_tokens=True)
+
+        self.assertEqual(EXPECTED_TEXT, text)
+
+    def test_model_text_generation_with_multi_image(self):
+        """Greedy-decode 20 new tokens for one prompt that interleaves two images with text."""
+        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
+            self.model_id, torch_dtype="auto", device_map="auto"
+        )
+        model.to(torch_device)
+        model.eval()
+        processor = AutoProcessor.from_pretrained(self.model_id)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's the difference between"},
+                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+                    {"type": "text", "text": " and "},
+                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+                ],
+            }
+        ]
+        EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image shows a street scene with a prominent red stop sign in the foreground. The sign has the"  # fmt: skip
+
+        inputs = processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+        )
+        inputs = inputs.to(model.device, dtype=model.dtype)
+        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        text = processor.decode(output[0], skip_special_tokens=True)
+
+        self.assertEqual(
+            text,
+            EXPECTED_TEXT,
+        )
diff --git a/tests/models/deepseek_vl_hybrid/test_processor_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_processor_deepseek_vl_hybrid.py
new file mode 100644
index 0000000000..10608d8bdb
--- /dev/null
+++ b/tests/models/deepseek_vl_hybrid/test_processor_deepseek_vl_hybrid.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import shutil
+import tempfile
+import unittest
+
+from transformers import DeepseekVLHybridProcessor, LlamaTokenizer
+from transformers.models.deepseek_vl.convert_deepseek_vl_weights_to_hf import CHAT_TEMPLATE
+from transformers.testing_utils import get_tests_dir
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import DeepseekVLHybridImageProcessor
+
+
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
+
+
+class DeepseekVLHybridProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    """Run the shared `ProcessorTesterMixin` checks against `DeepseekVLHybridProcessor`."""
+
+    processor_class = DeepseekVLHybridProcessor
+
+    def setUp(self):
+        """Build a processor from the sample SentencePiece vocab and save it to a fresh temporary directory."""
+        self.tmpdirname = tempfile.mkdtemp()
+        # Registered immediately so the directory is removed even if the rest of `setUp` raises;
+        # `ignore_errors=True` keeps this harmless if something else already deleted it.
+        self.addCleanup(shutil.rmtree, self.tmpdirname, ignore_errors=True)
+
+        image_processor = DeepseekVLHybridImageProcessor()
+        tokenizer = LlamaTokenizer(
+            vocab_file=SAMPLE_VOCAB,
+            extra_special_tokens={
+                "pad_token": "<|end▁of▁sentence|>",
+                # NOTE(review): this value was an empty string, which cannot work as a special token.
+                # Restored to the DeepSeek-VL image placeholder; confirm against the converted checkpoint.
+                "image_token": "<image_placeholder>",
+            },
+        )
+        processor = self.processor_class(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            chat_template=CHAT_TEMPLATE,
+        )
+        processor.save_pretrained(self.tmpdirname)
+
+    def prepare_processor_dict(self):
+        """Return the chat template and image-token count; presumably read by the mixin (confirm there)."""
+        return {"chat_template": CHAT_TEMPLATE, "num_image_tokens": 576}