Add support for DeepseekAI's DeepseekVL (#36248)

* upload initial code

* update deepseek-vl adaptor

* update hierarchy of vision model classes

* update aligner model

* add text model

* Added Image Processor

* Added Image Processor

* Added Image Processor

* apply masks

* remove projection; add aligner

* remove interpolate_pos_encoding

* remove unused params in config

* cleaning

* Add the __init__ file

* added processing deepseek_vl class

* modified the deepseek-vl processor

* modified the deepseek-vl processor

* update __init__

* Update the image processor class name

* Added Deepseek to src/transformers/__init__.py file

* Added Deepseek to image_processing_auto.py

* update the __init__ file

* update deepseek_vl image processor

* Update Deepseek Processor

* upload fast image processor

* Revert "upload fast image processor"

This reverts commit 68c8fd50bafbb9770ac70c9de02448e2519219b4.

* update image processor

* flatten hierarchy

* remove DeepseekVLModel

* major update (complete modeling)

* auto modeling and other files

* formatting

* fix quality

* replace torchvision in modeling

* set default do_normalize to False

* add fast image processor template using tool

* update image processors

* add fast image processor to other files

* update license

* Added deepseek image testcases

* update image test

* update processor

* write CHAT_TEMPLATE

* update model for processor

* fix processor

* minor fixes and formatting

* fix image processing and tests

* fix interpolation in sam

* fix output_attentions in DeepseekVLModel

* upload test_modeling

* fix tests because of vocab size

* set use_high_res_vision=False in tests

* fix all modeling tests

* fix styling

* remove explicit background_color from image processors

* added test_processor

* added test_processor

* fix processor tests

* update docs

* update docs

* update docs

* update conversion script

* Fixed typos

* minor fixes from review

- remove model_id comments in examples
- remove from pre-trained auto mapping
- move to image-text-to-text from vision-to-seq in auto mapping
- add image_token_index to __init__ for config
- remove outdated temporary config in conversion script
- update example to use chat_template in docstring example
- update license 2021->2025

* fix type in config docstring

Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>

* update get_image_features

* fix config

* improve DeepseekVLImageProcessor.preprocess

* return image_hidden_states

* use AutoTokenizer and AutoImageProcessor in Processor

* fix model outputs

* make num_image_tokens configurable

* fix docstring of processor

* move system prompt to chat template

* fix repo consistency

* fix return_dict

* replace SamVisionEncoder with SamVisionModel

* update to remove deepcopy

* 🛠️  Major Architectural Changes (Adds DeepseekVLHybrid)

* fix quality checks

* add missing hybrid in auto modeling

* run make style

* update sam_hq

* update high_res_size in test

* update docs following #36979

* update code with auto_docstring

* update conversion scripts

* fix style

* fix failing test because of tuple

* set weights_only=True in conversion script

* use safetensors.torch.load_file instead of torch.load in conversion script

* make output_dir optional in conversion script

* fix code snippets in docs (now the examples work fine)

* integration tests for DeepseekVL

* update expected texts

* make style

* integration tests for DeepseekVLHybrid

* fix class name

* update expected texts for hybrid

* run "make style"

* update since changes in main

* run make-style

* nits since changes in main

* undo changes in sam

* fix tests

* fix tests; update with main

* update with main: output_attention/output_hidden_states

* fix copied part in deepseek_vl

* run fix-copies

* fix output_hidden_states

* sam: fix _init_weigths

* use modular for DeepseekVL

* make image processor more modular

* modular: use JanusPreTrainedModel

* janus: provide kwargs in loss

* update processors in conversion script

* Revert "sam: fix _init_weigths"

This reverts commit db625d0c68956c0dad45edd7a469b6a074905c27.

* run fix-copies

---------

Co-authored-by: Shakib-IO <shakib.khan17@northsouth.edu>
Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
This commit is contained in:
Armaghan Shakir
2025-07-25 22:18:50 +05:00
committed by GitHub
parent a98bbc294c
commit 69cff312f5
33 changed files with 5856 additions and 4 deletions

View File

View File

@@ -0,0 +1,119 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_vision_available():
from transformers import DeepseekVLImageProcessor
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL
class DeepseekVLImageProcessingTester:
    """Bundle the image-processor settings and the input generator used by the DeepseekVL image-processing tests.

    The class is declared as a copy of the ViT tester (see the `# Copied from` marker right above it); only
    `expected_output_image_shape` is exempt from that copy through `# Ignore copy`.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
    ):
        # NOTE(review): `image_mean` / `image_std` are list defaults shared between calls; this class only
        # stores them and never mutates them.
        # Fall back to an 18x18 target size when the caller does not provide one.
        size = size if size is not None else {"height": 18, "width": 18}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std

    def prepare_image_processor_dict(self):
        """Return the normalization and resize settings of this tester as a keyword-argument dict."""
        return {
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_normalize": self.do_normalize,
            "do_resize": self.do_resize,
            "size": self.size,
        }

    # Ignore copy
    def expected_output_image_shape(self, images):
        """Return `(num_channels, side, side)` where `side` is the larger of the configured height and width.

        `images` is accepted but not read by this implementation.
        """
        max_size = max(self.size["height"], self.size["width"])
        return self.num_channels, max_size, max_size

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Forward to the module-level `prepare_image_inputs` helper using this tester's batch, channel and
        resolution settings; the three flags are passed through unchanged."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest with ViT->DeepseekVL
class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Image-processing tests for `DeepseekVLImageProcessor`, built on `ImageProcessingTestMixin`."""

    # Ignore copy
    image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None

    def setUp(self):
        """Create the tester object that supplies processor settings and image inputs."""
        super().setUp()
        self.image_processor_tester = DeepseekVLImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        """Keyword arguments for the image processor, as produced by the tester."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        """Every processor built from the tester settings exposes the configured attributes."""
        # `image_processor_list` is not defined in this class; presumably provided by the mixin's setUp.
        for image_processing_class in self.image_processor_list:
            image_processing = image_processing_class(**self.image_processor_dict)
            self.assertTrue(hasattr(image_processing, "image_mean"))
            self.assertTrue(hasattr(image_processing, "image_std"))
            self.assertTrue(hasattr(image_processing, "do_normalize"))
            self.assertTrue(hasattr(image_processing, "do_resize"))
            self.assertTrue(hasattr(image_processing, "size"))

    def test_image_processor_from_dict_with_kwargs(self):
        """`from_dict` keeps the dict's size by default and lets a `size=42` keyword override it to 42x42."""
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.size, {"height": 18, "width": 18})

            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})

    # Ignore copy
    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
        pass

View File

@@ -0,0 +1,359 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch DeepseekVL model."""
import re
import tempfile
import unittest
from transformers import (
AutoProcessor,
DeepseekVLConfig,
DeepseekVLForConditionalGeneration,
DeepseekVLModel,
is_torch_available,
)
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_sdpa,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
if is_torch_available():
import torch
class DeepseekVLModelTester:
    """Create a tiny `DeepseekVLConfig` and matching random inputs for the DeepseekVL model tests.

    Args:
        parent: The test case that owns this tester; stored as `self.parent`.
        batch_size: Leading dimension of every generated input tensor.
        seq_length: Second dimension of the generated `input_ids` and `attention_mask`.
        num_channels: Channel dimension of the generated images; also written into the vision config.
        initializer_range: Stored on the tester; not read by the methods of this class.
        is_training: Stored on the tester; not read by the methods of this class.
        use_cache: Stored on the tester; not read by the methods of this class.
        text_config: Keyword arguments for the text sub-config. `None` selects the small default below.
        vision_config: Keyword arguments for the vision sub-config. `None` selects the small default below.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=25,
        num_channels=3,
        initializer_range=0.02,
        is_training=True,
        use_cache=False,
        text_config=None,
        vision_config=None,
    ):
        # The defaults are built per call instead of living in the signature: a dict default is a single
        # object shared by every instance, and `num_channels` is written into the vision config below.
        if text_config is None:
            text_config = {
                "num_hidden_layers": 2,
                "vocab_size": 99,
                "hidden_size": 16,
                "intermediate_size": 37,
                "max_position_embeddings": 512,
                "num_attention_heads": 4,
                "pad_token_id": 1,
            }
        if vision_config is None:
            vision_config = {
                "num_hidden_layers": 1,
                "hidden_size": 16,
                "intermediate_size": 37,
                "image_size": 32,
                "patch_size": 8,
                "hidden_act": "gelu",
                "vision_use_head": False,
                "num_attention_heads": 4,
            }

        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.num_channels = num_channels
        self.initializer_range = initializer_range
        self.is_training = is_training
        self.use_cache = use_cache

        # Keep private copies so a dict supplied by the caller is never modified.
        self.text_config = dict(text_config)
        self.vision_config = dict(vision_config)
        self.vision_config["num_channels"] = self.num_channels

        self.num_hidden_layers = self.text_config["num_hidden_layers"]
        self.vocab_size = self.text_config["vocab_size"]
        self.hidden_size = self.text_config["hidden_size"]
        self.num_attention_heads = self.text_config["num_attention_heads"]
        self.image_size = self.vision_config["image_size"]
        # NOTE(review): this is the patch count along one side (image_size // patch_size), not its square;
        # confirm it matches the number of image features the model produces per image.
        self.num_image_tokens = self.vision_config["image_size"] // self.vision_config["patch_size"]
        self.pad_token_id = self.text_config["pad_token_id"]
        # The last vocabulary id is used as the image placeholder token.
        self.image_token_id = self.vocab_size - 1

    def get_config(self):
        """Return a `DeepseekVLConfig` built from the stored sub-configs and image token id."""
        return DeepseekVLConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            image_token_id=self.image_token_id,
        )

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, pixel_values)` filled with random data.

        The first `num_image_tokens` positions of every `input_ids` row are overwritten with the image
        token id.
        """
        config = self.get_config()

        # Text and vision inputs. The `vocab_size - 2` bound together with `+ 1` presumably keeps the
        # random ids away from 0 and from the image token id (vocab_size - 1) - confirm against `ids_tensor`.
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_channels,
                self.image_size,
                self.image_size,
            ]
        )

        # Mark the leading positions as image placeholders.
        input_ids[:, : self.num_image_tokens] = self.image_token_id

        return config, input_ids, attention_mask, pixel_values

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the inputs keyed by their model argument names."""
        config, input_ids, attention_mask, pixel_values = self.prepare_config_and_inputs()
        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
        return config, inputs_dict
@require_torch
class DeepseekVLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """Run the shared model and generation test suites against `DeepseekVLModel` and
    `DeepseekVLForConditionalGeneration`, overriding the tests that need vision-specific handling."""

    all_model_classes = (DeepseekVLModel, DeepseekVLForConditionalGeneration) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
            "feature-extraction": DeepseekVLModel,
            "image-text-to-text": DeepseekVLForConditionalGeneration,
        }
        if is_torch_available()
        else {}
    )
    # These flags are not read in this class; presumably they are consumed by the test mixins.
    _is_composite = True
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        """Create the model tester and the config tester used by the tests of this class."""
        self.model_tester = DeepseekVLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DeepseekVLConfig, has_text_modality=False)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
    def test_inputs_embeds(self):
        """Each model class runs a forward pass from `inputs_embeds` with `input_ids` and `pixel_values` removed."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)

            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            # Replace the token ids by their embeddings taken from the model's input embedding layer.
            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)

            with torch.no_grad():
                model(**inputs)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs.
    def test_inputs_embeds_matches_input_ids(self):
        """The first model output is the same whether the text is passed as `input_ids` or as `inputs_embeds`."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)

    # The initialization test is disabled for this model; see the skip reason.
    @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation")
    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
    def test_initialization(self):
        pass

    # Saves each model, reloads it once with the default attention implementation and once with
    # `attn_implementation="eager"`, then checks the `_attn_implementation` recorded on the top-level
    # config, on the vision/language sub-model configs (when present) and on every attention submodule.
    @require_torch_sdpa
    # Copied from tests.models.janus.test_modeling_janus.JanusVisionText2TextModelTest.test_sdpa_can_dispatch_composite_models
    def test_sdpa_can_dispatch_composite_models(self):
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                # Load the model with SDPA
                model_sdpa = model_class.from_pretrained(tmpdirname)
                model_sdpa = model_sdpa.eval().to(torch_device)

                # Load model with eager attention
                model_eager = model_class.from_pretrained(
                    tmpdirname,
                    attn_implementation="eager",
                )
                model_eager = model_eager.eval().to(torch_device)

            # SigLip has one shared cls attr for all models, so we assign both submodels heer
            vision_attn = language_attn = "sdpa" if model._supports_sdpa else "eager"

            if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "language_model"):
                self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
                self.assertTrue(model_sdpa.language_model.config._attn_implementation == language_attn)
                self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
                self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")

            self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
            self.assertTrue(model_eager.config._attn_implementation == "eager")

            for name, submodule in model_eager.named_modules():
                class_name = submodule.__class__.__name__
                if any(re.finditer(r"Attention(?!Pool)", class_name)):
                    self.assertTrue(submodule.config._attn_implementation == "eager")

            for name, submodule in model_sdpa.named_modules():
                class_name = submodule.__class__.__name__
                if any(re.finditer(r"Attention(?!Pool)", class_name)):
                    self.assertTrue(submodule.config._attn_implementation == "sdpa")
@require_torch
@require_torch_accelerator
@slow
class DeepseekVLIntegrationTest(unittest.TestCase):
    """Slow generation checks that compare greedy decoding of a released checkpoint with recorded text."""

    def setUp(self):
        self.model_id = "deepseek-community/deepseek-vl-1.3b-chat"

    def _load_model_and_processor(self):
        """Load the checkpoint in eval mode on `torch_device` and return it with its processor."""
        vlm = DeepseekVLForConditionalGeneration.from_pretrained(
            self.model_id, torch_dtype="auto", device_map="auto"
        )
        vlm.to(torch_device)
        vlm.eval()
        chat_processor = AutoProcessor.from_pretrained(self.model_id)
        return vlm, chat_processor

    def test_model_text_generation(self):
        """One conversation with a single image and a text prompt."""
        vlm, chat_processor = self._load_model_and_processor()

        image_part = {
            "type": "image",
            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
        }
        prompt_part = {"type": "text", "text": "Describe this image."}
        conversation = [{"role": "user", "content": [image_part, prompt_part]}]

        EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard'  # fmt: skip

        batch = chat_processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
        )
        batch = batch.to(vlm.device, dtype=vlm.dtype)
        generated = vlm.generate(**batch, max_new_tokens=20, do_sample=False)
        decoded = chat_processor.decode(generated[0], skip_special_tokens=True)

        self.assertEqual(decoded, EXPECTED_TEXT)

    def test_model_text_generation_batched(self):
        """Two padded conversations about the same image, decoded as a batch."""
        vlm, chat_processor = self._load_model_and_processor()

        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        questions = ("Describe this image.", "What animal do you see in the image?")
        # One single-turn conversation per question; each one gets its own message dicts.
        conversations = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": image_url},
                        {"type": "text", "text": question},
                    ],
                }
            ]
            for question in questions
        ]

        EXPECTED_TEXT = [
            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:In the image, a majestic snow leopard is captured in a moment of tranquility. The snow leopard",  # fmt: skip
            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a bear in the image.What is the significance of the color red in the",  # fmt: skip
        ]

        batch = chat_processor.apply_chat_template(
            conversations,
            add_generation_prompt=True,
            tokenize=True,
            padding=True,
            return_dict=True,
            return_tensors="pt",
        )
        batch = batch.to(vlm.device, dtype=vlm.dtype)
        generated = vlm.generate(**batch, max_new_tokens=20, do_sample=False)
        decoded = chat_processor.batch_decode(generated, skip_special_tokens=True)

        self.assertEqual(EXPECTED_TEXT, decoded)

    def test_model_text_generation_with_multi_image(self):
        """One conversation that interleaves two images with text."""
        vlm, chat_processor = self._load_model_and_processor()

        interleaved_content = [
            {"type": "text", "text": "What's the difference between"},
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": " and "},
            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
        ]
        conversation = [{"role": "user", "content": interleaved_content}]

        EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image is a photograph featuring two cats lying on a pink blanket. The cat on the left is"  # fmt: skip

        batch = chat_processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
        )
        batch = batch.to(vlm.device, dtype=vlm.dtype)
        generated = vlm.generate(**batch, max_new_tokens=20, do_sample=False)
        decoded = chat_processor.decode(generated[0], skip_special_tokens=True)

        self.assertEqual(decoded, EXPECTED_TEXT)

View File

@@ -0,0 +1,54 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from transformers import DeepseekVLProcessor, LlamaTokenizer
from transformers.models.deepseek_vl.convert_deepseek_vl_weights_to_hf import CHAT_TEMPLATE
from transformers.testing_utils import get_tests_dir
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import DeepseekVLImageProcessor
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
class DeepseekVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Run the shared processor tests against a `DeepseekVLProcessor` saved to a temporary directory."""

    processor_class = DeepseekVLProcessor

    def setUp(self):
        """Build a processor from a default image processor and a sample-vocab tokenizer and save it.

        The directory is exposed as `self.tmpdirname` and is removed again after each test.
        """
        # `tempfile.mkdtemp()` never deletes what it creates, so every test used to leave a directory
        # behind. The cleanup is registered before any other work so it also runs if the steps below raise;
        # `TemporaryDirectory.cleanup()` tolerates a directory that has already been removed.
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        self.tmpdirname = tmp_dir.name

        image_processor = DeepseekVLImageProcessor()
        tokenizer = LlamaTokenizer(
            vocab_file=SAMPLE_VOCAB,
            extra_special_tokens={
                "pad_token": "<end▁of▁sentence>",
                "image_token": "<image_placeholder>",
            },
        )
        processor = self.processor_class(
            image_processor=image_processor,
            tokenizer=tokenizer,
            chat_template=CHAT_TEMPLATE,
        )
        processor.save_pretrained(self.tmpdirname)

    def prepare_processor_dict(self):
        """Return the processor keyword arguments used by the tests: the chat template and 576 image tokens."""
        return {"chat_template": CHAT_TEMPLATE, "num_image_tokens": 576}

View File

@@ -0,0 +1,218 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import DeepseekVLHybridImageProcessor
class DeepseekVLHybridImageProcessingTester:
    """Bundle the settings and the input generator used by the DeepseekVLHybrid image-processing tests.

    Args:
        parent: The test case that owns this tester; stored as `self.parent`.
        batch_size: Number of images requested from `prepare_image_inputs`.
        num_channels: Channel count of the generated images and of the expected output shapes.
        image_size: Stored on the tester; not read by the methods of this class.
        min_resolution: Lower resolution bound passed to `prepare_image_inputs`.
        max_resolution: Upper resolution bound passed to `prepare_image_inputs`.
        do_resize: Value reported under `"do_resize"` in the processor dict.
        size: Target size dict; `None` selects `{"height": 18, "width": 18}`.
        high_res_size: High-resolution target size dict; `None` selects `{"height": 36, "width": 36}`.
        do_normalize: Value reported under `"do_normalize"` in the processor dict.
        image_mean: Per-channel mean; `None` selects `[0.5, 0.5, 0.5]`.
        image_std: Per-channel std; `None` selects `[0.5, 0.5, 0.5]`.
        high_res_image_mean: Per-channel mean of the high-resolution branch; `None` selects `[0.5, 0.5, 0.5]`.
        high_res_image_std: Per-channel std of the high-resolution branch; `None` selects `[0.5, 0.5, 0.5]`.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        high_res_size=None,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        high_res_image_mean=None,
        high_res_image_std=None,
    ):
        size = size if size is not None else {"height": 18, "width": 18}
        high_res_size = high_res_size if high_res_size is not None else {"height": 36, "width": 36}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.high_res_size = high_res_size
        self.do_normalize = do_normalize
        # The statistics are stored by reference and handed out through the processor dict, so every
        # instance gets its own default list instead of sharing one created in the signature.
        self.image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
        self.image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]
        self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else [0.5, 0.5, 0.5]
        self.high_res_image_std = high_res_image_std if high_res_image_std is not None else [0.5, 0.5, 0.5]

    def prepare_image_processor_dict(self):
        """Return the normalization and resize settings of this tester as a keyword-argument dict."""
        return {
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "high_res_image_mean": self.high_res_image_mean,
            "high_res_image_std": self.high_res_image_std,
            "do_normalize": self.do_normalize,
            "do_resize": self.do_resize,
            "size": self.size,
            "high_res_size": self.high_res_size,
        }

    def expected_output_image_shape(self, images):
        """Return `(num_channels, side, side)` with `side` the larger edge of `size`; `images` is not read."""
        max_size = max(self.size["height"], self.size["width"])
        return self.num_channels, max_size, max_size

    def expected_output_high_res_image_shape(self, images):
        """Return `(num_channels, side, side)` with `side` the larger edge of `high_res_size`; `images` is not read."""
        max_size = max(self.high_res_size["height"], self.high_res_size["width"])
        return self.num_channels, max_size, max_size

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Forward to the module-level `prepare_image_inputs` helper using this tester's batch, channel and
        resolution settings; the three flags are passed through unchanged."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Image-processing tests for `DeepseekVLHybridImageProcessor`, including its high-resolution output."""

    image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None

    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.setUp with ViT->DeepseekVLHybrid
    def setUp(self):
        super().setUp()
        self.image_processor_tester = DeepseekVLHybridImageProcessingTester(self)

    @property
    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.image_processor_dict with ViT->DeepseekVLHybrid
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.test_image_processor_from_dict_with_kwargs
    def test_image_processor_from_dict_with_kwargs(self):
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.size, {"height": 18, "width": 18})

            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})

    def test_image_processor_properties(self):
        """Every processor built from the tester settings exposes the configured attributes."""
        for image_processing_class in self.image_processor_list:
            image_processing = image_processing_class(**self.image_processor_dict)
            self.assertTrue(hasattr(image_processing, "image_mean"))
            self.assertTrue(hasattr(image_processing, "image_std"))
            self.assertTrue(hasattr(image_processing, "high_res_image_mean"))
            self.assertTrue(hasattr(image_processing, "high_res_image_std"))
            self.assertTrue(hasattr(image_processing, "do_normalize"))
            self.assertTrue(hasattr(image_processing, "do_resize"))
            self.assertTrue(hasattr(image_processing, "size"))
            self.assertTrue(hasattr(image_processing, "high_res_size"))

    def _check_high_res_call(self, expected_input_type, **input_kwargs):
        """Check the shape of `high_res_pixel_values` for a single image and for a whole batch.

        Args:
            expected_input_type: Type every generated input image must be an instance of.
            **input_kwargs: Extra flags for the tester's `prepare_image_inputs` (`numpify` / `torchify`).
        """
        tester = self.image_processor_tester
        for image_processing_class in self.image_processor_list:
            image_processing = image_processing_class(**self.image_processor_dict)

            # Random images of differing resolutions, in the requested container type.
            image_inputs = tester.prepare_image_inputs(equal_resolution=False, **input_kwargs)
            for image in image_inputs:
                self.assertIsInstance(image, expected_input_type)

            # A single image is expected to come back with a batch dimension of 1.
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").high_res_pixel_values
            expected_shape = tester.expected_output_high_res_image_shape([image_inputs[0]])
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_shape))

            # A list of images is expected to come back with one entry per image.
            encoded_images = image_processing(image_inputs, return_tensors="pt").high_res_pixel_values
            expected_shape = tester.expected_output_high_res_image_shape(image_inputs)
            self.assertEqual(tuple(encoded_images.shape), (tester.batch_size, *expected_shape))

    def test_call_pil_high_res(self):
        """High-resolution output shapes for PIL image inputs."""
        self._check_high_res_call(Image.Image)

    def test_call_numpy_high_res(self):
        """High-resolution output shapes for numpy array inputs."""
        self._check_high_res_call(np.ndarray, numpify=True)

    def test_call_pytorch_high_res(self):
        """High-resolution output shapes for torch tensor inputs."""
        self._check_high_res_call(torch.Tensor, torchify=True)

    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
        pass

View File

@@ -0,0 +1,403 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch DeepseekVLHybrid model."""
import re
import tempfile
import unittest
from transformers import (
AutoProcessor,
DeepseekVLHybridConfig,
DeepseekVLHybridForConditionalGeneration,
DeepseekVLHybridModel,
is_torch_available,
)
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_sdpa,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
if is_torch_available():
import torch
class DeepseekVLHybridModelTester:
    """Create a tiny `DeepseekVLHybridConfig` and matching random inputs for the DeepseekVLHybrid model tests.

    Args:
        parent: The test case that owns this tester; stored as `self.parent`.
        batch_size: Leading dimension of every generated input tensor.
        seq_length: Second dimension of the generated `input_ids` and `attention_mask`.
        num_channels: Channel dimension of the generated images; also written into both vision configs.
        initializer_range: Stored on the tester; not read by the methods of this class.
        is_training: Stored on the tester; not read by the methods of this class.
        use_cache: Stored on the tester; not read by the methods of this class.
        text_config: Keyword arguments for the text sub-config. `None` selects the small default below.
        vision_config: Keyword arguments for the vision sub-config. `None` selects the small default below.
        high_res_vision_config: Keyword arguments for the high-resolution vision sub-config. `None` selects
            the small default below.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        seq_length=25,
        num_channels=3,
        initializer_range=0.02,
        is_training=True,
        use_cache=False,
        text_config=None,
        vision_config=None,
        high_res_vision_config=None,
    ):
        # The defaults are built per call instead of living in the signature: a dict default is a single
        # object shared by every instance, and `num_channels` is written into both vision configs below.
        if text_config is None:
            text_config = {
                "num_hidden_layers": 2,
                "vocab_size": 99,
                "hidden_size": 16,
                "intermediate_size": 37,
                "max_position_embeddings": 512,
                "num_attention_heads": 4,
                "pad_token_id": 1,
            }
        if vision_config is None:
            vision_config = {
                "num_hidden_layers": 1,
                "hidden_size": 16,
                "intermediate_size": 37,
                "image_size": 32,
                "patch_size": 8,
                "hidden_act": "gelu",
                "vision_use_head": False,
                "num_attention_heads": 4,
            }
        if high_res_vision_config is None:
            high_res_vision_config = {
                "num_hidden_layers": 2,
                "global_attn_indexes": [0],
                "hidden_size": 16,
                "intermediate_size": 37,
                "mlp_dim": 24,
                "output_channels": 4,
                "image_size": 128,
                "patch_size": 32,
                "num_attention_heads": 4,
            }

        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.num_channels = num_channels
        self.initializer_range = initializer_range
        self.is_training = is_training
        self.use_cache = use_cache

        # Keep private copies so a dict supplied by the caller is never modified.
        self.text_config = dict(text_config)
        self.vision_config = dict(vision_config)
        self.high_res_vision_config = dict(high_res_vision_config)
        self.vision_config["num_channels"] = self.num_channels
        self.high_res_vision_config["num_channels"] = self.num_channels

        self.num_hidden_layers = self.text_config["num_hidden_layers"]
        self.vocab_size = self.text_config["vocab_size"]
        self.hidden_size = self.text_config["hidden_size"]
        self.num_attention_heads = self.text_config["num_attention_heads"]
        self.high_res_image_size = self.high_res_vision_config["image_size"]
        self.image_size = self.vision_config["image_size"]
        # NOTE(review): this is the patch count along one side (image_size // patch_size), not its square;
        # confirm it matches the number of image features the model produces per image.
        self.num_image_tokens = self.vision_config["image_size"] // self.vision_config["patch_size"]
        self.pad_token_id = self.text_config["pad_token_id"]
        # The last vocabulary id is used as the image placeholder token.
        self.image_token_id = self.vocab_size - 1

    def get_config(self):
        """Return a `DeepseekVLHybridConfig` built from the stored sub-configs and image token id."""
        return DeepseekVLHybridConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            high_res_vision_config=self.high_res_vision_config,
            image_token_id=self.image_token_id,
        )

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, pixel_values, high_res_pixel_values)` with random data.

        The first `num_image_tokens` positions of every `input_ids` row are overwritten with the image
        token id.
        """
        config = self.get_config()

        # Text and vision inputs. The `vocab_size - 2` bound together with `+ 1` presumably keeps the
        # random ids away from 0 and from the image token id (vocab_size - 1) - confirm against `ids_tensor`.
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_channels,
                self.image_size,
                self.image_size,
            ]
        )
        high_res_pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_channels,
                self.high_res_image_size,
                self.high_res_image_size,
            ]
        )

        # Mark the leading positions as image placeholders.
        input_ids[:, : self.num_image_tokens] = self.image_token_id

        return config, input_ids, attention_mask, pixel_values, high_res_pixel_values

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the inputs keyed by their model argument names."""
        config, input_ids, attention_mask, pixel_values, high_res_pixel_values = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "high_res_pixel_values": high_res_pixel_values,
        }
        return config, inputs_dict
@require_torch
class DeepseekVLHybridModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """Common modeling and generation tests for the DeepseekVLHybrid model classes."""

    all_model_classes = (
        (DeepseekVLHybridModel, DeepseekVLHybridForConditionalGeneration) if is_torch_available() else ()
    )
    pipeline_model_mapping = (
        {
            "feature-extraction": DeepseekVLHybridModel,
            "image-text-to-text": DeepseekVLHybridForConditionalGeneration,
        }
        if is_torch_available()
        else {}
    )
    _is_composite = True
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = DeepseekVLHybridModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DeepseekVLHybridConfig, has_text_modality=False)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)

            # Swap the token ids for their embeddings and drop both image inputs.
            input_ids = inputs.pop("input_ids")
            del inputs["pixel_values"]
            del inputs["high_res_pixel_values"]
            inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)

            # Only checks that the forward pass runs with `inputs_embeds`.
            with torch.no_grad():
                model(**inputs)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for VLMs.
    def test_inputs_embeds_matches_input_ids(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs.pop("input_ids")
            del inputs["pixel_values"]
            del inputs["high_res_pixel_values"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            # The first output must be the same whether tokens are given as ids or as embeddings.
            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)

    @unittest.skip(reason="Siglip uses the same initialization scheme as the Flax original implementation")
    # Copied from tests.models.siglip.test_modeling_siglip.SiglipVisionModelTest.test_initialization
    def test_initialization(self):
        pass

    # Saves each model once, reloads it with "sdpa" and with "eager", and checks that the requested
    # attention implementation reaches the top-level config, every sub-model config and every
    # attention submodule that carries a config.
    @require_torch_sdpa
    def test_sdpa_can_dispatch_composite_models(self):
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                # Load the model with SDPA
                model_sdpa = model_class.from_pretrained(
                    tmpdirname,
                    attn_implementation="sdpa",
                )
                model_sdpa = model_sdpa.eval().to(torch_device)

                # Load model with eager attention
                model_eager = model_class.from_pretrained(
                    tmpdirname,
                    attn_implementation="eager",
                )
                model_eager = model_eager.eval().to(torch_device)

            self.assertEqual(model_sdpa.config._attn_implementation, "sdpa")
            self.assertEqual(model_eager.config._attn_implementation, "eager")

            if (
                hasattr(model_sdpa, "vision_model")
                and hasattr(model_sdpa, "high_res_vision_model")
                and hasattr(model_sdpa, "language_model")
            ):
                # All three sub-models are checked for both loaded variants.
                for submodel_name in ("language_model", "vision_model", "high_res_vision_model"):
                    self.assertEqual(getattr(model_sdpa, submodel_name).config._attn_implementation, "sdpa")
                    self.assertEqual(getattr(model_eager, submodel_name).config._attn_implementation, "eager")

            # No attention submodule may report the implementation that was *not* requested.
            for loaded_model, unexpected in ((model_eager, "sdpa"), (model_sdpa, "eager")):
                for _, submodule in loaded_model.named_modules():
                    # Class names containing "AttentionPool" are deliberately excluded by the lookahead.
                    if not re.search(r"Attention(?!Pool)", submodule.__class__.__name__):
                        continue
                    submodule_config = getattr(submodule, "config", None)
                    if submodule_config:
                        self.assertNotEqual(submodule_config._attn_implementation, unexpected)
@require_torch
@require_torch_accelerator
@slow
class DeepseekVLHybridIntegrationTest(unittest.TestCase):
    """Slow generation tests that compare greedy decoding output against recorded reference text."""

    def setUp(self):
        self.model_id = "deepseek-community/deepseek-vl-7b-chat"

    def _load_model_and_processor(self):
        # Loads the checkpoint named by `self.model_id` in eval mode together with its processor.
        model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
            self.model_id, torch_dtype="auto", device_map="auto"
        )
        model.to(torch_device)
        model.eval()
        processor = AutoProcessor.from_pretrained(self.model_id)
        return model, processor

    def _generate(self, model, processor, messages, **template_kwargs):
        # Applies the chat template to `messages`, moves the inputs to the model's device/dtype and
        # greedily generates 20 new tokens. Returns the generated token ids.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            **template_kwargs,
        )
        inputs = inputs.to(model.device, dtype=model.dtype)
        return model.generate(**inputs, max_new_tokens=20, do_sample=False)

    def test_model_text_generation(self):
        model, processor = self._load_model_and_processor()

        image_part = {
            "type": "image",
            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
        }
        messages = [{"role": "user", "content": [image_part, {"type": "text", "text": "Describe this image."}]}]

        EXPECTED_TEXT = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The'  # fmt: skip

        generated = self._generate(model, processor, messages)
        text = processor.decode(generated[0], skip_special_tokens=True)

        self.assertEqual(text, EXPECTED_TEXT)

    def test_model_text_generation_batched(self):
        model, processor = self._load_model_and_processor()

        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        prompts = ("Describe this image.", "What animal do you see in the image?")
        # One single-turn conversation per prompt, each paired with the same image.
        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": image_url},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            for prompt in prompts
        ]

        EXPECTED_TEXT = [
            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: Describe this image.\n\nAssistant:The image depicts a fluffy, beige-colored animal with a long tail, walking on snow. The",  # fmt: skip
            "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What animal do you see in the image?\n\nAssistant:I see a large, furry animal that appears to be a type of bear.The ",  # fmt: skip
        ]

        generated = self._generate(model, processor, messages, padding=True)
        text = processor.batch_decode(generated, skip_special_tokens=True)

        self.assertEqual(EXPECTED_TEXT, text)

    def test_model_text_generation_with_multi_image(self):
        model, processor = self._load_model_and_processor()

        content = [
            {"type": "text", "text": "What's the difference between"},
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": " and "},
            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
        ]
        messages = [{"role": "user", "content": content}]

        EXPECTED_TEXT = "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: What's the difference between and \n\nAssistant:The image shows a street scene with a prominent red stop sign in the foreground. The sign has the"  # fmt: skip

        generated = self._generate(model, processor, messages)
        text = processor.decode(generated[0], skip_special_tokens=True)

        self.assertEqual(text, EXPECTED_TEXT)

View File

@@ -0,0 +1,54 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from transformers import DeepseekVLHybridProcessor, LlamaTokenizer
from transformers.models.deepseek_vl.convert_deepseek_vl_weights_to_hf import CHAT_TEMPLATE
from transformers.testing_utils import get_tests_dir
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import DeepseekVLHybridImageProcessor
# SentencePiece model fixture from the tests directory; used as the tokenizer vocab file in setUp.
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
class DeepseekVLHybridProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Runs the shared processor test-suite against `DeepseekVLHybridProcessor`."""

    processor_class = DeepseekVLHybridProcessor

    def setUp(self):
        # Use a self-cleaning temporary directory: `mkdtemp()` alone would leave one directory behind
        # per test method. `addCleanup` keeps the directory alive until the test has finished.
        tmpdir = tempfile.TemporaryDirectory()
        self.addCleanup(tmpdir.cleanup)
        self.tmpdirname = tmpdir.name

        image_processor = DeepseekVLHybridImageProcessor()
        tokenizer = LlamaTokenizer(
            vocab_file=SAMPLE_VOCAB,
            extra_special_tokens={
                "pad_token": "<end▁of▁sentence>",
                "image_token": "<image_placeholder>",
            },
        )
        processor = self.processor_class(
            image_processor=image_processor,
            tokenizer=tokenizer,
            chat_template=CHAT_TEMPLATE,
        )
        # Save the processor to `self.tmpdirname` for the rest of the test.
        processor.save_pretrained(self.tmpdirname)

    def prepare_processor_dict(self):
        # Extra processor attributes for this test class: the chat template and the image-token count.
        return {"chat_template": CHAT_TEMPLATE, "num_image_tokens": 576}