diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 50f43f27ac..f20b9a2d80 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -411,6 +411,8 @@
title: Cohere
- local: model_doc/cohere2
title: Cohere2
+ - local: model_doc/cohere2_vision
+ title: Cohere2Vision
- local: model_doc/convbert
title: ConvBERT
- local: model_doc/cpm
diff --git a/docs/source/en/model_doc/cohere2_vision.md b/docs/source/en/model_doc/cohere2_vision.md
new file mode 100644
index 0000000000..123f9573b9
--- /dev/null
+++ b/docs/source/en/model_doc/cohere2_vision.md
@@ -0,0 +1,92 @@
+# Command A Vision
+
+
+
+## Overview
+
+Command A Vision is a state-of-the-art multimodal model designed to seamlessly integrate visual and textual information for a wide range of applications. By combining advanced computer vision techniques with natural language processing capabilities, Command A Vision enables users to analyze, understand, and generate insights from both visual and textual data.
+
+The model excels at tasks including image captioning, visual question answering, document understanding, and chart understanding. This makes it a versatile tool for AI practitioners. Its ability to process complex visual and textual inputs makes it useful in settings where text-only representations are imprecise or unavailable, like real-world image understanding and graphics-heavy document processing.
+
+Command A Vision is built upon a robust architecture that leverages the latest advancements in vision-language models (VLMs). It is highly performant and efficient, even when dealing with large-scale datasets. The model's flexibility makes it suitable for a wide range of use cases, from content moderation and image search to medical imaging analysis and robotics.
+
+## Usage tips
+
+The model and its processor can be loaded and used for generation as follows:
+
+```python
+
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+model_id = "CohereLabs/command-a-vision-07-2025"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(
+ model_id, device_map="auto", torch_dtype=torch.float16
+)
+
+# Format message with the Command-A-Vision chat template
+messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
+ },
+ {"type": "text", "text": "what is in this image?"},
+ ],
+ },
+]
+
+inputs = processor.apply_chat_template(
+ messages,
+ padding=True,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt",
+).to(model.device)
+
+gen_tokens = model.generate(
+ **inputs,
+ max_new_tokens=300,
+ do_sample=True,
+ temperature=0.3,
+)
+
+print(
+ processor.tokenizer.decode(
+ gen_tokens[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+ )
+)
+```
+
+## Cohere2VisionConfig
+
+[[autodoc]] Cohere2VisionConfig
+
+## Cohere2VisionForConditionalGeneration
+
+[[autodoc]] Cohere2VisionForConditionalGeneration
+ - forward
+
+## Cohere2VisionModel
+
+[[autodoc]] Cohere2VisionModel
+ - forward
+
+## Cohere2VisionImageProcessorFast
+
+[[autodoc]] Cohere2VisionImageProcessorFast
+ - preprocess
+
+## Cohere2VisionProcessor
+
+[[autodoc]] Cohere2VisionProcessor
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index 93c4af7cdc..b502400dbb 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -1164,7 +1164,7 @@ class Cache:
while len(self.layers) <= layer_idx:
kwargs = self.layer_init_kwargs.copy()
if self.layer_init_kwargs.get("layer_device_map", None) is not None:
- kwargs["device"] = kwargs.pop("layer_device_map")[layer_idx]
+ kwargs["device"] = kwargs.pop("layer_device_map")[len(self.layers)]
new_layer_class = (
self.layer_classes[len(self.layers)] if isinstance(self.layer_classes, list) else self.layer_classes
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index f9059e331b..56c2f3fcdc 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -63,6 +63,7 @@ if TYPE_CHECKING:
from .codegen import *
from .cohere import *
from .cohere2 import *
+ from .cohere2_vision import *
from .colpali import *
from .colqwen2 import *
from .conditional_detr import *
diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index 9e6125720d..c1278bee03 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -1106,7 +1106,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 99ba934986..6d78356c72 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -81,6 +81,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("codegen", "CodeGenConfig"),
("cohere", "CohereConfig"),
("cohere2", "Cohere2Config"),
+ ("cohere2_vision", "Cohere2VisionConfig"),
("colpali", "ColPaliConfig"),
("colqwen2", "ColQwen2Config"),
("conditional_detr", "ConditionalDetrConfig"),
@@ -476,6 +477,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("codegen", "CodeGen"),
("cohere", "Cohere"),
("cohere2", "Cohere2"),
+ ("cohere2_vision", "Cohere2Vision"),
("colpali", "ColPali"),
("colqwen2", "ColQwen2"),
("conditional_detr", "Conditional DETR"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index cefa1335eb..5e28f2ac2d 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -72,6 +72,7 @@ else:
("chinese_clip", ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast")),
("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
+ ("cohere2_vision", ("Cohere2VisionImageProcessorFast",)),
("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")),
("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 52eb254be1..ea69ed911d 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -77,6 +77,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("codegen", "CodeGenModel"),
("cohere", "CohereModel"),
("cohere2", "Cohere2Model"),
+ ("cohere2_vision", "Cohere2VisionModel"),
("conditional_detr", "ConditionalDetrModel"),
("convbert", "ConvBertModel"),
("convnext", "ConvNextModel"),
@@ -944,6 +945,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
+ ("cohere2_vision", "Cohere2VisionForConditionalGeneration"),
("deepseek_vl", "DeepseekVLForConditionalGeneration"),
("deepseek_vl_hybrid", "DeepseekVLHybridForConditionalGeneration"),
("emu3", "Emu3ForConditionalGeneration"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 0d711cee06..8da5286260 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -60,6 +60,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("clip", "CLIPProcessor"),
("clipseg", "CLIPSegProcessor"),
("clvp", "ClvpProcessor"),
+ ("cohere2_vision", "Cohere2VisionProcessor"),
("colpali", "ColPaliProcessor"),
("colqwen2", "ColQwen2Processor"),
("deepseek_vl", "DeepseekVLProcessor"),
diff --git a/src/transformers/models/aya_vision/configuration_aya_vision.py b/src/transformers/models/aya_vision/configuration_aya_vision.py
index 41c472b909..1b79c156a9 100644
--- a/src/transformers/models/aya_vision/configuration_aya_vision.py
+++ b/src/transformers/models/aya_vision/configuration_aya_vision.py
@@ -33,9 +33,9 @@ class AyaVisionConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.
Args:
- vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`):
+ vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
The config object or dictionary of the vision backbone.
- text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+ text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Cohere2Config`):
The config object or dictionary of the text backbone.
vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
The feature selection strategy used to select the vision feature from the vision backbone.
@@ -81,7 +81,9 @@ class AyaVisionConfig(PretrainedConfig):
self.vision_feature_layer = vision_feature_layer
if isinstance(vision_config, dict):
- vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model")
+ vision_config["model_type"] = (
+ vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
+ )
vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["siglip_vision_model"](
@@ -97,7 +99,7 @@ class AyaVisionConfig(PretrainedConfig):
self.vision_config = vision_config
if isinstance(text_config, dict):
- text_config["model_type"] = text_config.get("model_type", "llama")
+ text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "cohere2"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["cohere2"]()
diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py
index df45633cc7..dd922c63ce 100644
--- a/src/transformers/models/aya_vision/modeling_aya_vision.py
+++ b/src/transformers/models/aya_vision/modeling_aya_vision.py
@@ -33,6 +33,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
+from ...utils.generic import check_model_inputs
from ..auto import AutoModel
from .configuration_aya_vision import AyaVisionConfig
@@ -99,6 +100,10 @@ class AyaVisionPreTrainedModel(PreTrainedModel):
_can_compile_fullgraph = False
_supports_flex_attn = True
_supports_attention_backend = True
+ _can_record_outputs = {
+ "hidden_states": "DecoderLayer",
+ "attentions": "Attention",
+ }
@dataclass
@@ -237,7 +242,7 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
- @can_return_tuple
+ @check_model_inputs
@auto_docstring
def forward(
self,
@@ -250,17 +255,9 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
vision_feature_layer: Optional[Union[int, list[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,
use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[tuple, AyaVisionModelOutputWithPast]:
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_feature_layer = (
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
)
@@ -308,9 +305,6 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=True,
cache_position=cache_position,
**kwargs,
)
@@ -357,7 +351,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self,
diff --git a/src/transformers/models/aya_vision/modular_aya_vision.py b/src/transformers/models/aya_vision/modular_aya_vision.py
index 8e77762917..5a7b0950cd 100644
--- a/src/transformers/models/aya_vision/modular_aya_vision.py
+++ b/src/transformers/models/aya_vision/modular_aya_vision.py
@@ -32,7 +32,8 @@ from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils import auto_docstring, is_torchdynamo_compiling, logging
+from ...utils.generic import check_model_inputs
from .configuration_aya_vision import AyaVisionConfig
@@ -91,6 +92,10 @@ class AyaVisionMultiModalProjector(nn.Module):
class AyaVisionPreTrainedModel(LlavaPreTrainedModel):
_can_compile_fullgraph = False
+ _can_record_outputs = {
+ "hidden_states": "DecoderLayer",
+ "attentions": "Attention",
+ }
class AyaVisionCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
@@ -158,7 +163,7 @@ class AyaVisionModel(LlavaModel):
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
- @can_return_tuple
+ @check_model_inputs
@auto_docstring
def forward(
self,
@@ -171,17 +176,9 @@ class AyaVisionModel(LlavaModel):
vision_feature_layer: Optional[Union[int, list[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,
use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[tuple, AyaVisionModelOutputWithPast]:
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
- )
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_feature_layer = (
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
)
@@ -229,9 +226,6 @@ class AyaVisionModel(LlavaModel):
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=True,
cache_position=cache_position,
**kwargs,
)
diff --git a/src/transformers/models/cohere2_vision/__init__.py b/src/transformers/models/cohere2_vision/__init__.py
new file mode 100644
index 0000000000..9b20eb3c1e
--- /dev/null
+++ b/src/transformers/models/cohere2_vision/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # Only evaluated when `TYPE_CHECKING` is true; otherwise the lazy-module branch below runs.
+    from .configuration_cohere2_vision import *
+    from .image_processing_cohere2_vision_fast import *
+    from .modeling_cohere2_vision import *
+    from .processing_cohere2_vision import *
+else:
+    import sys
+
+    # Replace this package's entry in `sys.modules` with a `_LazyModule` built from the
+    # structure that `define_import_structure` returns for this file.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py
new file mode 100644
index 0000000000..e4e670e4a6
--- /dev/null
+++ b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py
@@ -0,0 +1,84 @@
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class Cohere2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Cohere2VisionForConditionalGeneration`]. It is used to instantiate a
+    Cohere2 Vision model according to the specified arguments, defining the model architecture.
+
+    [CohereLabs/command-a-vision-07-2025](https://huggingface.co/CohereLabs/command-a-vision-07-2025)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
+            The config object or dictionary of the vision backbone. A dictionary without a `"model_type"` key is
+            treated as a `"siglip_vision_model"` config.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Cohere2Config`):
+            The config object or dictionary of the text backbone. A dictionary without a `"model_type"` key is
+            treated as a `"cohere2"` config.
+        downsample_factor (`int`, *optional*, defaults to 2):
+            The factor by which to downsample the input image.
+        image_token_id (`int`, *optional*, defaults to 255036):
+            The token ID to use as placeholder for the image input.
+        alignment_intermediate_size (`int`, *optional*, defaults to 36864):
+            The size of the intermediate layer for alignment.
+    """
+
+    model_type = "cohere2_vision"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        downsample_factor=2,
+        image_token_id=255036,
+        alignment_intermediate_size=36864,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.downsample_factor = downsample_factor
+        self.image_token_id = image_token_id
+        self.alignment_intermediate_size = alignment_intermediate_size
+
+        if isinstance(vision_config, dict):
+            # Fill in the default backbone type in place (the caller's dict is mutated), then build the
+            # matching config class from the dictionary.
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
+            )
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            # No vision config given: fall back to a SigLIP vision config with these explicit sizes.
+            vision_config = CONFIG_MAPPING["siglip_vision_model"](
+                hidden_size=1152,
+                intermediate_size=3072,
+                image_size=512,
+                num_hidden_layers=27,
+                num_attention_heads=12,
+            )
+
+        # Any other value (e.g. an already-built config object) is stored unchanged.
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            # Same treatment as the vision dictionary, with "cohere2" as the default type.
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "cohere2"
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            # The default text backbone is created with tied word embeddings.
+            text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True)
+
+        self.text_config = text_config
+
+
+__all__ = ["Cohere2VisionConfig"]
diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
new file mode 100644
index 0000000000..6c1aaa48a3
--- /dev/null
+++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
@@ -0,0 +1,309 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_cohere2_vision.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import lru_cache
+from typing import Optional, Union
+
+import numpy as np
+import torch
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+ BaseImageProcessorFast,
+ DefaultFastImageProcessorKwargs,
+ group_images_by_shape,
+ reorder_images,
+)
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring, is_torchvision_v2_available
+
+
+if is_torchvision_v2_available():
+ from torchvision.transforms.v2 import functional as F
+else:
+ from torchvision.transforms import functional as F
+
+
+class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    crop_to_patches (`bool`, *optional*, defaults to `True`):
+        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
+        `preprocess` method.
+    min_patches (`int`, *optional*, defaults to 1):
+        The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
+    max_patches (`int`, *optional*, defaults to 12):
+        The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
+    """
+
+    crop_to_patches: Optional[bool]
+    min_patches: Optional[int]
+    max_patches: Optional[int]
+
+
+@lru_cache(maxsize=10)
+def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]:
+    """
+    Enumerate every tile grid whose total tile count fits within a budget.
+
+    A grid is a `(width, height)` pair counted in tiles; it is included when `width * height` does not
+    exceed `max_image_tiles`. Grids are listed by increasing width, then by increasing height.
+
+    Results are memoized with `lru_cache`, so repeated calls with the same argument return the same list
+    object; callers should treat it as read-only.
+
+    Args:
+        max_image_tiles (`int`):
+            The maximum number of tiles allowed.
+
+    Returns:
+        `list[tuple[int, int]]`: All valid `(width, height)` tile configurations.
+
+    Example:
+        >>> get_all_supported_aspect_ratios(4)
+        [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)]
+
+    """
+    tile_counts = range(1, max_image_tiles + 1)
+    return [
+        (tiles_wide, tiles_high)
+        for tiles_wide in tile_counts
+        for tiles_high in tile_counts
+        if tiles_wide * tiles_high <= max_image_tiles
+    ]
+
+
+def get_optimal_tiled_canvas(
+    original_image_size: tuple[int, int],
+    target_tile_size: tuple[int, int],
+    min_image_tiles: int,
+    max_image_tiles: int,
+) -> tuple[int, int]:
+    """
+    Choose the tile grid that best fits an image of the given size.
+
+    Every grid returned by `get_all_supported_aspect_ratios(max_image_tiles)` is considered. For each grid,
+    the scale needed to fit the image inside the grid's pixel canvas is the smaller of the per-axis
+    canvas/image ratios. If every grid requires downscaling, the grid with the least downscaling is chosen;
+    otherwise the grid needing the least upscaling (scale >= 1) is chosen. Ties go to the grid with the
+    fewest tiles because candidates are sorted by tile count.
+
+    Args:
+        original_image_size (`tuple[int, int]`):
+            `(height, width)` of the image in pixels.
+        target_tile_size (`tuple[int, int]`):
+            `(height, width)` of a single tile in pixels.
+        min_image_tiles (`int`):
+            Accepted for interface compatibility; it does not influence the selection.
+        max_image_tiles (`int`):
+            The maximum number of tiles the grid may contain.
+
+    Returns:
+        `tuple[int, int]`: The chosen grid as `(num_columns, num_rows)`, i.e. tiles along the width, then
+        tiles along the height.
+    """
+    possible_resolutions = get_all_supported_aspect_ratios(max_image_tiles)
+    possible_resolutions = sorted(possible_resolutions, key=lambda x: x[0] * x[1])
+    image_height, image_width = original_image_size
+    patch_size_height, patch_size_width = target_tile_size
+
+    # Grids are (tiles_wide, tiles_high). Both the candidate canvases and the image size must therefore be
+    # expressed in (width, height) order; comparing against (height, width) would select the transposed grid
+    # for any non-square image.
+    candidate_resolutions = np.array(possible_resolutions) * np.array([patch_size_width, patch_size_height])
+    original_size = np.array([image_width, image_height])
+    required_scales = candidate_resolutions / original_size
+    required_scale = np.min(required_scales, axis=-1, keepdims=True)  # [n_resolutions, 1]
+    if np.all(required_scale < 1):
+        # We are forced to downscale, so try to minimize the amount of downscaling
+        best_grid = possible_resolutions[np.argmax(required_scale)]
+    else:
+        # Pick the resolution that required the least upscaling so that it most closely fits the image.
+        # Grids that would need downscaling are masked out with a large sentinel value.
+        required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
+        best_grid = possible_resolutions[np.argmin(required_scale)]
+    return best_grid
+
+
+@auto_docstring
+class Cohere2VisionImageProcessorFast(BaseImageProcessorFast):
+    # Class-level defaults. `size` is also the size of each tile when `crop_to_patches` is enabled.
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 512, "width": 512}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    crop_to_patches = True
+    min_patches = 1
+    max_patches = 12
+    valid_kwargs = Cohere2VisionFastImageProcessorKwargs
+    patch_size = 16
+
+    def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature:
+        # Thin override that delegates to the base class; it exists to expose the model-specific kwargs type.
+        return super().preprocess(images, **kwargs)
+
+    def crop_image_to_patches(
+        self,
+        images: "torch.Tensor",
+        min_patches: int,
+        max_patches: int,
+        use_thumbnail: bool = True,
+        patch_size: Optional[Union[tuple, int, dict]] = None,
+        interpolation: Optional["F.InterpolationMode"] = None,
+    ):
+        """
+        Resize the images onto a tile grid and split them into equally sized patches.
+
+        The grid is chosen by `get_optimal_tiled_canvas` from the image size (the last two dimensions of
+        `images`), the patch size and the maximum number of patches. The images are resized to exactly cover
+        the grid and then sliced row by row into patches.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to be cropped. The last two dimensions are read as `(height, width)`.
+            min_patches (`int`):
+                The minimum number of patches to be extracted from the image.
+            max_patches (`int`):
+                The maximum number of patches to be extracted from the image.
+            use_thumbnail (`bool`, *optional*, defaults to `True`):
+                Whether to append a thumbnail (the whole image resized to `patch_size`) after the patches. The
+                thumbnail is only added when the grid contains more than one patch.
+            patch_size (`SizeDict`):
+                The size of the output patches. Although the annotation is broader and the default is `None`,
+                this method reads `patch_size.height` and `patch_size.width`, so an object exposing those
+                attributes must be passed.
+            interpolation (`F.InterpolationMode`, *optional*):
+                Interpolation mode forwarded to `self.resize`.
+
+        Returns:
+            `torch.Tensor`: The patches (and optional thumbnail) stacked along a new dimension that is then
+            moved to position 1.
+        """
+        patch_size_height, patch_size_width = patch_size.height, patch_size.width
+        original_height, original_width = images.shape[-2:]
+        # find the closest aspect ratio to the target
+        num_columns, num_rows = get_optimal_tiled_canvas(
+            (original_height, original_width), (patch_size_height, patch_size_width), min_patches, max_patches
+        )
+
+        # calculate the target width and height
+        target_width = patch_size_width * num_columns
+        target_height = patch_size_height * num_rows
+        num_blocks = num_columns * num_rows
+
+        # resize the image so that each patch is of patch_size
+        resized_image = self.resize(
+            images, SizeDict(height=target_height, width=target_width), interpolation=interpolation
+        )
+        # split the image into patches, walking the grid row by row
+        processed_images = []
+        for i in range(num_blocks):
+            column = i % num_columns
+            row = i // num_columns
+            # (left, top, right, bottom) of the current patch in pixels
+            box = (
+                column * patch_size_width,
+                row * patch_size_height,
+                (column + 1) * patch_size_width,
+                (row + 1) * patch_size_height,
+            )
+            # split the image
+            patch_image = resized_image[..., box[1] : box[3], box[0] : box[2]]
+            processed_images.append(patch_image)
+
+        # A single-patch grid already shows the whole image, so no thumbnail is added in that case
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = self.resize(images, patch_size, interpolation=interpolation)
+            processed_images.append(thumbnail_img)
+
+        # Stack on a new leading axis, then swap it with the next one so the patch axis ends up at dim 1
+        processed_images = torch.stack(processed_images, dim=0).transpose(0, 1).contiguous()
+
+        return processed_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        crop_to_patches: bool,
+        min_patches: int,
+        max_patches: int,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        # Pipeline: optional tiling into patches -> optional resize -> optional center crop ->
+        # fused rescale/normalize. Returns a `BatchFeature` with `pixel_values` and `num_patches`.
+        if crop_to_patches:
+            grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+            processed_images_grouped = {}
+            num_patches = {}
+            for shape, stacked_images in grouped_images.items():
+                stacked_images = self.crop_image_to_patches(
+                    stacked_images,
+                    min_patches,
+                    max_patches,
+                    patch_size=size,
+                    interpolation=interpolation,
+                )
+                processed_images_grouped[shape] = stacked_images
+                # One patch count per image in the group (dim 1 is the patch axis, dim 0 the images)
+                num_patches[shape] = [stacked_images.shape[1]] * stacked_images.shape[0]
+            images = reorder_images(processed_images_grouped, grouped_images_index)
+            # Flatten so that every patch is handled as an independent image from here on
+            images = [image for images_list in images for image in images_list]
+            num_patches = reorder_images(num_patches, grouped_images_index)
+        else:
+            num_patches = [1] * len(images)
+
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(
+            data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
+        )
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns the number of patches produced for a given image size.
+
+        The count matches `crop_image_to_patches`: a grid with more than one tile contributes one patch per
+        tile plus one thumbnail, while a single-tile grid (or disabled cropping) yields exactly one patch.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*):
+                Any kwargs to override defaults of the image processor. Recognized keys are `min_patches`,
+                `max_patches`, `patch_size` and `crop_to_patches`. `None` means no overrides.
+        Returns:
+            `int`: Number of patches per image.
+        """
+        # The parameter defaults to `None`, so normalize it before reading overrides from it
+        images_kwargs = images_kwargs if images_kwargs is not None else {}
+        min_patches = images_kwargs.get("min_patches", self.min_patches)
+        max_patches = images_kwargs.get("max_patches", self.max_patches)
+        patch_size = images_kwargs.get("patch_size", self.size)
+        crop_to_patches = images_kwargs.get("crop_to_patches", self.crop_to_patches)
+
+        num_patches = 1
+        if crop_to_patches and max_patches > 1:
+            num_columns, num_rows = get_optimal_tiled_canvas(
+                (height, width), (patch_size["height"], patch_size["width"]), min_patches, max_patches
+            )
+            # Only multi-tile grids get the extra thumbnail; a 1x1 grid stays at a single patch
+            if num_columns * num_rows > 1:
+                num_patches += num_columns * num_rows
+
+        return num_patches
+
+
+__all__ = ["Cohere2VisionImageProcessorFast"]
diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
new file mode 100644
index 0000000000..91f84cdd6b
--- /dev/null
+++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
@@ -0,0 +1,432 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_cohere2_vision.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel
+from .configuration_cohere2_vision import Cohere2VisionConfig
+
+
class Cohere2VisionMultiModalProjector(nn.Module):
    """
    Projects vision-tower features into the text model's hidden size.

    The features are first regrouped by `pixel_shuffle`, then passed through a linear layer, a SiLU-gated
    product of the two halves of that layer's output, and a final linear layer.
    """

    def __init__(self, config: Cohere2VisionConfig):
        super().__init__()
        self.config = config
        self.downsample_factor = config.downsample_factor
        self.intermediate_size = config.alignment_intermediate_size

        # `pixel_shuffle` multiplies the channel count by `downsample_factor**2`.
        shuffled_dim = config.vision_config.hidden_size * (config.downsample_factor**2)
        # The gating in `forward` halves the width of `linear_1`'s output.
        gated_dim = self.intermediate_size // 2

        self.linear_1 = nn.Linear(shuffled_dim, self.intermediate_size, bias=True)
        self.act = nn.SiLU()
        self.linear_2 = nn.Linear(gated_dim, config.text_config.hidden_size, bias=True)

    def pixel_shuffle(self, image_features):  # B, S, D
        """Reshape `(B, S, D)` features into a 4-D grid whose two middle axes are `sqrt(S) / downsample_factor`."""
        batch_size, seq_length, _ = image_features.shape
        # The sequence is treated as a square grid.
        side = int(seq_length**0.5)
        factor = self.downsample_factor
        reduced_side = int(side / factor)

        grid = image_features.reshape(batch_size, side, side, -1)
        channels = grid.shape[-1]
        # Fold `factor` neighbours along the last grid axis into the channel axis ...
        grid = grid.reshape(batch_size, side, reduced_side, int(channels * factor))
        grid = grid.permute(0, 2, 1, 3)
        # ... then do the same along the other grid axis.
        grid = grid.reshape(batch_size, reduced_side, reduced_side, -1)
        return grid.permute(0, 2, 1, 3)

    def forward(self, image_features):
        """Apply pixel shuffle, the first linear layer, the SiLU gate and the second linear layer."""
        shuffled = self.pixel_shuffle(image_features)

        # Split along last dimension and apply SwiGLU
        value, gate = self.linear_1(shuffled).chunk(2, dim=-1)
        return self.linear_2(self.act(gate) * value)
+
+
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Cohere2Vision outputs, with hidden states and attentions.
    """
)
class Cohere2VisionModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
        `None` when the model was called without `pixel_values`.
    """

    # The only field added on top of `BaseModelOutputWithPast`. `Cohere2VisionModel.forward` fills it with the
    # projected image features.
    # NOTE(review): the size stated in the docstring is not established by this file -- confirm it against the
    # projector output.
    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Cohere2Vision causal language model (or autoregressive) outputs.
    """
)
class Cohere2VisionCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    # Every field defaults to `None`; `Cohere2VisionForConditionalGeneration.forward` sets `loss` only when
    # `labels` are passed and copies the remaining entries from the base model's output.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    # NOTE(review): annotated as a list of tensors while the docstring documents a `Cache` -- confirm which
    # one callers actually receive.
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
@auto_docstring
class Cohere2VisionPreTrainedModel(PreTrainedModel):
    # Common base class of the Cohere2Vision models in this file. It defines no methods of its own, only
    # class-level settings that are read by `PreTrainedModel` and related utilities defined outside this file.
    config: Cohere2VisionConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    # Attention-backend / compilation capability flags.
    # NOTE(review): their exact effect is decided by the base class -- not visible here.
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = False
    _supports_flex_attn = True
    _supports_attention_backend = True
    # Maps an output name to the name of a module class; presumably used by `check_model_inputs` to record
    # hidden states and attentions -- TODO confirm.
    _can_record_outputs = {
        "hidden_states": "DecoderLayer",
        "attentions": "Attention",
    }
+
+
@auto_docstring(
    custom_intro="""
    The Cohere2Vision model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class Cohere2VisionModel(Cohere2VisionPreTrainedModel):
    _checkpoint_conversion_mapping = {}

    def __init__(self, config: Cohere2VisionConfig):
        super().__init__(config)
        # The vision encoder and the text decoder are both built from their sub-configs through `AutoModel`.
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = Cohere2VisionMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        # The token embeddings are owned by the language model.
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        image_num_patches: torch.Tensor,
    ):
        """
        Obtains the image last hidden state from the vision tower and applies the multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor`):
                The tensors corresponding to the input images; passed unchanged to the vision tower.
            image_num_patches (`torch.Tensor` of shape `(num_images,)`):
                Number of patches for each image. Not used by this method.
        Returns:
            `torch.Tensor`: The output of `multi_modal_projector` applied to the vision tower's
            `last_hidden_state`.
        """

        # NOTE(review): all hidden states are requested, but only `last_hidden_state` is read below.
        image_features = self.vision_tower(pixel_values, output_hidden_states=True)
        selected_image_feature = image_features.last_hidden_state
        image_features = self.multi_modal_projector(selected_image_feature)
        return image_features

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, Cohere2VisionModelOutputWithPast]:
        r"""
        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
            Number of patches per input image.
        """
        # Reject both "neither given" and "both given".
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values, image_num_patches=image_num_patches)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)

            if input_ids is None:
                # Without token ids, image placeholders are found by comparing every embedding vector with
                # the embedding of `config.image_token_id`; a position matches only if all entries are equal.
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                special_image_mask = special_image_mask.all(-1)
            else:
                special_image_mask = input_ids == self.config.image_token_id

            # Broadcast the per-position mask over the hidden dimension and overwrite the placeholder
            # embeddings with the projected image features.
            # NOTE(review): nothing here checks that the number of placeholder positions matches the number
            # of image features.
            special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        # The language model only receives embeddings; the image features are already merged into them.
        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return Cohere2VisionModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            # `image_features` only exists when `pixel_values` were given.
            image_hidden_states=image_features if pixel_values is not None else None,
        )
+
+
@auto_docstring(
    custom_intro="""
    The COHERE2_VISION model which consists of a vision backbone and a language model.
    """
)
class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Cohere2VisionConfig):
        super().__init__(config)
        # Base multimodal model plus a bias-free language-modeling head over the text vocabulary.
        self.model = Cohere2VisionModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        image_num_patches: torch.Tensor,
    ):
        """Delegate to `Cohere2VisionModel.get_image_features` with the same arguments."""
        return self.model.get_image_features(
            pixel_values=pixel_values,
            image_num_patches=image_num_patches,
        )

    # Make modules available through the conditional class for BC
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]:
        r"""
        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
            Number of patches per input image.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True)
        >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto")

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
        ...             },
        ...             {"type": "text", "text": "what is in this image?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
        ... ).to(model.device)

        >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3)
        >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ```"""
        # NOTE(review): `image_sizes` is not a named parameter of `Cohere2VisionModel.forward`, so it arrives
        # there through `**kwargs` and is passed on to the language model -- confirm this is intended.
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            image_num_patches=image_num_patches,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        # (an int keeps the last `logits_to_keep` positions -- `0` keeps all of them; a tensor is used as an index)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return Cohere2VisionCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        # `pixel_values` is deliberately left out of the parent call; it is added back below only for the
        # first generation step.
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        # NOTE(review): `cache_position` defaults to `None` but is indexed unconditionally here.
        if cache_position[0] == 0:
            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
            # Otherwise we need pixel values to be passed to model
            model_inputs["pixel_values"] = pixel_values

        return model_inputs
+
+
+__all__ = ["Cohere2VisionForConditionalGeneration", "Cohere2VisionPreTrainedModel", "Cohere2VisionModel"]
diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
new file mode 100644
index 0000000000..90cf7defe7
--- /dev/null
+++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
@@ -0,0 +1,351 @@
+# coding=utf-8
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""PyTorch Cohere2Vision model."""
+
+from functools import lru_cache
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from transformers.models.aya_vision.modeling_aya_vision import (
+ AyaVisionCausalLMOutputWithPast,
+ AyaVisionForConditionalGeneration,
+ AyaVisionModel,
+ AyaVisionModelOutputWithPast,
+)
+from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast
+
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...processing_utils import Unpack
+from ...utils import (
+ TransformersKwargs,
+ auto_docstring,
+ logging,
+)
+from ...utils.generic import check_model_inputs
+from .configuration_cohere2_vision import Cohere2VisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
class Cohere2VisionMultiModalProjector(nn.Module):
    """
    Projects vision-tower features into the text model's hidden size.

    The features are first regrouped by `pixel_shuffle`, then passed through a linear layer, a SiLU-gated
    product of the two halves of that layer's output, and a final linear layer.
    """

    def __init__(self, config: Cohere2VisionConfig):
        super().__init__()
        self.config = config
        self.downsample_factor = config.downsample_factor
        self.intermediate_size = config.alignment_intermediate_size

        # `pixel_shuffle` multiplies the channel count by `downsample_factor**2`.
        shuffled_dim = config.vision_config.hidden_size * (config.downsample_factor**2)
        # The gating in `forward` halves the width of `linear_1`'s output.
        gated_dim = self.intermediate_size // 2

        self.linear_1 = nn.Linear(shuffled_dim, self.intermediate_size, bias=True)
        self.act = nn.SiLU()
        self.linear_2 = nn.Linear(gated_dim, config.text_config.hidden_size, bias=True)

    def pixel_shuffle(self, image_features):  # B, S, D
        """Reshape `(B, S, D)` features into a 4-D grid whose two middle axes are `sqrt(S) / downsample_factor`."""
        batch_size, seq_length, _ = image_features.shape
        # The sequence is treated as a square grid.
        side = int(seq_length**0.5)
        factor = self.downsample_factor
        reduced_side = int(side / factor)

        grid = image_features.reshape(batch_size, side, side, -1)
        channels = grid.shape[-1]
        # Fold `factor` neighbours along the last grid axis into the channel axis ...
        grid = grid.reshape(batch_size, side, reduced_side, int(channels * factor))
        grid = grid.permute(0, 2, 1, 3)
        # ... then do the same along the other grid axis.
        grid = grid.reshape(batch_size, reduced_side, reduced_side, -1)
        return grid.permute(0, 2, 1, 3)

    def forward(self, image_features):
        """Apply pixel shuffle, the first linear layer, the SiLU gate and the second linear layer."""
        shuffled = self.pixel_shuffle(image_features)

        # Split along last dimension and apply SwiGLU
        value, gate = self.linear_1(shuffled).chunk(2, dim=-1)
        return self.linear_2(self.act(gate) * value)
+
+
class Cohere2VisionModelOutputWithPast(AyaVisionModelOutputWithPast):
    # Same fields as the AyaVision output class; subclassed only to expose it under the Cohere2Vision name.
    pass
+
+
class Cohere2VisionCausalLMOutputWithPast(AyaVisionCausalLMOutputWithPast):
    # Same fields as the AyaVision output class; subclassed only to expose it under the Cohere2Vision name.
    pass
+
+
class Cohere2VisionModel(AyaVisionModel):
    # Overrides only the image-feature extraction and `forward` of the AyaVision base model; everything else
    # is inherited.
    _checkpoint_conversion_mapping = {}

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        image_num_patches: torch.Tensor,
    ):
        """
        Obtains the image last hidden state from the vision tower and applies the multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor`):
                The tensors corresponding to the input images; passed unchanged to the vision tower.
            image_num_patches (`torch.Tensor` of shape `(num_images,)`):
                Number of patches for each image. Not used by this method.
        Returns:
            `torch.Tensor`: The output of `multi_modal_projector` applied to the vision tower's
            `last_hidden_state`.
        """

        # NOTE(review): all hidden states are requested, but only `last_hidden_state` is read below.
        image_features = self.vision_tower(pixel_values, output_hidden_states=True)
        selected_image_feature = image_features.last_hidden_state
        image_features = self.multi_modal_projector(selected_image_feature)
        return image_features

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, Cohere2VisionModelOutputWithPast]:
        r"""
        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
            Number of patches per input image.
        """
        # Reject both "neither given" and "both given".
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values, image_num_patches=image_num_patches)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)

            if input_ids is None:
                # Without token ids, image placeholders are found by comparing every embedding vector with
                # the embedding of `config.image_token_id`; a position matches only if all entries are equal.
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                special_image_mask = special_image_mask.all(-1)
            else:
                special_image_mask = input_ids == self.config.image_token_id

            # Broadcast the per-position mask over the hidden dimension and overwrite the placeholder
            # embeddings with the projected image features.
            # NOTE(review): nothing here checks that the number of placeholder positions matches the number
            # of image features.
            special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        # The language model only receives embeddings; the image features are already merged into them.
        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return Cohere2VisionModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            # `image_features` only exists when `pixel_values` were given.
            image_hidden_states=image_features if pixel_values is not None else None,
        )
+
+
class Cohere2VisionForConditionalGeneration(AyaVisionForConditionalGeneration):
    # Overrides only `get_image_features` and `forward` of the AyaVision class; everything else is inherited.
    _checkpoint_conversion_mapping = {}

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        image_num_patches: torch.Tensor,
    ):
        """Delegate to `self.model.get_image_features` with the same arguments."""
        return self.model.get_image_features(
            pixel_values=pixel_values,
            image_num_patches=image_num_patches,
        )

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]:
        r"""
        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
            Number of patches per input image.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True)
        >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto")

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
        ...             },
        ...             {"type": "text", "text": "what is in this image?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
        ... ).to(model.device)

        >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3)
        >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ```"""
        # NOTE(review): `image_sizes` is not a named parameter of `Cohere2VisionModel.forward`, so it arrives
        # there through `**kwargs` and is passed on to the language model -- confirm this is intended.
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            image_num_patches=image_num_patches,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        # (an int keeps the last `logits_to_keep` positions -- `0` keeps all of them; a tensor is used as an index)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return Cohere2VisionCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )
+
+
@lru_cache(maxsize=10)
def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]:
    """
    Computes all allowed tile arrangements for a given maximum number of input tiles.

    Every `(width, height)` pair of positive tile counts whose product does not exceed `max_image_tiles`
    is returned, ordered by width first and height second.

    Args:
        max_image_tiles (`int`):
            The maximum number of tiles allowed.

    Returns:
        `list[tuple[int, int]]`: A list of tuples, each tuple representing a valid (width, height)
        configuration in terms of number of tiles. The list is cached, so callers must not mutate it.

    Example:
        >>> get_all_supported_aspect_ratios(4)
        [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)]

    """
    tile_counts = range(1, max_image_tiles + 1)
    return [(width, height) for width in tile_counts for height in tile_counts if width * height <= max_image_tiles]


def get_optimal_tiled_canvas(
    original_image_size: tuple[int, int],
    target_tile_size: tuple[int, int],
    min_image_tiles: int,
    max_image_tiles: int,
) -> tuple[int, int]:
    """
    Selects the tile grid whose canvas fits an image best.

    Each candidate grid of at most `max_image_tiles` tiles spans a canvas of
    `(columns * tile_width, rows * tile_height)` pixels. The scale at which the image fits entirely inside a
    canvas is the smaller of the two per-axis ratios. If the image does not fit any canvas without
    downscaling, the grid needing the least downscaling is chosen; otherwise the grid needing the least
    upscaling is chosen. Ties go to the grid with the fewest tiles.

    Args:
        original_image_size (`tuple[int, int]`):
            `(height, width)` of the image in pixels.
        target_tile_size (`tuple[int, int]`):
            `(height, width)` of one tile in pixels.
        min_image_tiles (`int`):
            Kept for interface compatibility; not used by the selection.
        max_image_tiles (`int`):
            Maximum number of tiles a grid may contain.

    Returns:
        `tuple[int, int]`: The selected grid as `(num_columns, num_rows)`, i.e. tile counts along the width
        and along the height.
    """
    # Smallest grids first; `sorted` is stable and argmin/argmax return the first hit, so ties resolve to
    # the arrangement with the fewest tiles.
    possible_resolutions = sorted(get_all_supported_aspect_ratios(max_image_tiles), key=lambda grid: grid[0] * grid[1])
    image_height, image_width = original_image_size
    patch_size_height, patch_size_width = target_tile_size

    # Grids are (width, height) tile counts, so the canvas sizes and the image size must both be laid out
    # as (width, height) before dividing. Comparing them against (height, width) would hand a wide grid to
    # a tall image.
    candidate_resolutions = np.array(possible_resolutions) * np.array([patch_size_width, patch_size_height])
    original_size = np.array([image_width, image_height])
    required_scale = np.min(candidate_resolutions / original_size, axis=-1)  # [n_resolutions]

    if np.all(required_scale < 1):
        # We are forced to downscale, so try to minimize the amount of downscaling
        best_index = np.argmax(required_scale)
    else:
        # Pick the resolution that requires the least upscaling so that it most closely fits the image;
        # grids that would need downscaling are excluded from the search.
        best_index = np.argmin(np.where(required_scale < 1.0, np.inf, required_scale))
    return possible_resolutions[int(best_index)]
+
+
@auto_docstring
class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):
    # This class body defines no methods; it only overrides class-level defaults of the GotOcr2 fast image
    # processor it inherits from.
    # NOTE(review): `size` presumably doubles as the per-tile size when images are cropped to patches -- confirm.
    size = {"height": 512, "width": 512}
    min_patches = 1
    max_patches = 12
    crop_to_patches = True
    # NOTE(review): not read by any code visible in this chunk -- confirm what it refers to.
    patch_size = 16
+
+
+__all__ = [
+ "Cohere2VisionForConditionalGeneration",
+ "Cohere2VisionPreTrainedModel", # noqa: F822
+ "Cohere2VisionModel",
+ "Cohere2VisionImageProcessorFast",
+]
diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py
new file mode 100644
index 0000000000..b72e1512ea
--- /dev/null
+++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py
@@ -0,0 +1,216 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+# Image keyword arguments accepted by the Cohere2Vision processor: the base ImagesKwargs plus an
+# optional `max_patches` entry. `total=False` makes every key optional.
+class Cohere2VisionImagesKwargs(ImagesKwargs, total=False):
+ max_patches: Optional[int]
+
+
+# Keyword-argument schema passed to `_merge_kwargs` in Cohere2VisionProcessor.__call__.
+class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False):
+ images_kwargs: Cohere2VisionImagesKwargs
+ # Default text kwargs: left padding, padding enabled, and no multimodal token-type ids
+ # unless the caller asks for them. No image defaults are declared here.
+ _defaults = {
+ "text_kwargs": {
+ "padding_side": "left",
+ "padding": True,
+ "return_mm_token_type_ids": False,
+ },
+ }
+
+
+class Cohere2VisionProcessor(ProcessorMixin):
+    r"""
+    Constructs a Cohere2Vision processor which wraps an image processor and a tokenizer into a single processor
+    that exposes both functionalities. See [`~Cohere2VisionProcessor.__call__`] and
+    [`~Cohere2VisionProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`AutoImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        **kwargs,
+    ):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+        # Cache the special tokens used to build the expanded image prompt.
+        self.patch_size = self.image_processor.patch_size
+        self.boi_token = tokenizer.boi_token
+        self.eoi_token = tokenizer.eoi_token
+        self.image_token = tokenizer.image_token
+        self.img_line_break_token = tokenizer.img_line_break_token
+        self.image_token_id = tokenizer.image_token_id
+
+        # Ids of every image-related token; used to build `mm_token_type_ids`.
+        self.image_ids = tokenizer.convert_tokens_to_ids(
+            [
+                self.image_token,
+                self.boi_token,
+                self.eoi_token,
+                self.img_line_break_token,
+            ]
+        )
+
+    def _expand_image_tokens(self, sample, batch_num_patches):
+        """
+        Replace every image token in `sample` with its full tiled expansion.
+
+        Each occurrence consumes one entry of `batch_num_patches` and becomes
+        `boi_token + (image_token * patch_size**2 + img_line_break_token) * num_patches + eoi_token`.
+
+        Args:
+            sample (`str`):
+                One prompt. Non-string samples are returned unchanged.
+            batch_num_patches (`Iterator`):
+                Iterator over the image processor's `num_patches` output, shared across the whole batch.
+
+        Returns:
+            `str`: The prompt with every image token expanded.
+
+        Raises:
+            ValueError: If `sample` contains more image tokens than `batch_num_patches` has entries left.
+        """
+        if not isinstance(sample, str) or self.image_token not in sample:
+            return sample
+
+        # One tile contributes `patch_size**2` image tokens followed by a line-break token.
+        tile_block = self.image_token * int(self.patch_size**2) + self.img_line_break_token
+
+        # Splitting on the image token and re-joining expands each occurrence exactly once, left to right,
+        # without needing an intermediate placeholder string.
+        segments = sample.split(self.image_token)
+        expanded = [segments[0]]
+        for segment in segments[1:]:
+            try:
+                num_patches = int(next(batch_num_patches))
+            except StopIteration:
+                raise ValueError(
+                    f"Found more `{self.image_token}` tokens in `text` than entries in the image processor's "
+                    "`num_patches` output."
+                ) from None
+            # At least one tile block is always emitted, even when `num_patches` is 0.
+            expanded.append(f"{self.boi_token}{tile_block * max(num_patches, 1)}{self.eoi_token}")
+            expanded.append(segment)
+        return "".join(expanded)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        **kwargs: Unpack[Cohere2VisionProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to the tokenizer's `__call__` to encode the text. To prepare the vision inputs, this
+        method forwards the `images` and `kwargs` arguments to the image processor's `__call__` if `images` is not
+        `None`, and expands every image token in `text` into the tokens required by the corresponding image.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **mm_token_type_ids** -- 1 for image-related tokens and 0 elsewhere. Returned when
+              `return_mm_token_type_ids=True`.
+
+        Raises:
+            ValueError: If `text` is `None`, or if `text` contains more image tokens than images were processed.
+        """
+        if text is None:
+            raise ValueError("You have to specify text.")
+        elif not isinstance(text, (list, tuple)):
+            text = [text]
+
+        output_kwargs = self._merge_kwargs(
+            Cohere2VisionProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Process images
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            # `num_patches` is only needed to build the prompt, so it is removed from the model inputs.
+            batch_num_patches = iter(image_inputs.pop("num_patches"))
+            text = [self._expand_image_tokens(sample, batch_num_patches) for sample in text]
+
+        # Tokenize without tensor conversion; `BatchFeature` converts everything at the end.
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # Copy so that caller overrides never leak into the class-level defaults.
+            images_kwargs = dict(Cohere2VisionProcessorKwargs._defaults.get("images_kwargs", {}))
+            images_kwargs.update(kwargs)
+
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+
+            token_per_patch = int(self.patch_size**2)
+            # Each patch adds its image tokens plus one line-break token; BOI and EOI add 2 per image.
+            num_image_tokens = [2 + num_patches * (token_per_patch + 1) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image processor input names."""
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(tokenizer_input_names) + list(image_processor_input_names)
+
+
+# Public names exported by this module.
+__all__ = ["Cohere2VisionProcessor"]
diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
index 813bb6061b..8d087ef762 100644
--- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
+++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
@@ -45,7 +45,7 @@ if is_torchvision_available():
from torchvision.transforms import functional as F
-class GotOcr2ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
crop_to_patches (`bool`, *optional*, defaults to `False`):
Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
@@ -76,13 +76,13 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
crop_to_patches = False
min_patches = 1
max_patches = 12
- valid_kwargs = GotOcr2ImageProcessorKwargs
+ valid_kwargs = GotOcr2FastImageProcessorKwargs
- def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]):
+ def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]):
super().__init__(**kwargs)
@auto_docstring
- def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature:
+ def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature:
return super().preprocess(images, **kwargs)
def crop_image_to_patches(
diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py
index 464d54f819..0d55eb3abc 100644
--- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py
@@ -678,7 +678,7 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self,
diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py
index 8e1c616700..8e3963cfab 100644
--- a/src/transformers/models/internvl/modeling_internvl.py
+++ b/src/transformers/models/internvl/modeling_internvl.py
@@ -824,7 +824,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self,
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 92199c9505..e5145554dc 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -341,7 +341,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self,
diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py
index bc61bc55b1..7e8dabef15 100644
--- a/src/transformers/models/mistral3/modeling_mistral3.py
+++ b/src/transformers/models/mistral3/modeling_mistral3.py
@@ -378,7 +378,7 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin)
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self,
diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py
index 65f35c7951..cb99ca8e19 100644
--- a/src/transformers/models/perception_lm/modeling_perception_lm.py
+++ b/src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -316,7 +316,7 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
@can_return_tuple
@auto_docstring
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index df3635b690..3c9d5cd0ee 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -303,7 +303,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin)
self.model.set_decoder(decoder)
def get_decoder(self):
- return self.model.get_decoder
+ return self.model.get_decoder()
def get_image_features(
self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index 2989f0230c..ea3942a08f 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -974,11 +974,13 @@ class OutputRecorder:
target_class (Type): The class (e.g., nn.Module) to which the hook will be attached.
index (Optional[int]): If the output is a tuple/list, optionally record only at a specific index.
layer_name (Optional[str]): Name of the submodule to target (if needed), e.g., "transformer.layer.3.attn".
+ class_name (Optional[str]): Name of the class to which the hook will be attached. Could be the suffix of class name in some cases.
"""
target_class: "type[torch.nn.Module]"
index: Optional[int] = 0
layer_name: Optional[str] = None
+ class_name: Optional[str] = None
def check_model_inputs(func):
@@ -1049,12 +1051,17 @@ def check_model_inputs(func):
for specs in layer_specs:
if not isinstance(specs, OutputRecorder):
index = 0 if "hidden_states" in key else 1
- specs = OutputRecorder(target_class=specs, index=index)
+ class_name = None if not isinstance(specs, str) else specs
+ target_class = specs if not isinstance(specs, str) else None
+ specs = OutputRecorder(target_class=target_class, index=index, class_name=class_name)
capture_tasks.append((key, specs))
for name, module in self.named_modules():
for key, specs in capture_tasks:
- if isinstance(module, specs.target_class):
+ # The second check is for multimodals where only backbone layer suffix is available
+ if (specs.target_class is not None and isinstance(module, specs.target_class)) or (
+ specs.class_name is not None and name.endswith(specs.class_name)
+ ):
if specs.layer_name is not None and specs.layer_name not in name:
continue
# Monkey patch forward
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index ac3eba924c..e193afc513 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -167,7 +167,10 @@ def list_repo_templates(
return [
entry.path.removeprefix(f"{CHAT_TEMPLATE_DIR}/")
for entry in list_repo_tree(
- repo_id=repo_id, revision=revision, path_in_repo=CHAT_TEMPLATE_DIR, recursive=False
+ repo_id=repo_id,
+ revision=revision,
+ path_in_repo=CHAT_TEMPLATE_DIR,
+ recursive=False,
)
if entry.path.endswith(".jinja")
]
diff --git a/tests/models/cohere2_vision/__init__.py b/tests/models/cohere2_vision/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py
new file mode 100644
index 0000000000..7ab3bf70d5
--- /dev/null
+++ b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py
@@ -0,0 +1,192 @@
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+
+if is_torch_available():
+ import torch
+
+if is_vision_available():
+ from PIL import Image
+
+ if is_torchvision_available():
+ from transformers import Cohere2VisionImageProcessorFast
+
+
+class Cohere2VisionImageProcessingTester(unittest.TestCase):
+    """
+    Holds the image processor settings and random-input parameters used by the Cohere2Vision image processor tests.
+
+    Args:
+        parent: The test case that owns this tester.
+        batch_size (`int`): Number of images generated by `prepare_image_inputs`.
+        num_channels (`int`): Number of channels of the generated images.
+        image_size (`int`): Stored on the tester; not used by the methods defined here.
+        min_resolution (`int`), max_resolution (`int`): Resolution bounds forwarded to `prepare_image_inputs`.
+        do_resize, size, do_normalize, image_mean, image_std, do_convert_rgb:
+            Values returned by `prepare_image_processor_dict`. `size`, `image_mean` and `image_std` default to
+            freshly built objects when left as `None`.
+    """
+
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=None,
+        image_std=None,
+        do_convert_rgb=True,
+    ):
+        super().__init__()
+        # Mutable defaults are created per instance so that they are never shared between testers.
+        size = size if size is not None else {"height": 30, "width": 30}
+        image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
+        image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_image_processor_dict(self):
+        """Return the keyword arguments used to instantiate the image processor under test."""
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        """Generate a batch of images through the shared `prepare_image_inputs` helper using this tester's settings."""
+        return prepare_image_inputs(
+            batch_size=self.batch_size,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+
+
+@require_torch
+@require_vision
+class Cohere2VisionProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    # Only the fast (torchvision-backed) image processor exists for this model.
+    fast_image_processing_class = Cohere2VisionImageProcessorFast if is_torchvision_available() else None
+    test_slow_image_processor = False
+
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = Cohere2VisionImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def _check_pixel_value_shapes(self, image_processor, image_inputs, num_channels=3, **call_kwargs):
+        # Shared assertion: one image yields 10 patches and the 7-image batch yields 70, each 30x30.
+        single_output = image_processor(image_inputs[0], return_tensors="pt", **call_kwargs).pixel_values
+        self.assertEqual(tuple(single_output.shape), (10, num_channels, 30, 30))
+
+        batched_output = image_processor(image_inputs, return_tensors="pt", **call_kwargs).pixel_values
+        self.assertEqual(tuple(batched_output.shape), (70, num_channels, 30, 30))
+
+    def test_image_processor_properties(self):
+        expected_attributes = ("do_resize", "size", "do_normalize", "image_mean", "image_std", "do_convert_rgb")
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)
+            for attribute in expected_attributes:
+                self.assertTrue(hasattr(image_processor, attribute))
+
+    def test_call_pil(self):
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # Random PIL images, all with the same resolution.
+            pil_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
+            for pil_image in pil_images:
+                self.assertIsInstance(pil_image, Image.Image)
+
+            self._check_pixel_value_shapes(image_processing, pil_images)
+
+    def test_call_numpy(self):
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # Random numpy arrays, all with the same resolution.
+            numpy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
+            for numpy_image in numpy_images:
+                self.assertIsInstance(numpy_image, np.ndarray)
+
+            self._check_pixel_value_shapes(image_processing, numpy_images)
+
+    def test_call_pytorch(self):
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # Random PyTorch tensors, all with the same resolution.
+            torch_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
+            for torch_image in torch_images:
+                self.assertIsInstance(torch_image, torch.Tensor)
+
+            self._check_pixel_value_shapes(image_processing, torch_images)
+
+    def test_call_numpy_4_channels(self):
+        # Images with an arbitrary number of channels must be processed as well.
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)
+
+            self.image_processor_tester.num_channels = 4
+            four_channel_images = self.image_processor_tester.prepare_image_inputs(
+                equal_resolution=True, numpify=True
+            )
+
+            self._check_pixel_value_shapes(
+                image_processor,
+                four_channel_images,
+                num_channels=4,
+                input_data_format="channels_last",
+                image_mean=0,
+                image_std=1,
+            )
diff --git a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
new file mode 100644
index 0000000000..4e49baa303
--- /dev/null
+++ b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
@@ -0,0 +1,470 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Cohere2Vision model."""
+
+import unittest
+
+from transformers import (
+ AutoProcessor,
+ Cohere2VisionConfig,
+ is_torch_available,
+)
+from transformers.testing_utils import (
+ Expectations,
+ cleanup,
+ get_device_properties,
+ require_deterministic_for_xpu,
+ require_read_token,
+ require_torch,
+ require_torch_accelerator,
+ slow,
+ torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+ import torch
+
+ from transformers import (
+ Cohere2VisionForConditionalGeneration,
+ Cohere2VisionModel,
+ )
+
+
+class Cohere2VisionText2TextModelTester:
+    """
+    Builds a tiny `Cohere2VisionConfig` and matching random inputs for the common model tests.
+
+    Args:
+        parent: The test case that owns this tester.
+        batch_size (`int`): Number of samples in the generated inputs.
+        seq_length (`int`): Number of text positions; `image_seq_length` (16) is added on top of it.
+        downsample_factor (`int`), alignment_intermediate_size (`int`), image_token_id (`int`):
+            Forwarded to `Cohere2VisionConfig`.
+        ignore_index (`int`), is_training (`bool`): Stored on the tester.
+        num_channels (`int`), image_size (`int`): Shape of the generated `pixel_values`.
+        text_config (`dict`, *optional*): Text backbone config; a small `cohere2` config is built when `None`.
+        vision_config (`dict`, *optional*): Vision backbone config; a small `siglip_vision_model` config is built
+            when `None`.
+    """
+
+    def __init__(
+        self,
+        parent,
+        batch_size=3,
+        seq_length=7,
+        downsample_factor=2,
+        alignment_intermediate_size=32,
+        ignore_index=-100,
+        image_token_id=2,
+        num_channels=3,
+        image_size=64,
+        is_training=True,
+        text_config=None,
+        vision_config=None,
+    ):
+        # The default configs are built per instance so that a tester (or a config created from it)
+        # can never modify a dict shared with other testers.
+        if text_config is None:
+            text_config = {
+                "model_type": "cohere2",
+                "vocab_size": 99,
+                "hidden_size": 128,
+                "intermediate_size": 37,
+                "num_hidden_layers": 4,
+                "num_attention_heads": 4,
+                "output_channels": 64,
+                "hidden_act": "silu",
+                "max_position_embeddings": 512,
+                "tie_word_embeddings": True,
+                "bos_token_id": 0,
+                "eos_token_id": 0,
+                "pad_token_id": 0,
+            }
+        if vision_config is None:
+            vision_config = {
+                "model_type": "siglip_vision_model",
+                "hidden_size": 32,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 4,
+                "intermediate_size": 128,
+                "image_size": 64,
+                "patch_size": 8,
+                "vision_use_head": False,
+            }
+
+        self.parent = parent
+        self.ignore_index = ignore_index
+        self.bos_token_id = text_config["bos_token_id"]
+        self.eos_token_id = text_config["eos_token_id"]
+        self.pad_token_id = text_config["pad_token_id"]
+        self.image_token_id = image_token_id
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.batch_size = batch_size
+        self.downsample_factor = downsample_factor
+        self.alignment_intermediate_size = alignment_intermediate_size
+        self.is_training = is_training
+        self.num_channels = num_channels
+        self.image_size = image_size
+        # The first `image_seq_length` positions of every sequence are filled with image tokens.
+        self.image_seq_length = 16
+        self.seq_length = seq_length + self.image_seq_length
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+
+    def get_config(self):
+        """Return a `Cohere2VisionConfig` built from this tester's settings."""
+        return Cohere2VisionConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            image_token_id=self.image_token_id,
+            downsample_factor=self.downsample_factor,
+            alignment_intermediate_size=self.alignment_intermediate_size,
+        )
+
+    def prepare_config_and_inputs(self):
+        """Return the config, random `pixel_values` and a tensor with one patch per image."""
+        config = self.get_config()
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        image_num_patches = torch.tensor([1] * self.batch_size).to(torch_device)
+
+        return config, pixel_values, image_num_patches
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return the config and the input dict consumed by the common model tests."""
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values, image_num_patches = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+        # Remove randomly drawn image tokens first, then place them only in the leading image positions.
+        input_ids[input_ids == self.image_token_id] = self.pad_token_id
+        input_ids[:, : self.image_seq_length] = self.image_token_id
+
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "image_num_patches": image_num_patches,
+        }
+        return config, inputs_dict
+
+
+# Common model, generation and pipeline tests for Cohere2Vision, configured through class attributes
+# that the imported tester mixins read.
+@require_torch
+class Cohere2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+ # Model classes covered by the common tests; empty when torch is not installed.
+ all_model_classes = (
+ (
+ Cohere2VisionModel,
+ Cohere2VisionForConditionalGeneration,
+ )
+ if is_torch_available()
+ else ()
+ )
+ # Only the conditional-generation head is listed as generative.
+ all_generative_model_classes = (Cohere2VisionForConditionalGeneration,) if is_torch_available() else ()
+ pipeline_model_mapping = (
+ {
+ "image-text-to-text": Cohere2VisionForConditionalGeneration,
+ }
+ if is_torch_available()
+ else {}
+ )
+ # Feature flags for the common test mixins; all of these are switched off for this model.
+ fx_compatible = False
+ test_pruning = False
+ test_torchscript = False
+ test_head_masking = False
+ _is_composite = True
+
+ def setUp(self):
+ self.model_tester = Cohere2VisionText2TextModelTester(self)
+ # has_text_modality=False is passed to ConfigTester for this composite (text + vision) config.
+ self.config_tester = ConfigTester(self, config_class=Cohere2VisionConfig, has_text_modality=False)
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ # Skipped on purpose; see the reason string for the rationale.
+ @unittest.skip(reason="Siglip backbone uses the same initialization scheme as the Flax original implementation")
+ def test_initialization(self):
+ pass
+
+
+@require_read_token
+@require_torch
+class Cohere2IntegrationTest(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.model_checkpoint = "CohereLabs/command-a-vision-07-2025"
+ cls.model = None
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.model
+ cleanup(torch_device, gc_collect=True)
+
+ def tearDown(self):
+ cleanup(torch_device, gc_collect=True)
+
+ @classmethod
+ def get_model(cls):
+ # Use 4-bit on T4
+ device_type, major, _ = get_device_properties()
+ load_in_4bit = (device_type == "cuda") and (major < 8)
+ torch_dtype = None if load_in_4bit else torch.float16
+
+ if cls.model is None:
+ cls.model = Cohere2VisionForConditionalGeneration.from_pretrained(
+ cls.model_checkpoint,
+ device_map="auto",
+ torch_dtype=torch_dtype,
+ load_in_4bit=load_in_4bit,
+ )
+ return cls.model
+
+ @slow
+ @require_torch_accelerator
+ def test_model_integration_forward(self):
+ processor = AutoProcessor.from_pretrained(self.model_checkpoint)
+ model = self.get_model()
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+ {"type": "text", "text": "Please describe the image explicitly."},
+ ],
+ }
+ ]
+
+ inputs = processor.apply_chat_template(
+ messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+ ).to(torch_device, dtype=torch.float16)
+ # Forward
+ with torch.inference_mode():
+ output = model(**inputs)
+
+ actual_logits = output.logits[0, -1, :5].cpu()
+
+ EXPECTED_LOGITS = Expectations(
+ {
+ ("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483],
+ # 4-bit
+ ("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],
+ ("cuda", 8): [2.4277, 1.6875, 1.8789, 2.1875, 1.9375],
+ }
+ ) # fmt: skip
+ expected_logits = torch.tensor(EXPECTED_LOGITS.get_expectation(), dtype=torch.float16)
+
+ self.assertTrue(
+ torch.allclose(actual_logits, expected_logits, atol=0.1),
+ f"Actual logits: {actual_logits}"
+ f"\nExpected logits: {expected_logits}"
+ f"\nDifference: {torch.abs(actual_logits - expected_logits)}",
+ )
+
+ @slow
+ @require_torch_accelerator
+ @require_deterministic_for_xpu
+ def test_model_integration_generate_text_only(self):
+ processor = AutoProcessor.from_pretrained(self.model_checkpoint)
+ model = self.get_model()
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Write a haiku"},
+ ],
+ }
+ ]
+
+ inputs = processor.apply_chat_template(
+ messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+ ).to(torch_device, dtype=torch.float16)
+ with torch.no_grad():
+ generate_ids = model.generate(**inputs, max_new_tokens=25, do_sample=False)
+ decoded_output = processor.decode(
+ generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+ )
+
+ expected_outputs = Expectations(
+ {
+ ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
+ # 4-bit
+ ("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n",
+ ("cuda", 8): "**Haiku**\n\n*Softly falls the snow*\n*Blanketing the earth in white*\n*",
+ }
+ ) # fmt: skip
+ expected_output = expected_outputs.get_expectation()
+
+ self.assertEqual(decoded_output, expected_output)
+
+ @slow
+ @require_torch_accelerator
+ @require_deterministic_for_xpu
+ def test_model_integration_generate_chat_template(self):
+ processor = AutoProcessor.from_pretrained(self.model_checkpoint)
+ model = self.get_model()
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+ {"type": "text", "text": "Please describe the image explicitly."},
+ ],
+ }
+ ]
+
+ inputs = processor.apply_chat_template(
+ messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+ ).to(torch_device, dtype=torch.float16)
+ with torch.no_grad():
+ generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+ decoded_output = processor.decode(
+ generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
+ )
+
+ expected_outputs = Expectations(
+ {
+ ("xpu", 3): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,',
+ # 4-bit
+ ("cuda", 7): 'The image depicts two cats comfortably resting on a pink blanket spread across a sofa. The cats,',
+ ("cuda", 8): 'The image depicts two cats lying on a bright pink blanket that covers a red couch. The cat',
+ }
+ ) # fmt: skip
+ expected_output = expected_outputs.get_expectation()
+
+ self.assertEqual(decoded_output, expected_output)
+
+ @slow
+ @require_torch_accelerator
+ def test_model_integration_batched_generate(self):
+ processor = AutoProcessor.from_pretrained(self.model_checkpoint)
+ model = self.get_model()
+ # Prepare inputs
+ messages = [
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+ {"type": "text", "text": "Write a haiku for this image"},
+ ],
+ },
+ ],
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+ {"type": "text", "text": "Describe this image"},
+ ],
+ },
+ ],
+ ]
+ inputs = processor.apply_chat_template(
+ messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+ ).to(model.device, dtype=torch.float16)
+
+ output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
+
+ # Check first output
+ decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+ expected_outputs = Expectations(
+ {
+ ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
+ # 4-bit
+ ("cuda", 7): "Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene",
+ ("cuda", 8): 'Dock stretches to calm, \nMountains whisper through the trees, \nLake mirrors the sky.',
+ }
+ ) # fmt: skip
+ expected_output = expected_outputs.get_expectation()
+
+ self.assertEqual(
+ decoded_output,
+ expected_output,
+ f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
+ )
+
+ # Check second output
+ decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+
+ expected_outputs = Expectations(
+ {
+ ("xpu", 3): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
+ # 4-bit
+ ("cuda", 7): 'This vibrant image captures a bustling street scene in a multicultural urban area, featuring a traditional Chinese gate adorned with intricate red and',
+ ("cuda", 8): 'The image depicts a vibrant street scene in what appears to be a Chinatown district, likely in an urban area. The focal',
+ }
+ ) # fmt: skip
+ expected_output = expected_outputs.get_expectation()
+
+ self.assertEqual(
+ decoded_output,
+ expected_output,
+ f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
+ )
+
+ @slow
+ @require_torch_accelerator
+ @require_deterministic_for_xpu
+ def test_model_integration_batched_generate_multi_image(self):
+ processor = AutoProcessor.from_pretrained(self.model_checkpoint)
+ model = self.get_model()
+ # Prepare inputs
+ messages = [
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+ {"type": "text", "text": "Write a haiku for this image"},
+ ],
+ },
+ ],
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+ },
+ {
+ "type": "image",
+ "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
+ },
+ {
+ "type": "text",
+ "text": "These images depict two different landmarks. Can you identify them?",
+ },
+ ],
+ },
+ ],
+ ]
+ inputs = processor.apply_chat_template(
+ messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+ ).to(model.device, dtype=torch.float16)
+ output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
+
+ # Check first output
+ decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+ # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
+ expected_outputs = Expectations(
+ {
+ ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
+ ("cuda", 7): 'Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene',
+ ("cuda", 8): 'Dock stretches to calm, \nMountains whisper through the trees, \nLake mirrors the sky.',
+ }
+ ) # fmt: skip
+ expected_output = expected_outputs.get_expectation()
+
+ self.assertEqual(
+ decoded_output,
+ expected_output,
+ f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
+ )
+
+ # Check second output
+ decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+ expected_outputs = Expectations(
+ {
+ ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
+ ("cuda", 7): 'The first image showcases the Statue of Liberty, a monumental sculpture located on Liberty Island in New York Harbor. Standing atop a',
+ ("cuda", 8): 'The two landmarks depicted in the images are the Statue of Liberty and the Golden Gate Bridge. \n\n1. **Statue',
+ }
+ ) # fmt: skip
+ expected_output = expected_outputs.get_expectation()
+
+ self.assertEqual(
+ decoded_output,
+ expected_output,
+ f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
+ )
diff --git a/tests/models/cohere2_vision/test_processing_cohere2_vision.py b/tests/models/cohere2_vision/test_processing_cohere2_vision.py
new file mode 100644
index 0000000000..6573611423
--- /dev/null
+++ b/tests/models/cohere2_vision/test_processing_cohere2_vision.py
@@ -0,0 +1,139 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import tempfile
+import unittest
+
+from transformers import AutoProcessor, AutoTokenizer, Cohere2VisionProcessor
+from transformers.testing_utils import require_read_token, require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_torch_available():
+ import torch
+
+if is_torchvision_available():
+ from transformers import Cohere2VisionImageProcessorFast
+
+
+@require_read_token
+@require_vision
+@unittest.skip("Model not released yet!")
+class Cohere2VisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+ processor_class = Cohere2VisionProcessor
+
+ @classmethod
+ def setUpClass(cls):
+ cls.tmpdirname = tempfile.mkdtemp()
+ image_processor = Cohere2VisionImageProcessorFast(
+ size={"height": 20, "width": 20},
+ max_patches=3,
+ )
+ tokenizer = AutoTokenizer.from_pretrained("CohereLabs/command-a-vision-07-2025")
+
+ processor_kwargs = cls.prepare_processor_dict()
+ processor = Cohere2VisionProcessor(
+ image_processor=image_processor,
+ tokenizer=tokenizer,
+ **processor_kwargs,
+ )
+ processor.save_pretrained(cls.tmpdirname)
+ cls.image_token = processor.image_token
+
+ @staticmethod
+ def prepare_processor_dict():
+ return {}
+
+ def get_tokenizer(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+ def get_image_processor(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+ def get_processor(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+ @classmethod
+ def tearDownClass(cls):
+ shutil.rmtree(cls.tmpdirname, ignore_errors=True)
+
+ @require_torch
+ def test_process_interleaved_images_videos(self):
+ processor = self.get_processor()
+
+ messages = [
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+ },
+ {
+ "type": "image",
+ "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
+ },
+ {"type": "text", "text": "What are the differences between these two images?"},
+ ],
+ },
+ ],
+ [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://llava-vl.github.io/static/images/view.jpg",
+ },
+ {"type": "text", "text": "Write a haiku for this image"},
+ ],
+ }
+ ],
+ ]
+
+ inputs_batched = processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt",
+ padding=True,
+ )
+
+ # Process non batched inputs to check if the pixel_values and input_ids are reconstructed in the correct order when batched together
+ images_patches_index = 0
+ for i, message in enumerate(messages):
+ inputs = processor.apply_chat_template(
+ message,
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ return_tensors="pt",
+ padding=True,
+ )
+ # We slice with [-inputs["input_ids"].shape[1] :] as the input_ids are left padded
+ torch.testing.assert_close(
+ inputs["input_ids"][0], inputs_batched["input_ids"][i][-inputs["input_ids"].shape[1] :]
+ )
+ torch.testing.assert_close(
+ inputs["pixel_values"],
+ inputs_batched["pixel_values"][
+ images_patches_index : images_patches_index + inputs["pixel_values"].shape[0]
+ ],
+ )
+ images_patches_index += inputs["pixel_values"].shape[0]
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index fcc47466a3..e60537e302 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4677,9 +4677,13 @@ class ModelTesterMixin:
sub_config = getattr(config, key)
update_config_for_flex(sub_config)
- model = model_class(config).to(device=torch_device)
- model.set_attn_implementation("flex_attention")
- self.assertTrue(model.config._attn_implementation == "flex_attention")
+ if model_class._can_set_attn_implementation():
+ model = model_class(config).to(device=torch_device)
+ model.set_attn_implementation("flex_attention")
+ self.assertTrue(model.config._attn_implementation == "flex_attention")
+ else:
+ config._attn_implementation = "flex_attention"
+ model = model_class(config).to(device=torch_device)
# Elaborate workaround for encoder-decoder models as some do not specify their main input
dummy_inputs = {model.main_input_name: inputs_dict[model.main_input_name].to(torch_device)}