From 6f259bc83e518c281877cfc90efe61bf8a79bba0 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Thu, 14 Aug 2025 16:29:53 +0100 Subject: [PATCH] Fix docs typo (#40167) * DINOv3 model * working version * linter revert * linter revert * linter revert * fix init * remove flex and add convert to hf script * DINOv3 convnext * working version of convnext * adding to auto * Dinov3 -> DINOv3 * PR feedback * complete convert checkpoint * fix assertion * bf16 -> fp32 * add fast image processor * fixup * change conversion script * Use Pixtral attention * minor renaming * simplify intermediates capturing * refactor DINOv3ViTPatchEmbeddings * Refactor DINOv3ViTEmbeddings * [WIP] rope: remove unused params * [WIP] rope: rename period -> inv_freq for consistency * [WIP] rope: move augs * change inv_freq init (not persistent anymore) * [WIP] rope: move coords to init * rope - done! * use default LayerScale * conversion: truncate expected outputs * remove commented code * Refactor MLP layers * nit * clean up config params * nit docs * simplify embeddings * simplify compile compat lru_cache * fixup * dynamic patch coords * move augmentation * Fix docs * fixup and type hints * fix output capturing * fix tests * fixup * fix auto mappings * Add draft docs * fix dtype cast issue * add push to hub * add image processor tests * fixup * add modular * update modular * convert and test convnext * update conversion script * update prefix * Update LayerNorm * refactor DINOv3ConvNextLayer * rename * refactor convnext model * fix doc check * fix docs * fix convnext config * tmp fix for check docstring * remove unused arg * fix tests * (nit) change init * standardize gated MLP * clear namings and sat493m * fix tensors on different devices * revert linter * pr * pr feedbak ruff format * missing headers * fix code snippet and collection link in docs * DINOv3 description * fix checkpoints in tests * not doc fixes in configs * output_hidden_states * x -> features * remove sequential --------- 
Co-authored-by: Cijo Jose --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/dinov3.md | 181 ++++++ src/transformers/models/__init__.py | 2 + .../models/auto/configuration_auto.py | 4 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 4 + .../models/dinov3_convnext/__init__.py | 27 + .../configuration_dinov3_convnext.py | 103 ++++ .../convert_dinov3_convnext_to_hf.py | 234 ++++++++ .../modeling_dinov3_convnext.py | 261 +++++++++ .../models/dinov3_vit/__init__.py | 28 + .../dinov3_vit/configuration_dinov3_vit.py | 166 ++++++ .../dinov3_vit/convert_dinov3_vit_to_hf.py | 337 +++++++++++ .../image_processing_dinov3_vit_fast.py | 104 ++++ .../models/dinov3_vit/modeling_dinov3_vit.py | 538 ++++++++++++++++++ .../models/dinov3_vit/modular_dinov3_vit.py | 429 ++++++++++++++ src/transformers/pytorch_utils.py | 18 +- src/transformers/utils/fx.py | 2 + tests/models/dinov3_convnext/__init__.py | 0 .../test_modeling_dinov3_convnext.py | 242 ++++++++ tests/models/dinov3_vit/__init__.py | 0 .../test_image_processing_dinov3_vit_fast.py | 127 +++++ .../dinov3_vit/test_modeling_dinov3_vit.py | 278 +++++++++ utils/check_docstrings.py | 2 + utils/check_repo.py | 2 + 25 files changed, 3081 insertions(+), 11 deletions(-) create mode 100644 docs/source/en/model_doc/dinov3.md create mode 100644 src/transformers/models/dinov3_convnext/__init__.py create mode 100644 src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py create mode 100644 src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py create mode 100644 src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py create mode 100644 src/transformers/models/dinov3_vit/__init__.py create mode 100644 src/transformers/models/dinov3_vit/configuration_dinov3_vit.py create mode 100644 src/transformers/models/dinov3_vit/convert_dinov3_vit_to_hf.py create mode 100644 src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py create 
mode 100644 src/transformers/models/dinov3_vit/modeling_dinov3_vit.py create mode 100644 src/transformers/models/dinov3_vit/modular_dinov3_vit.py create mode 100644 tests/models/dinov3_convnext/__init__.py create mode 100644 tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py create mode 100644 tests/models/dinov3_vit/__init__.py create mode 100644 tests/models/dinov3_vit/test_image_processing_dinov3_vit_fast.py create mode 100644 tests/models/dinov3_vit/test_modeling_dinov3_vit.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 18704b846d..c44a55c622 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -763,6 +763,8 @@ title: DINOV2 - local: model_doc/dinov2_with_registers title: DINOv2 with Registers + - local: model_doc/dinov3 + title: DINOv3 - local: model_doc/dit title: DiT - local: model_doc/dpt diff --git a/docs/source/en/model_doc/dinov3.md b/docs/source/en/model_doc/dinov3.md new file mode 100644 index 0000000000..b3f2067fe9 --- /dev/null +++ b/docs/source/en/model_doc/dinov3.md @@ -0,0 +1,181 @@ + + +
+
+ PyTorch + Flax + FlashAttention + SDPA +
+
+ + +# DINOv3 + +DINOv3 is a family of versatile vision foundation models that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models. + +You can find all the original DINOv3 checkpoints under the [DINOv3](https://huggingface.co/collections/facebook/dinov3-68924841bd6b561778e31009) collection. + +> [!TIP] +> Click on the DINOv3 models in the right sidebar for more examples of how to apply DINOv3 to different vision tasks. + +The example below demonstrates how to obtain an image embedding with [`Pipeline`] or the [`AutoModel`] class. + + + + +```py +import torch +from transformers import pipeline + +pipe = pipeline( + task="image-feature-extraction", + model="facebook/dinov3-vits16-pretrain-lvd1689m", + torch_dtype=torch.bfloat16, +) + +pipe("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg") +``` + + + + +```py +import torch +from transformers import AutoImageProcessor, AutoModel +from transformers.image_utils import load_image + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = load_image(url) + +processor = AutoImageProcessor.from_pretrained("facebook/dinov3-vits16-pretrain-lvd1689m") +model = AutoModel.from_pretrained( + "facebook/dinov3-vits16-pretrain-lvd1689m", + torch_dtype=torch.float16, + device_map="auto", + attn_implementation="sdpa" +) + +inputs = processor(images=image, return_tensors="pt").to(model.device) +with torch.inference_mode(): + outputs = model(**inputs) + +pooled_output = outputs.pooler_output +print("Pooled output shape:", pooled_output.shape) +``` + + + + +Quantization reduces the memory burden of large models by representing the weights in a lower precision. 
Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. + +The example below uses [torchao](../quantization/torchao) to quantize only the weights to int4. + +```py +# pip install torchao +import torch +from transformers import TorchAoConfig, AutoImageProcessor, AutoModel +from torchao.quantization import Int4WeightOnlyConfig +from transformers.image_utils import load_image + + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = load_image(url) + +processor = AutoImageProcessor.from_pretrained("facebook/dinov3-vit7b16-pretrain-lvd1689m") + +quant_type = Int4WeightOnlyConfig(group_size=128) +quantization_config = TorchAoConfig(quant_type=quant_type) + +model = AutoModel.from_pretrained( + "facebook/dinov3-vit7b16-pretrain-lvd1689m", + torch_dtype=torch.bfloat16, + device_map="auto", + quantization_config=quantization_config +) + +inputs = processor(images=image, return_tensors="pt").to(model.device) +with torch.inference_mode(): + outputs = model(**inputs) + +pooled_output = outputs.pooler_output +print("Pooled output shape:", pooled_output.shape) +``` + +## Notes + +- The example below shows how to split the output tensor into: + - one embedding for the whole image, commonly referred to as a `CLS` token, + useful for classification and retrieval + - register tokens - learnable embeddings that act as dedicated “memory slots” for global information; + they reduce high-norm artifacts in patch tokens, yielding cleaner attention maps and better + performance on dense prediction tasks. 
+ - a set of local embeddings, one for each `16x16` patch of the input image, + useful for dense tasks, such as semantic segmentation + + ```py + import torch + from transformers import AutoImageProcessor, AutoModel + from transformers.image_utils import load_image + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = load_image(url) + print("Image size:", image.height, image.width) # [480, 640] + + processor = AutoImageProcessor.from_pretrained("facebook/dinov3-vits16-pretrain-lvd1689m") + model = AutoModel.from_pretrained("facebook/dinov3-vits16-pretrain-lvd1689m") + patch_size = model.config.patch_size + print("Patch size:", patch_size) # 16 + print("Num register tokens:", model.config.num_register_tokens) # 4 + + inputs = processor(images=image, return_tensors="pt") + print("Preprocessed image size:", inputs.pixel_values.shape) # [1, 3, 224, 224] + + batch_size, _, img_height, img_width = inputs.pixel_values.shape + num_patches_height, num_patches_width = img_height // patch_size, img_width // patch_size + num_patches_flat = num_patches_height * num_patches_width + + with torch.inference_mode(): + outputs = model(**inputs) + + last_hidden_states = outputs.last_hidden_state + print(last_hidden_states.shape) # [1, 1 + 4 + 256, 384] + assert last_hidden_states.shape == (batch_size, 1 + model.config.num_register_tokens + num_patches_flat, model.config.hidden_size) + + cls_token = last_hidden_states[:, 0, :] + patch_features_flat = last_hidden_states[:, 1 + model.config.num_register_tokens:, :] + patch_features = patch_features_flat.unflatten(1, (num_patches_height, num_patches_width)) + ``` + +## DINOv3ViTConfig + +[[autodoc]] DINOv3ViTConfig + +## DINOv3ConvNextConfig + +[[autodoc]] DINOv3ConvNextConfig + +## DINOv3ViTModel + +[[autodoc]] DINOv3ViTModel + - forward + +## DINOv3ConvNextModel + +[[autodoc]] DINOv3ConvNextModel + - forward + +## DINOv3ViTImageProcessorFast + +[[autodoc]] DINOv3ViTImageProcessorFast + - preprocess diff --git 
a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 662edfe269..ea1d5488a6 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -99,6 +99,8 @@ if TYPE_CHECKING: from .dinat import * from .dinov2 import * from .dinov2_with_registers import * + from .dinov3_convnext import * + from .dinov3_vit import * from .distilbert import * from .dit import * from .donut import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 01c85be864..57f074590e 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -117,6 +117,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str]( ("dinat", "DinatConfig"), ("dinov2", "Dinov2Config"), ("dinov2_with_registers", "Dinov2WithRegistersConfig"), + ("dinov3_convnext", "DINOv3ConvNextConfig"), + ("dinov3_vit", "DINOv3ViTConfig"), ("distilbert", "DistilBertConfig"), ("doge", "DogeConfig"), ("donut-swin", "DonutSwinConfig"), @@ -525,6 +527,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str]( ("dinat", "DiNAT"), ("dinov2", "DINOv2"), ("dinov2_with_registers", "DINOv2 with Registers"), + ("dinov3_convnext", "DINOv3 ConvNext"), + ("dinov3_vit", "DINOv3 ViT"), ("distilbert", "DistilBERT"), ("dit", "DiT"), ("doge", "Doge"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 9a983d68f8..1e3e0aa178 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -88,6 +88,7 @@ else: ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinov2", ("BitImageProcessor", "BitImageProcessorFast")), + ("dinov3_vit", (None, "DINOv3ViTImageProcessorFast")), ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")), ("dpt", ("DPTImageProcessor", 
"DPTImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor", None)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b9fe86a45b..7d8d215712 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -121,6 +121,8 @@ MODEL_MAPPING_NAMES = OrderedDict( ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dinov2_with_registers", "Dinov2WithRegistersModel"), + ("dinov3_convnext", "DINOv3ConvNextModel"), + ("dinov3_vit", "DINOv3ViTModel"), ("distilbert", "DistilBertModel"), ("doge", "DogeModel"), ("donut-swin", "DonutSwinModel"), @@ -746,6 +748,8 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict( ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dinov2_with_registers", "Dinov2WithRegistersModel"), + ("dinov3_convnext", "DINOv3ConvNextModel"), + ("dinov3_vit", "DINOv3ViTModel"), ("dpt", "DPTModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), diff --git a/src/transformers/models/dinov3_convnext/__init__.py b/src/transformers/models/dinov3_convnext/__init__.py new file mode 100644 index 0000000000..8839dc7cec --- /dev/null +++ b/src/transformers/models/dinov3_convnext/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinov3_convnext import * + from .modeling_dinov3_convnext import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py b/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py new file mode 100644 index 0000000000..fa593e10ec --- /dev/null +++ b/src/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DINOv3 ConvNext model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DINOv3ConvNextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DINOv3ConvNextModel`]. It is used to instantiate a + DINOv3ConvNext model according to the specified arguments, defining the model architecture. 
Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv3ConvNext + [facebook/dinov3-convnext-tiny-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-convnext-tiny-pretrain-lvd1689m) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_sizes (`list[int]`, *optional*, defaults to [96, 192, 384, 768]): + Dimensionality (hidden size) at each stage. + depths (`list[int]`, *optional*, defaults to [3, 3, 9, 3]): + The number of layers for each stage. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + layer_scale_init_value (`float`, *optional*, defaults to 1e-06): + The initial value for the layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop rate for stochastic depth. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of input images. 
+ + Example: + ```python + >>> from transformers import DINOv3ConvNextConfig, DINOv3ConvNextModel + + >>> # Initializing a DINOv3ConvNext (tiny variant) style configuration + >>> config = DINOv3ConvNextConfig() + + >>> # Initializing a model (with random weights) + >>> model = DINOv3ConvNextModel(config) + + >>> # Accessing the model config + >>> config = model.config + ```""" + + model_type = "dinov3_convnext" + + def __init__( + self, + num_channels: int = 3, + hidden_sizes: Optional[list[int]] = None, + depths: Optional[list[int]] = None, + hidden_act: str = "gelu", + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-6, + layer_scale_init_value: float = 1e-6, + drop_path_rate: float = 0.0, + image_size: int = 224, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.hidden_sizes = [96, 192, 384, 768] if hidden_sizes is None else hidden_sizes + self.depths = [3, 3, 9, 3] if depths is None else depths + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.layer_scale_init_value = layer_scale_init_value + self.drop_path_rate = drop_path_rate + self.image_size = image_size + + @property + def num_stages(self) -> int: + return len(self.hidden_sizes) + + +__all__ = ["DINOv3ConvNextConfig"] diff --git a/src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py b/src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py new file mode 100644 index 0000000000..0ba200936e --- /dev/null +++ b/src/transformers/models/dinov3_convnext/convert_dinov3_convnext_to_hf.py @@ -0,0 +1,234 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DINOv3 checkpoints from the original repository. + +URL: https://github.com/facebookresearch/dinov3/tree/main +""" + +import argparse +import os +import re +from typing import Optional + +import requests +import torch +from huggingface_hub import HfApi, hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import DINOv3ConvNextConfig, DINOv3ConvNextModel, DINOv3ViTImageProcessorFast + + +HUB_MODELS = { + "convnext_tiny": "facebook/dinov3-convnext-tiny-pretrain-lvd1689m", + "convnext_small": "facebook/dinov3-convnext-small-pretrain-lvd1689m", + "convnext_base": "facebook/dinov3-convnext-base-pretrain-lvd1689m", + "convnext_large": "facebook/dinov3-convnext-large-pretrain-lvd1689m", +} + +HUB_CHECKPOINTS = { + "convnext_tiny": "dinov3_convnext_tiny_pretrain_lvd1689m-21b726bb.pth", + "convnext_small": "dinov3_convnext_small_pretrain_lvd1689m-296db49d.pth", + "convnext_base": "dinov3_convnext_base_pretrain_lvd1689m-801f2ba9.pth", + "convnext_large": "dinov3_convnext_large_pretrain_lvd1689m-61fa432d.pth", +} + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"dwconv": r"depthwise_conv", + r"pwconv": r"pointwise_conv", + r"norm": r"layer_norm", + r"stages.(\d+).(\d+)": r"stages.\1.layers.\2", + r"downsample_layers.(\d+).(\d+)": r"stages.\1.downsample_layers.\2", +} +# fmt: on + + +def get_dinov3_config(model_name: str) -> DINOv3ConvNextConfig: + # size of the architecture + if model_name == "convnext_tiny": + return DINOv3ConvNextConfig( + depths=[3, 3, 9, 3], + hidden_sizes=[96, 192, 384, 768], + ) + 
elif model_name == "convnext_small": + return DINOv3ConvNextConfig( + depths=[3, 3, 27, 3], + hidden_sizes=[96, 192, 384, 768], + ) + elif model_name == "convnext_base": + return DINOv3ConvNextConfig( + depths=[3, 3, 27, 3], + hidden_sizes=[128, 256, 512, 1024], + ) + elif model_name == "convnext_large": + return DINOv3ConvNextConfig( + depths=[3, 3, 27, 3], + hidden_sizes=[192, 384, 768, 1536], + ) + else: + raise ValueError("Model not supported") + + +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + +def get_transform(resize_size: int = 224): + to_tensor = transforms.ToTensor() + resize = transforms.Resize((resize_size, resize_size), antialias=True) + normalize = transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ) + return transforms.Compose([to_tensor, resize, normalize]) + + +def get_image_processor(resize_size: int = 224): + return DINOv3ViTImageProcessorFast( + do_resize=True, + size={"height": resize_size, "width": resize_size}, + resample=2, # BILINEAR + ) + + +def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): + """ + This function should be applied only once, on the concatenated keys to efficiently rename using + the key mappings. 
+ """ + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + + +@torch.no_grad() +def convert_and_test_dinov3_checkpoint(args): + expected_outputs = { + "convnext_tiny_cls": [-6.372119, 1.300791, 2.074303, -0.079975, 0.607205], + "convnext_tiny_patch": [0.490530, -3.713466, 1.848513, -1.040319, -1.090818], + "convnext_small_cls": [-0.903914, 1.412183, 0.287465, 0.175296, -2.397940], + "convnext_small_patch": [-1.081114, 0.637362, 3.748765, 0.170179, 1.445153], + "convnext_base_cls": [0.155366, -0.378771, -0.735157, -2.818718, 0.015095], + "convnext_base_patch": [3.039118, 0.778155, -1.961322, -1.607147, -2.411941], + "convnext_large_cls": [-2.219094, -0.594451, -2.300294, -0.957415, -0.520473], + "convnext_large_patch": [-1.477349, -0.217038, -3.128137, 0.418962, 0.334949], + } + model_name = args.model_name + config = get_dinov3_config(model_name) + # print(config) + + model = DINOv3ConvNextModel(config).eval() + state_dict_path = hf_hub_download(repo_id=HUB_MODELS[model_name], filename=HUB_CHECKPOINTS[model_name]) + original_state_dict = torch.load(state_dict_path) + original_keys = list(original_state_dict.keys()) + new_keys = convert_old_keys_to_new_keys(original_keys) + + converted_state_dict = {} + for key in original_keys: + new_key = new_keys[key] + weight_tensor = original_state_dict[key] + if key == "norms.3.weight" or key == "norms.3.bias": + continue + converted_state_dict[new_key] = weight_tensor + model.load_state_dict(converted_state_dict, strict=True) + model = model.eval() + + transform = get_transform() + image_processor = get_image_processor() + image = prepare_img() + + # check 
preprocessing + original_pixel_values = transform(image).unsqueeze(0) # add batch dimension + inputs = image_processor(image, return_tensors="pt") + + torch.testing.assert_close(original_pixel_values, inputs["pixel_values"], atol=1e-6, rtol=1e-6) + print("Preprocessing looks ok!") + + with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float): + model_output = model(**inputs) + + last_layer_class_token = model_output.pooler_output + last_layer_patch_tokens = model_output.last_hidden_state[:, 1:] + + actual_outputs = {} + actual_outputs[f"{model_name}_cls"] = last_layer_class_token[0, :5].tolist() + actual_outputs[f"{model_name}_patch"] = last_layer_patch_tokens[0, 0, :5].tolist() + + print("Actual: ", [round(x, 6) for x in actual_outputs[f"{model_name}_cls"]]) + print("Expected:", expected_outputs[f"{model_name}_cls"]) + + torch.testing.assert_close( + torch.Tensor(actual_outputs[f"{model_name}_cls"]), + torch.Tensor(expected_outputs[f"{model_name}_cls"]), + atol=1e-3, + rtol=1e-3, + ) + print("Actual: ", [round(x, 6) for x in actual_outputs[f"{model_name}_patch"]]) + print("Expected:", expected_outputs[f"{model_name}_patch"]) + + torch.testing.assert_close( + torch.Tensor(actual_outputs[f"{model_name}_patch"]), + torch.Tensor(expected_outputs[f"{model_name}_patch"]), + atol=1e-3, + rtol=1e-3, + ) + print("Forward pass looks ok!") + + save_dir = os.path.join(args.save_dir, model_name) + os.makedirs(save_dir, exist_ok=True) + model.save_pretrained(save_dir) + image_processor.save_pretrained(save_dir) + print(f"Model saved to {save_dir}") + + if args.push_to_hub: + api = HfApi() + repo = HUB_MODELS[model_name] + api.upload_folder(folder_path=save_dir, repo_id=repo, repo_type="model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model-name", + default="convnext_tiny", + type=str, + choices=["convnext_tiny", "convnext_small", "convnext_base", "convnext_large"], + help="Name of the 
model you'd like to convert.", + ) + parser.add_argument( + "--save-dir", + default="converted_models", + type=str, + help="Directory to save the converted model.", + ) + parser.add_argument( + "--push-to-hub", + action="store_true", + help="Push the converted model to the Hugging Face Hub.", + ) + args = parser.parse_args() + convert_and_test_dinov3_checkpoint(args) diff --git a/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py b/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py new file mode 100644 index 0000000000..2318faf148 --- /dev/null +++ b/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py @@ -0,0 +1,261 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch ConvNext model.""" + +from typing import Optional + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithPoolingAndNoAttention, +) +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ...utils.generic import can_return_tuple +from .configuration_dinov3_convnext import DINOv3ConvNextConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->DINOv3ConvNext +class DINOv3ConvNextDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class DINOv3ConvNextLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). 
+ """ + + def __init__(self, *args, data_format="channels_last", **kwargs): + super().__init__(*args, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +class DINOv3ConvNextLayer(nn.Module): + """This corresponds to the `Block` class in the original implementation. + + There are two equivalent implementations: + 1) DwConv, LayerNorm (channels_first), Conv, GELU, Conv (all in (N, C, H, W) format) + 2) DwConv, Permute, LayerNorm (channels_last), Linear, GELU, Linear, Permute + + The authors used (2) as they find it slightly faster in PyTorch. + + Args: + config ([`DINOv3ConvNextConfig`]): + Model config. + channels (`int`): + Number of input (and output) channels. + drop_path (`float`): + Drop path rate. Default: 0.0. 
+ """ + + def __init__(self, config: DINOv3ConvNextConfig, channels: int, drop_path: float = 0.0): + super().__init__() + self.depthwise_conv = nn.Conv2d(channels, channels, kernel_size=7, padding=3, groups=channels) + self.layer_norm = DINOv3ConvNextLayerNorm(channels, eps=config.layer_norm_eps) + self.pointwise_conv1 = nn.Linear(channels, 4 * channels) # can be seen as a 1x1 conv + self.activation_fn = ACT2FN[config.hidden_act] + self.pointwise_conv2 = nn.Linear(4 * channels, channels) # can be seen as a 1x1 conv + self.gamma = nn.Parameter(torch.full((channels,), config.layer_scale_init_value), requires_grad=True) + self.drop_path = DINOv3ConvNextDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) + """ + residual = features + features = self.depthwise_conv(features) + features = features.permute(0, 2, 3, 1) # to channels last + features = self.layer_norm(features) + features = self.pointwise_conv1(features) + features = self.activation_fn(features) + features = self.pointwise_conv2(features) + features = features * self.gamma + features = features.permute(0, 3, 1, 2) # back to channels first + features = residual + self.drop_path(features) + return features + + +class DINOv3ConvNextStage(nn.Module): + """ """ + + def __init__(self, config: DINOv3ConvNextConfig, stage_idx: int): + super().__init__() + + in_channels = config.hidden_sizes[stage_idx - 1] if stage_idx > 0 else config.num_channels + out_channels = config.hidden_sizes[stage_idx] + + if stage_idx == 0: + self.downsample_layers = nn.ModuleList( + [ + nn.Conv2d(config.num_channels, out_channels, kernel_size=4, stride=4), + DINOv3ConvNextLayerNorm(out_channels, eps=config.layer_norm_eps, data_format="channels_first"), + ] + ) + else: + self.downsample_layers = nn.ModuleList( + [ + DINOv3ConvNextLayerNorm(in_channels, eps=config.layer_norm_eps, 
@auto_docstring
class DINOv3ConvNextModel(DINOv3ConvNextPreTrainedModel):
    def __init__(self, config: DINOv3ConvNextConfig):
        super().__init__(config)
        self.config = config
        # One stage per entry of the config; each stage downsamples and then applies its layers
        self.stages = nn.ModuleList([DINOv3ConvNextStage(config, stage_idx) for stage_idx in range(config.num_stages)])
        self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)  # final norm layer
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self, pixel_values: torch.FloatTensor, output_hidden_states: Optional[bool] = None
    ) -> BaseModelOutputWithPoolingAndNoAttention:
        hidden_states = pixel_values

        # Fall back to the config only when the caller did not decide. Using `or` here would
        # silently ignore an explicit `output_hidden_states=False` whenever the config enables it.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        all_hidden_states = [hidden_states] if output_hidden_states else []

        for stage in self.stages:
            hidden_states = stage(hidden_states)

            # store intermediate stage outputs
            if output_hidden_states:
                all_hidden_states.append(hidden_states)

        # make global representation, a.k.a [CLS] token
        pooled_output = self.pool(hidden_states)

        # (batch_size, channels, height, width) -> (batch_size, height * width, channels)
        pooled_output = pooled_output.flatten(2).transpose(1, 2)
        hidden_states = hidden_states.flatten(2).transpose(1, 2)

        # concat "cls" and "patch tokens" as (batch_size, 1 + height * width, channels)
        hidden_states = torch.cat([pooled_output, hidden_states], dim=1)
        hidden_states = self.layer_norm(hidden_states)

        # The pooled token sits at index 0 of the normalized sequence
        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=hidden_states,
            pooler_output=hidden_states[:, 0],
            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
        )
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinov3_vit import * + from .image_processing_dinov3_vit_fast import * + from .modeling_dinov3_vit import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py b/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py new file mode 100644 index 0000000000..78cbd200ce --- /dev/null +++ b/src/transformers/models/dinov3_vit/configuration_dinov3_vit.py @@ -0,0 +1,166 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class DINOv3ViTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DINOv3ViTModel`]. It is used to instantiate a
    DINOv3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the DINOv3
    [facebook/dinov3-vits16-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-vits16-pretrain-lvd1689m) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_size (`int`, *optional*, defaults to 384):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        rope_theta (`float`, *optional*, defaults to 100.0):
            The base period of the RoPE embeddings.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        query_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the query projection.
        key_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the key projection.
        value_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the value projection.
        proj_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the output projection.
        mlp_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the MLP layers.
        layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value to use for layer scale.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate per sample (when applied in the main path of residual layers).
        use_gated_mlp (`bool`, *optional*, defaults to `False`):
            Whether to use the SwiGLU feedforward neural network.
        num_register_tokens (`int`, *optional*, defaults to 0):
            The number of register tokens.
        pos_embed_shift (`float`, *optional*):
            Amount to randomly shift position embedding coordinates in [-shift, shift],
            applied only in training mode if not `None`.
        pos_embed_jitter (`float`, *optional*):
            Amount to randomly jitter position embedding coordinates in log-uniform value in [1/jitter, jitter],
            applied only in training mode if not `None`.
        pos_embed_rescale (`float`, *optional*, defaults to 2.0):
            Amount to randomly rescale position embedding coordinates in log-uniform value in [1/rescale, rescale],
            applied only in training mode if not `None`.

    Example:

    ```python
    >>> from transformers import DINOv3ViTConfig, DINOv3ViTModel

    >>> # Initializing a DINOv3 ViT-small style configuration
    >>> config = DINOv3ViTConfig()

    >>> # Initializing a model (with random weights) from the config
    >>> model = DINOv3ViTModel(config)

    >>> # Accessing the model config
    >>> config = model.config
    ```"""

    model_type = "dinov3_vit"

    def __init__(
        self,
        patch_size: int = 16,
        hidden_size: int = 384,
        intermediate_size: int = 1536,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        hidden_act: str = "gelu",
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-5,
        rope_theta: float = 100.0,
        image_size: int = 224,
        num_channels: int = 3,
        query_bias: bool = True,
        key_bias: bool = False,
        value_bias: bool = True,
        proj_bias: bool = True,
        mlp_bias: bool = True,
        layerscale_value: float = 1.0,
        drop_path_rate: float = 0.0,
        use_gated_mlp: bool = False,
        num_register_tokens: int = 0,
        # train augs
        pos_embed_shift: Optional[float] = None,
        pos_embed_jitter: Optional[float] = None,
        pos_embed_rescale: Optional[float] = 2.0,
        **kwargs,
    ):
        # Remaining keyword arguments are forwarded to the base configuration class
        super().__init__(**kwargs)

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.layerscale_value = layerscale_value
        self.drop_path_rate = drop_path_rate
        self.use_gated_mlp = use_gated_mlp
        self.rope_theta = rope_theta
        self.query_bias = query_bias
        self.key_bias = key_bias
        self.value_bias = value_bias
        self.proj_bias = proj_bias
        self.mlp_bias = mlp_bias
        self.num_register_tokens = num_register_tokens

        # train augs
        self.pos_embed_shift = pos_embed_shift
        self.pos_embed_jitter = pos_embed_jitter
        self.pos_embed_rescale = pos_embed_rescale
+ +URL: https://github.com/facebookresearch/dinov3/tree/main +""" + +import argparse +import os +import re +from typing import Optional + +import requests +import torch +from huggingface_hub import HfApi, hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import DINOv3ViTConfig, DINOv3ViTImageProcessorFast, DINOv3ViTModel + + +HUB_MODELS = { + "vits16_lvd1689m": "facebook/dinov3-vits16-pretrain-lvd1689m", + "vits16plus_lvd1689m": "facebook/dinov3-vits16plus-pretrain-lvd1689m", + "vitb16_lvd1689m": "facebook/dinov3-vitb16-pretrain-lvd1689m", + "vitl16_lvd1689m": "facebook/dinov3-vitl16-pretrain-lvd1689m", + "vitl16_sat493m": "facebook/dinov3-vitl16-pretrain-sat493m", + "vith16plus_lvd1689m": "facebook/dinov3-vith16plus-pretrain-lvd1689m", + "vit7b16_lvd1689m": "facebook/dinov3-vit7b16-pretrain-lvd1689m", + "vit7b16_sat493m": "facebook/dinov3-vit7b16-pretrain-sat493m", +} + +HUB_CHECKPOINTS = { + "vits16_lvd1689m": "dinov3_vits16_pretrain_lvd1689m-08c60483.pth", + "vits16plus_lvd1689m": "dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth", + "vitb16_lvd1689m": "dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth", + "vitl16_lvd1689m": "dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth", + "vitl16_sat493m": "dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth", + "vith16plus_lvd1689m": "dinov3_vith16plus_pretrain_lvd1689m-7c1da9a5.pth", + "vit7b16_lvd1689m": "dinov3_vit7b16_pretrain_lvd1689m-a955f4ea.pth", + "vit7b16_sat493m": "dinov3_vit7b16_pretrain_sat493m-a6675841.pth", +} + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"cls_token": r"embeddings.cls_token", + r"mask_token": r"embeddings.mask_token", + r"storage_tokens": r"embeddings.register_tokens", + r"patch_embed.proj": r"embeddings.patch_embeddings", + r"periods": r"inv_freq", + r"rope_embed": r"rope_embeddings", + r"blocks.(\d+).attn.proj": r"layer.\1.attention.o_proj", + r"blocks.(\d+).attn.": r"layer.\1.attention.", + r"blocks.(\d+).ls(\d+).gamma": 
def split_qkv(state_dict: dict):
    """Replace every fused `qkv` entry of `state_dict` with separate q/k/v projection entries.

    Each value whose key contains "qkv" is removed and cut into three chunks along dim 0; the chunks
    are stored under the same key with "qkv" replaced by "q_proj", "k_proj" and "v_proj".
    The dictionary is modified in place and also returned.
    """
    # Snapshot the matching names first: the dictionary is mutated inside the loop
    fused_names = [name for name in state_dict if "qkv" in name]
    for fused_name in fused_names:
        query, key, value = torch.chunk(state_dict.pop(fused_name), 3, dim=0)
        state_dict.update(
            {
                fused_name.replace("qkv", "q_proj"): query,
                fused_name.replace("qkv", "k_proj"): key,
                fused_name.replace("qkv", "v_proj"): value,
            }
        )
    return state_dict
def prepare_img():
    """Download the COCO validation image used as the reference input and return it as an RGB PIL image.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # A timeout keeps the conversion script from hanging forever on a stalled connection, and the
    # context manager releases the connection once the image has been decoded.
    with requests.get(url, stream=True, timeout=30) as response:
        # Fail with a clear HTTP error instead of an opaque image-decoding error
        response.raise_for_status()
        # `convert` forces the image data to be read while the stream is still open
        image = Image.open(response.raw).convert("RGB")
    return image
"vits16plus_lvd1689m_cls": [-0.471349, -1.365778, -0.317983, 0.377219, -0.769085], + "vits16plus_lvd1689m_patch": [0.144551, -0.388117, -0.393433, -0.157695, -0.600380], + "vitb16_lvd1689m_cls": [1.034643, -0.180609, -0.341018, -0.066376, -0.011383], + "vitb16_lvd1689m_patch": [-0.082523, -0.456272, -0.728029, -0.430680, -0.152880], + "vitl16_lvd1689m_cls": [0.484527, -0.582214, 0.480636, 0.592040, 0.945166], + "vitl16_lvd1689m_patch": [-0.211367, -0.490863, -0.257131, 0.101763, 0.154511], + "vith16plus_lvd1689m_cls": [-0.064575, -0.148866, -0.621524, 0.634878, 0.152695], + "vith16plus_lvd1689m_patch": [-0.093817, 0.287407, -0.050036, 0.428043, 0.094561], + "vit7b16_lvd1689m_cls": [0.275439, -0.261353, 0.067772, 0.049936, -0.158747], + "vit7b16_lvd1689m_patch": [0.044442, -0.052542, 0.070777, -0.065111, -0.026546], + "vitl16_sat493m_cls": [-0.33235, 0.34052, -0.22087, 0.21434, 0.09003], + "vitl16_sat493m_patch": [0.18488, 0.30309, -0.20689, 0.12848, 0.06207], + "vit7b16_sat493m_cls": [-0.19779, 0.11819, -0.00581, -0.21055, -0.03971], + "vit7b16_sat493m_patch": [-0.12423, 0.07879, -0.10057, 0.02835, -0.11727], + } + + model_name = args.model_name + config = get_dinov3_config(model_name) + + model = DINOv3ViTModel(config).eval() + state_dict_path = hf_hub_download(repo_id=HUB_MODELS[model_name], filename=HUB_CHECKPOINTS[model_name]) + original_state_dict = torch.load(state_dict_path, mmap=True) + + original_state_dict = split_qkv(original_state_dict) + original_keys = list(original_state_dict.keys()) + new_keys = convert_old_keys_to_new_keys(original_keys) + + converted_state_dict = {} + for key in original_keys: + new_key = new_keys[key] + weight_tensor = original_state_dict[key] + + if "bias_mask" in key or "attn.k_proj.bias" in key or "local_cls_norm" in key: + continue + if "embeddings.mask_token" in new_key: + weight_tensor = weight_tensor.unsqueeze(1) + if "inv_freq" in new_key: + continue + + converted_state_dict[new_key] = weight_tensor + + 
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion + verification.
    parser = argparse.ArgumentParser()
    # All options below have defaults, so the script also runs without any argument
    parser.add_argument(
        "--model-name",
        default="vith16plus_lvd1689m",
        type=str,
        choices=[
            "vits16_lvd1689m",
            "vits16plus_lvd1689m",
            "vitb16_lvd1689m",
            "vitl16_lvd1689m",
            "vitl16_sat493m",
            "vith16plus_lvd1689m",
            "vit7b16_lvd1689m",
            "vit7b16_sat493m",
        ],
        help="Name of the model you'd like to convert.",
    )
    parser.add_argument(
        "--save-dir",
        default="converted_models",
        type=str,
        help="Directory to save the converted model.",
    )
    parser.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Push the converted model to the Hugging Face Hub.",
    )
    args = parser.parse_args()
    convert_and_test_dinov3_checkpoint(args)
@auto_docstring
@requires(backends=("torchvision", "torch"))
class DINOv3ViTImageProcessorFast(BaseImageProcessorFast):
    # Class-level preprocessing defaults: bilinear resize to 224x224, rescale, ImageNet normalization
    resample = PILImageResampling.BILINEAR
    image_mean = IMAGENET_DEFAULT_MEAN
    image_std = IMAGENET_DEFAULT_STD
    size = {"height": 224, "width": 224}
    do_resize = True
    do_rescale = True
    do_normalize = True

    # Overridden for DINOv3 to preserve order of transforms
    # rescale -> resize -> normalize
    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"],
        do_center_crop: bool,
        crop_size: SizeDict,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
    ) -> BatchFeature:
        """
        Preprocess `images` in two passes and return them under the `"pixel_values"` key.

        The first pass rescales and then resizes (with antialiasing); the second pass center-crops and
        then normalizes. Each step only runs when its `do_*` flag is set. The images are stacked into a
        single tensor only when `return_tensors` is set.
        """
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            # Rescaling happens before resizing here (see the comment above this method)
            if do_rescale:
                stacked_images = self.rescale(stacked_images, rescale_factor)
            if do_resize:
                stacked_images = self.resize(
                    image=stacked_images, size=size, interpolation=interpolation, antialias=True
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_center_crop:
                stacked_images = self.center_crop(stacked_images, crop_size)
            if do_normalize:
                stacked_images = self.normalize(stacked_images, image_mean, image_std)
            processed_images_grouped[shape] = stacked_images

        # Restore the caller's image order before building the output
        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
class DINOv3ViTEmbeddings(nn.Module):
    """
    Construct the CLS token, mask token, register tokens and patch embeddings.

    NOTE(review): this file is generated from `modular_dinov3_vit.py`; mirror any change there.
    """

    def __init__(self, config: DINOv3ViTConfig):
        super().__init__()
        self.config = config
        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        # Allocated uninitialized here; presumably filled by the model's weight init — TODO confirm
        self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size))
        # Non-overlapping patches: kernel size and stride are both `patch_size`
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Embed `pixel_values` into a token sequence ordered as [CLS, register tokens, patch tokens].

        Args:
            pixel_values: Tensor of shape (batch_size, num_channels, height, width).
            bool_masked_pos: Optional boolean mask; patch embeddings at `True` positions are replaced by
                the mask token. Presumably of shape (batch_size, num_patches) — TODO confirm with callers.

        Returns:
            Tensor with the CLS token, register tokens and patch embeddings concatenated along dim 1.
        """
        batch_size = pixel_values.shape[0]
        # Cast the input to the dtype of the projection weights before the convolution
        target_dtype = self.patch_embeddings.weight.dtype

        # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size)
        patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
        patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)

        if bool_masked_pos is not None:
            mask_token = self.mask_token.to(patch_embeddings.dtype)
            patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings)

        # Add CLS and register tokens
        cls_token = self.cls_token.expand(batch_size, -1, -1)
        register_tokens = self.register_tokens.expand(batch_size, -1, -1)
        embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1)

        return embeddings
+ + Returns: + torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x) + coordinates of a patch center, normalized to [-1, +1]. + """ + coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device) + coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device) + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + # (height, width, 2) -> (height * width, 2) + coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1) + coords = coords.flatten(0, 1) + # Shift range [0, 1] to [-1, +1] + coords = 2.0 * coords - 1.0 + return coords + + +def augment_patches_center_coordinates( + coords: torch.Tensor, + shift: Optional[float] = None, + jitter: Optional[float] = None, + rescale: Optional[float] = None, +) -> torch.Tensor: + # Shift coords by adding a uniform value in [-shift, shift] + if shift is not None: + shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + shift_hw = shift_hw.uniform_(-shift, shift) + coords = coords + shift_hw + + # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter] + if jitter is not None: + jitter_range = np.log(jitter) + jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp() + coords = coords * jitter_hw + + # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale] + if rescale is not None: + rescale_range = np.log(rescale) + rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype) + rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp() + coords = coords * rescale_hw + + return coords + + +class DINOv3ViTRopePositionEmbedding(nn.Module): + inv_freq: torch.Tensor + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.config = config + self.base = config.rope_theta + self.head_dim = config.hidden_size // 
config.num_attention_heads + self.num_patches_h = config.image_size // config.patch_size + self.num_patches_w = config.image_size // config.patch_size + + inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32) # (head_dim / 4,) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + _, _, height, width = pixel_values.shape + num_patches_h = height // self.config.patch_size + num_patches_w = width // self.config.patch_size + + device = pixel_values.device + device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu" + + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + # Although we could precompute static patch_coords from image_size and patch_size in the config, + # the model was trained with random_scale, so it can process images of varying sizes. + # Therefore, it's better to compute patch_coords dynamically (with lru_cache). 
+ patch_coords = get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype=torch.float32, device=device + ) + if self.training: + patch_coords = augment_patches_center_coordinates( + patch_coords, + shift=self.config.pos_embed_shift, + jitter=self.config.pos_embed_jitter, + rescale=self.config.pos_embed_rescale, + ) + + # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim) + angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :] + angles = angles.flatten(1, 2) + angles = angles.tile(2) + + cos = torch.cos(angles) + sin = torch.sin(angles) + + dtype = pixel_values.dtype + return cos.to(dtype=dtype), sin.to(dtype=dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_pos_emb( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs +) -> tuple[torch.Tensor, torch.Tensor]: + """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens, + ignoring the prefix tokens (cls token and register tokens). + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + + num_tokens = q.shape[-2] + num_patches = sin.shape[-2] + num_prefix_tokens = num_tokens - num_patches # cls token + register tokens + + q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2) + k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2) + + # apply rope only to patch tokens + q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin) + k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin) + + q = torch.cat((q_prefix_tokens, q_patches), dim=-2) + k = torch.cat((k_prefix_tokens, k_patches), dim=-2) + + return q, k + + +class DINOv3ViTAttention(nn.Module): + """ + Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS. 
+ """ + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.is_causal = False + + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.dropout = config.attention_dropout + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias) + + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, patches, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + 
scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(batch_size, patches, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +class DINOv3ViTLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class DINOv3ViTDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class DINOv3ViTMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.up_proj(x))) + + +class DINOv3ViTGatedMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + 
+class DINOv3ViTLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DINOv3ViTAttention(config) + self.layer_scale1 = DINOv3ViTLayerScale(config) + self.drop_path = DINOv3ViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_gated_mlp: + self.mlp = DINOv3ViTGatedMLP(config) + else: + self.mlp = DINOv3ViTMLP(config) + self.layer_scale2 = DINOv3ViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + # Attention with residual connection + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states, _ = self.attention( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + ) + hidden_states = self.layer_scale1(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + # MLP with residual connection + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.layer_scale2(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + return hidden_states + + +@auto_docstring +class DINOv3ViTPreTrainedModel(PreTrainedModel): + config: DINOv3ViTConfig + base_model_prefix = "dinov3_vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DINOv3ViTLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": DINOv3ViTLayer, + 
"attentions": DINOv3ViTAttention, + } + + def _init_weights(self, module) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, DINOv3ViTEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + if module.config.num_register_tokens > 0: + module.register_tokens.data = nn.init.trunc_normal_( + module.register_tokens.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.register_tokens.dtype) + module.mask_token.data.zero_() + elif isinstance(module, DINOv3ViTLayerScale): + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class DINOv3ViTModel(DINOv3ViTPreTrainedModel): + def __init__(self, config: DINOv3ViTConfig): + super().__init__(config) + self.config = config + self.embeddings = DINOv3ViTEmbeddings(config) + self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config) + self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: 
Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. + """ + + pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype) + hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + position_embeddings = self.rope_embeddings(pixel_values) + + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module( + hidden_states, + attention_mask=layer_head_mask, + position_embeddings=position_embeddings, + ) + + sequence_output = self.norm(hidden_states) + pooled_output = sequence_output[:, 0, :] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"] diff --git a/src/transformers/models/dinov3_vit/modular_dinov3_vit.py b/src/transformers/models/dinov3_vit/modular_dinov3_vit.py new file mode 100644 index 0000000000..f4a1e69bea --- /dev/null +++ b/src/transformers/models/dinov3_vit/modular_dinov3_vit.py @@ -0,0 +1,429 @@ +# coding=utf-8 +# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch DINOv3 model.""" + +import math +from typing import Callable, Optional + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.models.arcee.modeling_arcee import ArceeMLP +from transformers.models.dinov2.modeling_dinov2 import ( + Dinov2DropPath, + Dinov2LayerScale, + Dinov2PreTrainedModel, + eager_attention_forward, +) +from transformers.models.llama.modeling_llama import LlamaMLP +from transformers.models.pixtral.modeling_pixtral import PixtralAttention, rotate_half + +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import check_model_inputs +from .configuration_dinov3_vit import DINOv3ViTConfig + + +logger = logging.get_logger(__name__) + + +class DINOv3ViTEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, position and patch embeddings. 
+ """ + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + self.config = config + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embeddings.weight.dtype + + # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size) + patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2) + + if bool_masked_pos is not None: + mask_token = self.mask_token.to(patch_embeddings.dtype) + patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings) + + # Add CLS and register tokens + cls_token = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1) + + return embeddings + + +@compile_compatible_method_lru_cache(maxsize=32) +def get_patches_center_coordinates( + num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device +) -> torch.Tensor: + """ + Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1]. + The center of each patch is exactly halfway between its top-left and bottom-right corners. + + Args: + num_patches_h (int): Number of patches along the vertical (height) axis. + num_patches_w (int): Number of patches along the horizontal (width) axis. + dtype (torch.dtype): The desired data type of the returned tensor. 
+ + Returns: + torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x) + coordinates of a patch center, normalized to [-1, +1]. + """ + coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device) + coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device) + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + # (height, width, 2) -> (height * width, 2) + coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1) + coords = coords.flatten(0, 1) + # Shift range [0, 1] to [-1, +1] + coords = 2.0 * coords - 1.0 + return coords + + +def augment_patches_center_coordinates( + coords: torch.Tensor, + shift: Optional[float] = None, + jitter: Optional[float] = None, + rescale: Optional[float] = None, +) -> torch.Tensor: + # Shift coords by adding a uniform value in [-shift, shift] + if shift is not None: + shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + shift_hw = shift_hw.uniform_(-shift, shift) + coords = coords + shift_hw + + # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter] + if jitter is not None: + jitter_range = np.log(jitter) + jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp() + coords = coords * jitter_hw + + # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale] + if rescale is not None: + rescale_range = np.log(rescale) + rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype) + rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp() + coords = coords * rescale_hw + + return coords + + +class DINOv3ViTRopePositionEmbedding(nn.Module): + inv_freq: torch.Tensor + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.config = config + self.base = config.rope_theta + self.head_dim = config.hidden_size // 
config.num_attention_heads + self.num_patches_h = config.image_size // config.patch_size + self.num_patches_w = config.image_size // config.patch_size + + inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32) # (head_dim / 4,) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + _, _, height, width = pixel_values.shape + num_patches_h = height // self.config.patch_size + num_patches_w = width // self.config.patch_size + + device = pixel_values.device + device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu" + + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + # Although we could precompute static patch_coords from image_size and patch_size in the config, + # the model was trained with random_scale, so it can process images of varying sizes. + # Therefore, it's better to compute patch_coords dynamically (with lru_cache). 
+ patch_coords = get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype=torch.float32, device=device + ) + if self.training: + patch_coords = augment_patches_center_coordinates( + patch_coords, + shift=self.config.pos_embed_shift, + jitter=self.config.pos_embed_jitter, + rescale=self.config.pos_embed_rescale, + ) + + # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim) + angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :] + angles = angles.flatten(1, 2) + angles = angles.tile(2) + + cos = torch.cos(angles) + sin = torch.sin(angles) + + dtype = pixel_values.dtype + return cos.to(dtype=dtype), sin.to(dtype=dtype) + + +def apply_rotary_pos_emb( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs +) -> tuple[torch.Tensor, torch.Tensor]: + """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens, + ignoring the prefix tokens (cls token and register tokens). + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + + num_tokens = q.shape[-2] + num_patches = sin.shape[-2] + num_prefix_tokens = num_tokens - num_patches # cls token + register tokens + + q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2) + k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2) + + # apply rope only to patch tokens + q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin) + k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin) + + q = torch.cat((q_prefix_tokens, q_patches), dim=-2) + k = torch.cat((k_prefix_tokens, k_patches), dim=-2) + + return q, k + + +class DINOv3ViTAttention(PixtralAttention): + def __init__(self, config: DINOv3ViTConfig): + super().__init__(config) + + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, patches, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + 
attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(batch_size, patches, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +class DINOv3ViTLayerScale(Dinov2LayerScale): + pass + + +class DINOv3ViTDropPath(Dinov2DropPath): + pass + + +class DINOv3ViTMLP(ArceeMLP): + pass + + +class DINOv3ViTGatedMLP(LlamaMLP): + pass + + +class DINOv3ViTLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DINOv3ViTAttention(config) + self.layer_scale1 = DINOv3ViTLayerScale(config) + self.drop_path = DINOv3ViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_gated_mlp: + self.mlp = DINOv3ViTGatedMLP(config) + else: + self.mlp = DINOv3ViTMLP(config) + self.layer_scale2 = DINOv3ViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + # Attention with residual connection + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states, _ = self.attention( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + ) + hidden_states = self.layer_scale1(hidden_states) + hidden_states = 
self.drop_path(hidden_states) + residual + + # MLP with residual connection + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.layer_scale2(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + return hidden_states + + +@auto_docstring +class DINOv3ViTPreTrainedModel(Dinov2PreTrainedModel): + _can_record_outputs = { + "hidden_states": DINOv3ViTLayer, + "attentions": DINOv3ViTAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, DINOv3ViTEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + if module.config.num_register_tokens > 0: + module.register_tokens.data = nn.init.trunc_normal_( + module.register_tokens.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.register_tokens.dtype) + module.mask_token.data.zero_() + elif isinstance(module, DINOv3ViTLayerScale): + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class DINOv3ViTModel(DINOv3ViTPreTrainedModel): + def __init__(self, config: DINOv3ViTConfig): + super().__init__(config) + self.config = config + self.embeddings = DINOv3ViTEmbeddings(config) + self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config) + self.layer = nn.ModuleList([DINOv3ViTLayer(config) for 
_ in range(config.num_hidden_layers)]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. + """ + + pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype) + hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + position_embeddings = self.rope_embeddings(pixel_values) + + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module( + hidden_states, + attention_mask=layer_head_mask, + position_embeddings=position_embeddings, + ) + + sequence_output = self.norm(hidden_states) + pooled_output = sequence_output[:, 0, :] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"] diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index c3cc4579e5..60548bbc7e 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -339,6 +339,7 @@ def isin_mps_friendly(elements: torch.Tensor, test_elements: torch.Tensor | int) return torch.isin(elements, test_elements) +@wraps(lru_cache) def compile_compatible_method_lru_cache(*lru_args, **lru_kwargs): """ LRU cache decorator from standard functools library, 
but with a workaround to disable @@ -346,19 +347,14 @@ def compile_compatible_method_lru_cache(*lru_args, **lru_kwargs): """ def decorator(func): + func_with_cache = lru_cache(*lru_args, **lru_kwargs)(func) + @wraps(func) - def wrapper(self, *args, **kwargs): - if not is_torchdynamo_compiling(): - # Cache the function only if the model is not being compiled - # check if the function is already cached, otherwise create it - if not hasattr(self, f"_cached_{func.__name__}"): - self.__setattr__( - f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self)) - ) - return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs) + def wrapper(*args, **kwargs): + if is_torchdynamo_compiling(): + return func(*args, **kwargs) else: - # Otherwise, just call the original function - return func(self, *args, **kwargs) + return func_with_cache(*args, **kwargs) return wrapper diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 8a48cf9cbb..13cdd6bb3f 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -129,6 +129,8 @@ _REGULAR_SUPPORTED_MODEL_NAMES_AND_TASKS = [ "deberta", "deberta-v2", "dinov2", + "dinov3_convnext", + "dinov3_vit", "distilbert", "donut-swin", "electra", diff --git a/tests/models/dinov3_convnext/__init__.py b/tests/models/dinov3_convnext/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py b/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py new file mode 100644 index 0000000000..a34aacbd8e --- /dev/null +++ b/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py @@ -0,0 +1,242 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DINOv3ConvNext model.""" + +import unittest + +from transformers import DINOv3ConvNextConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DINOv3ConvNextModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class DINOv3ConvNextModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=False, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + num_labels=10, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_labels = num_labels + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + +
labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return DINOv3ConvNextConfig( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + hidden_act=self.hidden_act, + is_decoder=False, + initializer_range=self.initializer_range, + num_labels=self.num_labels, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DINOv3ConvNextModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected last hidden states: (batch_size, 1 + (H // 32) * (W // 32), hidden_sizes[-1]) + self.parent.assertEqual( + result.last_hidden_state.shape, + ( + self.batch_size, + 1 + self.image_size // 32 * self.image_size // 32, + self.hidden_sizes[-1], + ), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DINOv3ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, + attention_mask and seq_length.
+ """ + + all_model_classes = (DINOv3ConvNextModel,) if is_torch_available() else () + pipeline_model_mapping = {"image-feature-extraction": DINOv3ConvNextModel} if is_torch_available() else {} + + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DINOv3ConvNextModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=DINOv3ConvNextConfig, + has_text_modality=False, + hidden_size=37, + common_properties=["num_channels", "hidden_sizes"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DINOv3ConvNext does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="DINOv3ConvNext does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="DINOv3ConvNext does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + self.assertEqual(len(hidden_states), 5) + + # DINOv3ConvNext's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[1].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in 
self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/dinov3-convnext-tiny-pretrain-lvd1689m" + model = DINOv3ConvNextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip(reason="DINOv3ConvNext does not retain grads for first hidden state (original pixel_values)") + def test_retain_grad_hidden_states_attentions(self): + pass + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class DINOv3ConvNextModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + AutoImageProcessor.from_pretrained("facebook/dinov3-convnext-tiny-pretrain-lvd1689m") + if is_vision_available() + else None + ) + + @slow + def test_inference_no_head(self): + model = DINOv3ConvNextModel.from_pretrained("facebook/dinov3-convnext-tiny-pretrain-lvd1689m").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the last hidden states + _, _, height, width = inputs["pixel_values"].shape + expected_seq_length = (height * width) // 4 ** (model.config.num_stages + 1) + 1 # +1 for the "CLS" token + expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_sizes[-1])) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + last_layer_cls_token = outputs.pooler_output + expected_slice = 
torch.tensor([-6.3721, 1.3008, 2.0743, -0.0800, 0.6072], device=torch_device) + torch.testing.assert_close(last_layer_cls_token[0, :5], expected_slice, rtol=1e-4, atol=1e-4) + + last_layer_patch_tokens = outputs.last_hidden_state[:, 1:] + expected_slice = torch.tensor([0.4905, -3.7135, 1.8485, -1.0403, -1.0908], device=torch_device) + torch.testing.assert_close(last_layer_patch_tokens[0, 0, :5], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/tests/models/dinov3_vit/__init__.py b/tests/models/dinov3_vit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/dinov3_vit/test_image_processing_dinov3_vit_fast.py b/tests/models/dinov3_vit/test_image_processing_dinov3_vit_fast.py new file mode 100644 index 0000000000..552d522095 --- /dev/null +++ b/tests/models/dinov3_vit/test_image_processing_dinov3_vit_fast.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torchvision_available(): + from transformers import DINOv3ViTImageProcessorFast + + +class DINOv3ViTImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + 
min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DINOv3ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = None + fast_image_processing_class = DINOv3ViTImageProcessorFast if is_torchvision_available() else None + test_slow_image_processor = False + + def setUp(self): + super().setUp() + self.image_processor_tester = DINOv3ViTImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict( + self.image_processor_dict, size={"height": 42, "width": 42} + ) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) diff --git a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py new file mode 100644 index 
0000000000..00e078739d --- /dev/null +++ b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py @@ -0,0 +1,278 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DINOv3 model.""" + +import unittest + +from transformers import DINOv3ViTConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DINOv3ViTModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class DINOv3ViTModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=False, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_register_tokens=2, + mask_ratio=0.5, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = 
image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_register_tokens = num_register_tokens + self.scope = scope + + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + self.num_register_tokens + self.mask_ratio = mask_ratio + self.num_masks = int(mask_ratio * self.seq_length) + self.mask_length = num_patches + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DINOv3ViTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + num_register_tokens=self.num_register_tokens, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DINOv3ViTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + 
result.last_hidden_state.shape, + (self.batch_size, self.seq_length, self.hidden_size), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class Dinov3ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Dinov3 does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (DINOv3ViTModel,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "image-feature-extraction": DINOv3ViTModel, + } + if is_torch_available() + else {} + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DINOv3ViTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DINOv3ViTConfig, has_text_modality=False, hidden_size=37) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad and "register_tokens" not in name: + # See PR #38607 (to avoid flakiness) + data = torch.flatten(param.data) + n_elements = torch.numel(data) + # skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in + # https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332 + n_elements_to_skip_on_each_side = int(n_elements * 0.025) + data_to_check = torch.sort(data).values + if n_elements_to_skip_on_each_side > 0: + data_to_check = 
data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side] + self.assertIn( + ((data_to_check.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Dinov3 does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Dinov3 does not support feedforward chunking yet") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/dinov3-vits16-pretrain-lvd1689m" + model = DINOv3ViTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an 
image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class DINOv3ViTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + AutoImageProcessor.from_pretrained("facebook/dinov3-vits16-pretrain-lvd1689m") + if is_vision_available() + else None + ) + + @slow + def test_inference_no_head(self): + model = DINOv3ViTModel.from_pretrained("facebook/dinov3-vits16-pretrain-lvd1689m").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the last hidden states + # in DINOv3 with Registers, the seq length equals the number of patches + 1 + num_register_tokens (we add 1 for the [CLS] token) + _, _, height, width = inputs["pixel_values"].shape + num_patches = (height // model.config.patch_size) * (width // model.config.patch_size) + expected_seq_length = num_patches + 1 + model.config.num_register_tokens + expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_size)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + last_layer_cls_token = outputs.pooler_output + expected_slice = torch.tensor([0.4637, -0.4160, 0.4086, -0.1265, -0.2865], device=torch_device) + torch.testing.assert_close(last_layer_cls_token[0, :5], expected_slice, rtol=1e-4, atol=1e-4) + + last_layer_patch_tokens = outputs.last_hidden_state[:, model.config.num_register_tokens + 1 :] + expected_slice = torch.tensor([-0.0386, -0.2509, -0.0161, -0.4556, 0.5716], device=torch_device) + torch.testing.assert_close(last_layer_patch_tokens[0, 0, :5], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 8878491e4e..3904b850b6 100644 --- a/utils/check_docstrings.py +++ 
b/utils/check_docstrings.py @@ -169,6 +169,8 @@ OBJECTS_TO_IGNORE = [ "DetrConfig", "DetrImageProcessor", "DinatModel", + "DINOv3ConvNextConfig", + "DINOv3ViTConfig", "DistilBertConfig", "DistilBertTokenizerFast", "DocumentQuestionAnsweringPipeline", diff --git a/utils/check_repo.py b/utils/check_repo.py index 8d82b29360..cf4fd6c762 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -411,6 +411,8 @@ MODEL_TYPE_TO_DOC_MAPPING = OrderedDict( ("data2vec-audio", "data2vec"), ("data2vec-vision", "data2vec"), ("donut-swin", "donut"), + ("dinov3_convnext", "dinov3"), + ("dinov3_vit", "dinov3"), ] )