From f6a63889728e5206f6fd3fabdb8a820257304f98 Mon Sep 17 00:00:00 2001 From: amyeroberts Date: Mon, 16 May 2022 22:19:53 +0100 Subject: [PATCH] Add Tensorflow Swin model (#16988) Co-authored-by: Matt Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/swin.mdx | 47 +- src/transformers/__init__.py | 18 + src/transformers/models/auto/__init__.py | 2 + .../models/auto/modeling_tf_auto.py | 20 + src/transformers/models/swin/__init__.py | 28 +- src/transformers/models/swin/modeling_swin.py | 1 - .../models/swin/modeling_tf_swin.py | 1408 +++++++++++++++++ src/transformers/utils/dummy_tf_objects.py | 34 + tests/models/swin/test_modeling_swin.py | 100 +- tests/models/swin/test_modeling_tf_swin.py | 376 +++++ 11 files changed, 1977 insertions(+), 59 deletions(-) create mode 100644 src/transformers/models/swin/modeling_tf_swin.py create mode 100644 tests/models/swin/test_modeling_tf_swin.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index e0b6674f69..4b9797b867 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -256,7 +256,7 @@ Flax), PyTorch, and/or TensorFlow. | Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | | Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | | SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin | ❌ | ❌ | ✅ | ❌ | ❌ | +| Swin | ❌ | ❌ | ✅ | ✅ | ❌ | | T5 | ✅ | ✅ | ✅ | ✅ | ✅ | | TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/swin.mdx b/docs/source/en/model_doc/swin.mdx index 1f247284af..e622acea26 100644 --- a/docs/source/en/model_doc/swin.mdx +++ b/docs/source/en/model_doc/swin.mdx @@ -14,22 +14,22 @@ specific language governing permissions and limitations under the License. ## Overview -The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) -by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. The abstract from the paper is the following: -*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone -for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, -such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. -To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted -\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping -local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at -various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it -compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense -prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation -(53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and -+2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. +*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone +for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, +such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. +To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted +\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping +local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at +various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it +compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense +prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation +(53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and ++2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.* Tips: @@ -38,11 +38,11 @@ Tips: - Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. +alt="drawing" width="600"/> Swin Transformer architecture. Taken from the original paper. -This model was contributed by [novice03](https://huggingface.co/novice03>). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). +This model was contributed by [novice03](https://huggingface.co/novice03>). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). ## SwinConfig @@ -63,4 +63,19 @@ This model was contributed by [novice03](https://huggingface.co/novice03>). The ## SwinForImageClassification [[autodoc]] transformers.SwinForImageClassification - - forward \ No newline at end of file + - forward + +## TFSwinModel + +[[autodoc]] TFSwinModel + - call + +## TFSwinForMaskedImageModeling + +[[autodoc]] TFSwinForMaskedImageModeling + - call + +## TFSwinForImageClassification + +[[autodoc]] transformers.TFSwinForImageClassification + - call diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7b8018a71e..d72c5aa53f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1841,6 +1841,7 @@ else: [ "TF_MODEL_FOR_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING", "TF_MODEL_FOR_MASKED_LM_MAPPING", "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", @@ -2208,6 +2209,15 @@ else: "TFSpeech2TextPreTrainedModel", ] ) + _import_structure["models.swin"].extend( + [ + "TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFSwinForImageClassification", + "TFSwinForMaskedImageModeling", + "TFSwinModel", + "TFSwinPreTrainedModel", + ] + ) _import_structure["models.t5"].extend( [ "TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4071,6 +4081,7 @@ if TYPE_CHECKING: from .models.auto import ( TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, @@ -4363,6 +4374,13 @@ if TYPE_CHECKING: TFSpeech2TextModel, TFSpeech2TextPreTrainedModel, ) + from .models.swin import ( + TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, + TFSwinForImageClassification, + TFSwinForMaskedImageModeling, + TFSwinModel, + TFSwinPreTrainedModel, + ) from .models.t5 import ( TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, TFT5EncoderModel, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index a3d0f1548d..c3efdde105 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -101,6 +101,7 @@ else: _import_structure["modeling_tf_auto"] = [ "TF_MODEL_FOR_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING", "TF_MODEL_FOR_MASKED_LM_MAPPING", "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", @@ -238,6 +239,7 @@ if TYPE_CHECKING: from .modeling_tf_auto import ( TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 3fe804cb4c..9346252f8e 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -65,6 +65,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict( ("roberta", "TFRobertaModel"), ("roformer", "TFRoFormerModel"), ("speech_to_text", "TFSpeech2TextModel"), + ("swin", "TFSwinModel"), ("t5", "TFT5Model"), ("tapas", "TFTapasModel"), ("transfo-xl", "TFTransfoXLModel"), @@ -159,11 +160,18 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ] ) +TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( + [ + ("swin", "TFSwinForMaskedImageModeling"), + ] +) + TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Image-classsification ("convnext", "TFConvNextForImageClassification"), ("data2vec-vision", "TFData2VecVisionForImageClassification"), + ("swin", "TFSwinForImageClassification"), ("vit", "TFViTForImageClassification"), ] ) @@ -349,6 +357,9 @@ TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES) TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES) TF_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES +) TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES ) @@ -409,6 +420,15 @@ class TFAutoModelForCausalLM(_BaseAutoModelClass): TFAutoModelForCausalLM = auto_class_update(TFAutoModelForCausalLM, head_doc="causal language modeling") +class TFAutoModelForMaskedImageModeling(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING + + +TFAutoModelForMaskedImageModeling = auto_class_update( + TFAutoModelForMaskedImageModeling, head_doc="masked image modeling" +) + + class TFAutoModelForImageClassification(_BaseAutoModelClass): _model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING diff --git a/src/transformers/models/swin/__init__.py b/src/transformers/models/swin/__init__.py index 95670d3676..65b19a36ae 100644 --- a/src/transformers/models/swin/__init__.py +++ b/src/transformers/models/swin/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available _import_structure = { @@ -40,6 +40,19 @@ else: "SwinPreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_swin"] = [ + "TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFSwinForImageClassification", + "TFSwinForMaskedImageModeling", + "TFSwinModel", + "TFSwinPreTrainedModel", + ] if TYPE_CHECKING: from .configuration_swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig @@ -58,6 +71,19 @@ if TYPE_CHECKING: SwinPreTrainedModel, ) + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_swin import ( + TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, + TFSwinForImageClassification, + TFSwinForMaskedImageModeling, + TFSwinModel, + TFSwinPreTrainedModel, + ) else: import sys diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 2b415dbe26..be46b8dc2f 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -1068,7 +1068,6 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel): ) sequence_output = outputs[0] - # Reshape to (batch_size, num_channels, height, width) sequence_output = sequence_output.transpose(1, 2) batch_size, num_channels, sequence_length = sequence_output.shape diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py new file mode 100644 index 0000000000..68029074d3 --- /dev/null +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -0,0 +1,1408 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 Swin Transformer model.""" + + +import collections.abc +import math +from dataclasses import dataclass +from functools import partial +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import ACT2FN +from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, unpack_inputs +from ...tf_utils import shape_list +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_swin import SwinConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "SwinConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/swin-tiny-patch4-window7-224", + # See all Swin models at https://huggingface.co/models?filter=swin +] + +# to_2tuple, drop_path, TFSwinPatchEmbeddings, TFSwinPatchMerging and TFSwinDropPath are tensorflow +# implementations of PyTorch functionalities in the timm library. + + +@dataclass +class TFSwinEncoderOutput(ModelOutput): + """ + Swin encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + reshaped_hidden_states: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSwinModelOutput(ModelOutput): + """ + Swin model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`): + Average pooling of the last layer hidden-state. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: tf.Tensor = None + pooler_output: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + reshaped_hidden_states: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSwinMaskedImageModelingOutput(ModelOutput): + """ + Swin masked image model outputs. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided): + Masked image modeling (MLM) loss. + logits (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Reconstructed pixel values. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + reshaped_hidden_states: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSwinImageClassifierOutput(ModelOutput): + """ + Swin outputs for image classification. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape + `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + reshaped_hidden_states: Optional[Tuple[tf.Tensor]] = None + + +# Copied from transformers.models.vit.modeling_tf_vit.to_2tuple +def to_2tuple(x) -> Tuple[Any, Any]: + if isinstance(x, collections.abc.Iterable): + return x + return (x, x) + + +def window_partition(input_feature: tf.Tensor, window_size: int) -> tf.Tensor: + """ + Partitions the given input into windows. + """ + batch_size, height, width, num_channels = input_feature.shape + input_feature = tf.reshape( + input_feature, + (batch_size, height // window_size, window_size, width // window_size, window_size, num_channels), + ) + windows = tf.transpose(input_feature, (0, 1, 3, 2, 4, 5)) + windows = tf.reshape(windows, (-1, window_size, window_size, num_channels)) + return windows + + +def window_reverse(windows: tf.Tensor, window_size: int, height: int, width: int) -> tf.Tensor: + """ + Merges windows to produce higher resolution features. + """ + batch_size = int(windows.shape[0] / (height * width / window_size / window_size)) + windows = tf.reshape( + windows, (batch_size, height // window_size, width // window_size, window_size, window_size, -1) + ) + windows = tf.transpose(windows, (0, 1, 3, 2, 4, 5)) + windows = tf.reshape(windows, (batch_size, height, width, -1)) + return windows + + +def drop_path( + input: tf.Tensor, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True +) -> tf.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = tf.random.uniform(shape) + random_tensor = tf.where(random_tensor <= keep_prob, 1.0, 0.0) + if keep_prob > 0.0 and scale_by_keep: + random_tensor /= keep_prob + return input * random_tensor + + +class TFSwinEmbeddings(tf.keras.layers.Layer): + """ + Construct the patch and position embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: SwinConfig, use_mask_token: bool = False, **kwargs) -> None: + super().__init__(**kwargs) + self.patch_embeddings = TFSwinPatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.embed_dim, + name="patch_embeddings", + ) + self.num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + self.embed_dim = config.embed_dim + self.use_mask_token = use_mask_token + self.use_absolute_embeddings = config.use_absolute_embeddings + + self.norm = tf.keras.layers.LayerNormalization(name="norm", epsilon=1e-5) + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + + def build(self, input_shape: tf.TensorShape) -> None: + if self.use_mask_token: + self.mask_token = self.add_weight(shape=(1, 1, self.embed_dim), initializer="zeros", name="mask_token") + else: + self.mask_token = None + + if self.use_absolute_embeddings: + self.position_embeddings = self.add_weight( + (1, self.num_patches + 1, self.embed_dim), initializer="zeros", name="positional_embeddings" + ) + else: + self.position_embeddings = None + super().build(input_shape) + + def call( + self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False + ) -> Tuple[tf.Tensor, Tuple[int, int]]: + embeddings, output_dimensions = self.patch_embeddings(pixel_values, training=training) + embeddings = self.norm(embeddings, training=training) + batch_size, seq_len, _ = embeddings.shape + + if bool_masked_pos is not None: + mask_tokens = tf.repeat(self.mask_token, batch_size, 0) + mask_tokens = tf.repeat(mask_tokens, seq_len, 1) + # replace the masked visual tokens by mask_tokens + mask = tf.expand_dims(bool_masked_pos, -1) + mask = tf.cast(mask, mask_tokens.dtype) + + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings, training=training) + + return embeddings, output_dimensions + + +class TFSwinPatchEmbeddings(tf.keras.layers.Layer): + """ + Image to Patch Embedding. + """ + + def __init__( + self, image_size: int = 224, patch_size: int = 16, num_channels: int = 3, embed_dim: int = 768, **kwargs + ) -> None: + super().__init__(**kwargs) + image_size = to_2tuple(image_size) + patch_size = to_2tuple(patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + + self.projection = tf.keras.layers.Conv2D( + filters=embed_dim, kernel_size=self.patch_size, strides=self.patch_size, padding="valid", name="projection" + ) + + def maybe_pad(self, pixel_values: tf.Tensor, height: int, width: int) -> tf.Tensor: + if width % self.patch_size[1] != 0: + pad_values = ((0, 0), (0, 0), (0, 0), (0, self.patch_size[1] - width % self.patch_size[1])) + pixel_values = tf.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = ((0, 0), (0, 0), (0, self.patch_size[0] - height % self.patch_size[0]), (0, 0)) + pixel_values = tf.pad(pixel_values, pad_values) + return pixel_values + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor, Tuple[int, int]]: + _, _, height, width = shape_list(pixel_values) + # pad the input to be divisible by self.patch_size, if needed + pixel_values = self.maybe_pad(pixel_values, height, width) + + # B,C,H,W -> B,H,W,C + pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1)) + + embeddings = self.projection(pixel_values, training=training) + + # B,H,W,C -> B,C,H,W + embeddings = tf.transpose(embeddings, (0, 3, 1, 2)) + + _, _, height, width = embeddings.shape + output_dimensions = (height, width) + + embeddings = tf.reshape(embeddings, (embeddings.shape[0], embeddings.shape[1], -1)) + embeddings = tf.transpose(embeddings, (0, 2, 1)) + return embeddings, output_dimensions + + +class TFSwinPatchMerging(tf.keras.layers.Layer): + """ + Patch Merging Layer. + + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`tf.keras.layer.Layer`, *optional*, defaults to `tf.keras.layers.LayerNormalization`): + Normalization layer class. + """ + + def __init__( + self, input_resolution: Tuple[int, int], dim: int, norm_layer: Optional[Callable] = None, **kwargs + ) -> None: + super().__init__(**kwargs) + self.input_resolution = input_resolution + self.dim = dim + self.reduction = tf.keras.layers.Dense(2 * dim, use_bias=False, name="reduction") + if norm_layer is None: + # Use same default epsilon as PyTorch + self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="norm") + else: + self.norm = norm_layer(name="norm") + + def maybe_pad(self, input_feature: tf.Tensor, height: int, width: int) -> tf.Tensor: + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = ((0, 0), (0, height % 2), (0, width % 2), (0, 0)) + input_feature = tf.pad(input_feature, pad_values) + + return input_feature + + def call(self, input_feature: tf.Tensor, input_dimensions: Tuple[int, int], training: bool = False) -> tf.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, _, num_channels = input_feature.shape + + input_feature = tf.reshape(input_feature, (batch_size, height, width, num_channels)) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = tf.concat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = tf.reshape( + input_feature, (batch_size, -1, 4 * num_channels) + ) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature, training=training) + input_feature = self.reduction(input_feature, training=training) + + return input_feature + + +class TFSwinDropPath(tf.keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = None, scale_by_keep: bool = True, **kwargs) -> None: + super(TFSwinDropPath, self).__init__(**kwargs) + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def call(self, input: tf.Tensor, training: bool = False) -> tf.Tensor: + return drop_path(input, self.drop_prob, training, self.scale_by_keep) + + +class TFSwinSelfAttention(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None: + super().__init__(**kwargs) + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = to_2tuple(config.window_size) + + # get pair-wise relative position index for each token inside the window + coords_h = tf.range(self.window_size[0]) + coords_w = tf.range(self.window_size[1]) + coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij")) + coords_flatten = tf.reshape(coords, (coords.shape[0], -1)) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = tf.transpose(relative_coords, (1, 2, 0)) + + stack_0, stack_1 = tf.unstack(relative_coords, axis=2) + stack_0 += self.window_size[0] - 1 + stack_0 *= 2 * self.window_size[1] - 1 + stack_1 += self.window_size[1] - 1 + relative_coords = tf.stack([stack_0, stack_1], axis=2) + self.relative_position_index = tf.reduce_sum(relative_coords, axis=-1) + + self.query = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=config.qkv_bias, + name="query", + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=config.qkv_bias, + name="key", + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=config.qkv_bias, + name="value", + ) + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def build(self, input_shape: tf.TensorShape) -> None: + self.relative_position_bias_table = self.add_weight( + shape=(((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1)), self.num_attention_heads), + initializer="zeros", + name="relative_position_bias_table", + ) + super().build(input_shape) + + def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: + new_x_shape = x.shape[:-1] + (self.num_attention_heads, self.attention_head_size) + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + output_attentions: bool = False, + training: bool = False, + ) -> Tuple[tf.Tensor, ...]: + batch_size, dim, _ = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, (0, 1, 3, 2))) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + relative_position_bias = tf.gather( + self.relative_position_bias_table, tf.reshape(self.relative_position_index, (-1,)) + ) + relative_position_bias = tf.reshape( + relative_position_bias, + (self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1), + ) + + relative_position_bias = tf.transpose(relative_position_bias, (2, 0, 1)) + attention_scores = attention_scores + tf.expand_dims(relative_position_bias, 0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = tf.reshape( + attention_scores, (batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim) + ) + attention_mask = tf.expand_dims(attention_mask, 1) + attention_mask = tf.expand_dims(attention_mask, 0) + attention_scores = attention_scores + attention_mask + attention_scores = tf.reshape(attention_scores, (-1, self.num_attention_heads, dim, dim)) + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, (0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + (self.all_head_size,) + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class TFSwinSelfOutput(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(dim, name="dense") + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout") + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + return hidden_states + + +class TFSwinAttention(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None: + super().__init__(**kwargs) + self.self = TFSwinSelfAttention(config, dim, num_heads, name="self") + self.self_output = TFSwinSelfOutput(config, dim, name="output") + self.pruned_heads = set() + + def prune_heads(self, heads): + """ + Prunes heads of the model. See base class PreTrainedModel heads: dict of {layer_num: list of heads to prune in + this layer} + """ + raise NotImplementedError + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + output_attentions: bool = False, + training: bool = False, + ) -> tf.Tensor: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions, training=training) + attention_output = self.self_output(self_outputs[0], hidden_states, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TFSwinIntermediate(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(int(config.mlp_ratio * dim), name="dense") + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class TFSwinOutput(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(dim, name="dense") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, "dropout") + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + return hidden_states + + +class TFSwinLayer(tf.keras.layers.Layer): + def __init__( + self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs + ) -> None: + super().__init__(**kwargs) + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.set_shift_and_window_size(input_resolution) + + self.layernorm_before = tf.keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="layernorm_before" + ) + self.attention = TFSwinAttention(config, dim, num_heads, name="attention") + self.drop_path = ( + TFSwinDropPath(config.drop_path_rate, name="drop_path") + if config.drop_path_rate > 0.0 + else tf.identity(name="drop_path") + ) + self.layernorm_after = tf.keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="layernorm_after" + ) + self.intermediate = TFSwinIntermediate(config, dim, name="intermediate") + self.swin_output = TFSwinOutput(config, dim, name="output") + + def set_shift_and_window_size(self, input_resolution: Tuple[int, int]) -> None: + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(input_resolution) + + def get_attn_mask(self, height: int, width: int) -> Optional[tf.Tensor]: + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = tf.zeros((height, width)) + height_slices = ( + (0, -self.window_size), + (-self.window_size, -self.shift_size), + (-self.shift_size, -1), + ) + width_slices = ( + (0, -self.window_size), + (-self.window_size, -self.shift_size), + (-self.shift_size, -1), + ) + + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + indices = [ + [i, j] + for i in range(height_slice[0] % height, height_slice[1] % height + 1) + for j in range(width_slice[0] % width, width_slice[1] % width + 1) + ] + if indices: + updates = tf.ones((len(indices),), dtype=img_mask.dtype) * count + img_mask = tf.tensor_scatter_nd_update(img_mask, indices, updates) + count += 1 + + img_mask = tf.expand_dims(img_mask, -1) + img_mask = tf.expand_dims(img_mask, 0) + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = tf.reshape(mask_windows, (-1, self.window_size * self.window_size)) + attn_mask = tf.expand_dims(mask_windows, 1) - tf.expand_dims(mask_windows, 2) + attn_mask = tf.where(attn_mask != 0, float(-100.0), attn_mask) + attn_mask = tf.where(attn_mask == 0, float(0.0), attn_mask) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states: tf.Tensor, height: int, width: int) -> Tuple[tf.Tensor, tf.Tensor]: + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = tf.constant([[0, 0], [0, pad_bottom], [0, pad_right], [0, 0]]) + hidden_states = tf.pad(hidden_states, pad_values) + pad_values = tf.reshape(pad_values, (-1,)) + return hidden_states, pad_values + + def call( + self, + hidden_states: tf.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[tf.Tensor] = None, + output_attentions: bool = False, + training: bool = False, + ) -> tf.Tensor: + self.set_shift_and_window_size(input_dimensions) + height, width = input_dimensions + batch_size, _, channels = hidden_states.shape + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states, training=training) + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, channels)) + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = tf.roll(hidden_states, shift=(-self.shift_size, -self.shift_size), axis=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = tf.reshape(hidden_states_windows, (-1, self.window_size * self.window_size, channels)) + attn_mask = self.get_attn_mask(height_pad, width_pad) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions, training=training + ) + + attention_output = attention_outputs[0] + + attention_windows = tf.reshape(attention_output, (-1, self.window_size, self.window_size, channels)) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = tf.roll(shifted_windows, shift=(self.shift_size, self.shift_size), axis=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :] + + attention_windows = tf.reshape(attention_windows, (batch_size, height * width, channels)) + + hidden_states = shortcut + self.drop_path(attention_windows, training=training) + + layer_output = self.layernorm_after(hidden_states, training=training) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.swin_output(layer_output, training=training) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +class TFSwinStage(tf.keras.layers.Layer): + def __init__( + self, + config: SwinConfig, + dim: int, + input_resolution: Tuple[int, int], + depth: int, + num_heads: int, + drop_path: List[float], + downsample: Optional[Callable], + **kwargs + ) -> None: + super().__init__(**kwargs) + self.config = config + self.dim = dim + self.blocks = [ + TFSwinLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + name=f"blocks.{i}", + ) + for i in range(depth) + ] + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, + dim=dim, + norm_layer=partial(tf.keras.layers.LayerNormalization, epsilon=1e-5), + name="downsample", + ) + else: + self.downsample = None + + self.pointing = False + + def call( + self, + hidden_states: tf.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = False, + training: bool = False, + ) -> Tuple[tf.Tensor, ...]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training + ) + + hidden_states = layer_outputs[0] + + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(layer_outputs[0], input_dimensions, training=training) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +class TFSwinEncoder(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs): + super().__init__(**kwargs) + self.num_layers = len(config.depths) + self.config = config + dpr = list((tf.linspace(0, 1, sum(config.depths)) * config.drop_path_rate).numpy()) + self.layers = [ + TFSwinStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=TFSwinPatchMerging if (i_layer < self.num_layers - 1) else None, + name=f"layers.{i_layer}", + ) + for i_layer in range(self.num_layers) + ] + + self.gradient_checkpointing = False + + def call( + self, + hidden_states: tf.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[tf.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + training: bool = False, + ) -> Union[Tuple[tf.Tensor, ...], TFSwinEncoderOutput]: + all_input_dimensions = () + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size)) + reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2)) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training + ) + + hidden_states = layer_outputs[0] + output_dimensions = layer_outputs[1] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + all_input_dimensions += (input_dimensions,) + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size)) + reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2)) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return TFSwinEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +class TFSwinPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SwinConfig + base_model_prefix = "swin" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _set_gradient_checkpointing(self, module, value=False) -> None: + if isinstance(module, TFSwinEncoder): + module.gradient_checkpointing = value + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), + dtype=tf.float32, + ) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + +SWIN_START_DOCSTRING = r""" + This model is a Tensorflow + [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a + regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`SwinConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SWIN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def normalize_data_format(value: str) -> str: + """ + From tensorflow addons + https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71 + """ + if value is None: + value = tf.keras.backend.image_data_format() + data_format = value.lower() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError( + 'The `data_format` argument must be one of "channels_first", "channels_last". Received: ' + str(value) + ) + return data_format + + +class AdaptiveAveragePooling1D(tf.keras.layers.Layer): + """ + Args: + Average 1D Pooling with adaptive kernel size. + output_size: An integer or tuple/list of a single integer, specifying pooled_features. + The new size of output channels. + data_format: A string, + one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch, steps, channels)` while `channels_first` corresponds + to inputs with shape `(batch, channels, steps)`. + Input shape: + - If `data_format='channels_last'`: 3D tensor with shape `(batch, steps, channels)`. + - If `data_format='channels_first'`: 3D tensor with shape `(batch, channels, steps)`. + Output shape: + - If `data_format='channels_last'`: 3D tensor with shape `(batch_size, pooled_steps, channels)`. + - If `data_format='channels_first'`: 3D tensor with shape `(batch_size, channels, pooled_steps)`. + + Adapted from [tensorflow-addon's adaptive pooling.py]( + https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/layers/adaptive_pooling.py#L90-L120 + ) + """ + + def __init__( + self, + output_size: Union[int, Iterable[int]], + reduce_function: Callable = tf.reduce_mean, + data_format: Optional[str] = None, + **kwargs, + ) -> None: + self.data_format = normalize_data_format(data_format) + self.reduce_function = reduce_function + self.output_size = (output_size,) if isinstance(output_size, int) else tuple(output_size) + super().__init__(**kwargs) + + def call(self, inputs: tf.Tensor, *args) -> None: + bins = self.output_size[0] + if self.data_format == "channels_last": + splits = tf.split(inputs, bins, axis=1) + splits = tf.stack(splits, axis=1) + out_vect = self.reduce_function(splits, axis=2) + else: + splits = tf.split(inputs, bins, axis=2) + splits = tf.stack(splits, axis=2) + out_vect = self.reduce_function(splits, axis=3) + return out_vect + + def compute_output_shape(self, input_shape: Iterable[int]) -> tf.TensorShape: + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + shape = tf.TensorShape([input_shape[0], self.output_size[0], input_shape[2]]) + else: + shape = tf.TensorShape([input_shape[0], input_shape[1], self.output_size[0]]) + return shape + + def get_config(self) -> Dict[str, Any]: + config = { + "output_size": self.output_size, + "data_format": self.data_format, + } + base_config = super().get_config() + return {**base_config, **config} + + +@add_start_docstrings( + "The bare Swin Model transformer outputting raw hidden-states without any specific head on top.", + SWIN_START_DOCSTRING, +) +class TFSwinModel(TFSwinPreTrainedModel): + def __init__( + self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs + ) -> None: + super().__init__(config, **kwargs) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) + + self.embeddings = TFSwinEmbeddings(config, use_mask_token=use_mask_token, name="embeddings") + self.encoder = TFSwinEncoder(config, self.embeddings.patch_grid, name="encoder") + + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.pooler = AdaptiveAveragePooling1D(output_size=(1,)) if add_pooling_layer else None + + def get_input_embeddings(self) -> TFSwinPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List]): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_head_mask(self, head_mask: Optional[Any]) -> List: + if head_mask is not None: + raise NotImplementedError + return [None] * len(self.config.depths) + + @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSwinModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + @unpack_inputs + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + bool_masked_pos: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask) + embedding_output, input_dimensions = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, training=training + ) + + encoder_outputs = self.encoder( + embedding_output, + input_dimensions, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output, training=training) + + pooled_output = None + if self.pooler is not None: + batch_size, _, num_features = sequence_output.shape + pooled_output = self.pooler(sequence_output) + pooled_output = tf.reshape(pooled_output, (batch_size, num_features)) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + return output + + return TFSwinModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +class PixelShuffle(tf.keras.layers.Layer): + """TF layer implementation of torch.nn.PixelShuffle""" + + def __init__( + self, + upscale_factor: int, + data_format: str = "NHWC", + trainable: bool = True, + name: str = None, + dtype=None, + dynamic: bool = False, + **kwargs + ) -> None: + super().__init__(trainable, name, dtype, dynamic, **kwargs) + if upscale_factor < 2: + raise ValueError("upscale_factor must be an integer value >= 2") + self.upscale_factor = upscale_factor + self.data_format = data_format + + def call(self, x: tf.Tensor) -> tf.Tensor: + return tf.nn.depth_to_space(x, block_size=self.upscale_factor, data_format=self.data_format) + + +class TFSwinDecoder(tf.keras.layers.Layer): + def __init__(self, config: SwinConfig, **kwargs): + super().__init__(**kwargs) + self.conv2d = tf.keras.layers.Conv2D( + filters=config.encoder_stride**2 * 3, kernel_size=1, strides=1, name="0" + ) + self._block_size = config.encoder_stride + self.pixel_shuffle = PixelShuffle(self._block_size, name="1") + + def call(self, x: tf.Tensor) -> tf.Tensor: + hidden_states = x + # B,C,H,W -> B,H,W,C + hidden_states = tf.transpose(hidden_states, (0, 2, 3, 1)) + hidden_states = self.conv2d(hidden_states) + batch_size, _, _, num_input_channels = hidden_states.shape + block_size_squared = self._block_size**2 + output_depth = int(num_input_channels / block_size_squared) + # When the number of output channels >= 2, PyTorch's PixelShuffle and + # TF's depth_to_space differ in their output as the order of channels selected for combining + # is a permutation of the other c.f. + # https://stackoverflow.com/questions/68272502/tf-depth-to-space-not-same-as-torchs-pixelshuffle-when-output-channels-1 + permutation = tf.constant( + [[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]] + ) + hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1) + hidden_states = self.pixel_shuffle(hidden_states) + # B,H,W,C -> B,C,H,W + hidden_states = tf.transpose(hidden_states, (0, 3, 1, 2)) + return hidden_states + + +@add_start_docstrings( + "Swin Model with a decoder on top for masked image modeling, as proposed in `SimMIM" + " `__.", + SWIN_START_DOCSTRING, +) +class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel): + def __init__(self, config: SwinConfig): + super().__init__(config) + + self.swin = TFSwinModel(config, add_pooling_layer=False, use_mask_token=True, name="swin") + + self.decoder = TFSwinDecoder(config, name="decoder") + + @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC) + @unpack_inputs + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + bool_masked_pos: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple, TFSwinMaskedImageModelingOutput]: + r""" + bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + + Examples: + ```python + >>> from transformers import AutoFeatureExtractor, TFSwinForMaskedImageModeling + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") + >>> model = TFSwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224") + + >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 + >>> pixel_values = feature_extractor(images=image, return_tensors="tf").pixel_values + >>> # create random boolean mask of shape (batch_size, num_patches) + >>> bool_masked_pos = tf.random.uniform((1, num_patches)) >= 0.5 + + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) + >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits + >>> list(reconstructed_pixel_values.shape) + [1, 3, 224, 224] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.swin( + pixel_values, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + # Reshape to (batch_size, num_channels, height, width) + sequence_output = tf.transpose(sequence_output, (0, 2, 1)) + batch_size, num_channels, sequence_length = sequence_output.shape + height = width = int(sequence_length**0.5) + sequence_output = tf.reshape(sequence_output, (batch_size, num_channels, height, width)) + + # Reconstruct pixel values + reconstructed_pixel_values = self.decoder(sequence_output) + + masked_im_loss = None + if bool_masked_pos is not None: + size = self.config.image_size // self.config.patch_size + bool_masked_pos = tf.reshape(bool_masked_pos, (-1, size, size)) + mask = tf.repeat(bool_masked_pos, self.config.patch_size, 1) + mask = tf.repeat(mask, self.config.patch_size, 2) + mask = tf.expand_dims(mask, 1) + mask = tf.cast(mask, tf.float32) + + reconstruction_loss = tf.keras.losses.mean_absolute_error( + # Swap axes as metric calculation reduces over the final dimension + tf.transpose(pixel_values, (1, 2, 3, 0)), + tf.transpose(reconstructed_pixel_values, (1, 2, 3, 0)), + ) + reconstruction_loss = tf.expand_dims(reconstruction_loss, 0) + total_loss = tf.reduce_sum(reconstruction_loss * mask) + num_masked_pixels = (tf.reduce_sum(mask) + 1e-5) * self.config.num_channels + masked_im_loss = total_loss / num_masked_pixels + + if not return_dict: + output = (reconstructed_pixel_values,) + outputs[2:] + return ((masked_im_loss,) + output) if masked_im_loss is not None else output + + return TFSwinMaskedImageModelingOutput( + loss=masked_im_loss, + logits=reconstructed_pixel_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) + + +@add_start_docstrings( + """ + Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + """, + SWIN_START_DOCSTRING, +) +class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: SwinConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.swin = TFSwinModel(config, name="swin") + + # Classifier head + self.classifier = ( + tf.keras.layers.Dense(config.num_labels, name="classifier") + if config.num_labels > 0 + else tf.identity(name="classifier") + ) + + @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFSwinImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + @unpack_inputs + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + head_mask: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple[tf.Tensor, ...], TFSwinImageClassifierOutput]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.swin( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output, training=training) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSwinImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index e089a267a0..d2e8d5c0cd 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -261,6 +261,9 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING = None TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None +TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = None + + TF_MODEL_FOR_MASKED_LM_MAPPING = None @@ -1887,6 +1890,37 @@ class TFSpeech2TextPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["tf"]) +TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFSwinForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFSwinForMaskedImageModeling(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFSwinModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFSwinPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index c4f73ef360..367c118c05 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -286,56 +286,76 @@ class SwinModelTest(ModelTesterMixin, unittest.TestCase): [self.model_tester.num_heads[0], window_size_squared, window_size_squared], ) + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Swin has a different seq_length + patch_size = to_2tuple(config.patch_size) + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # Swin has a different seq_length - image_size = to_2tuple(self.model_tester.image_size) - patch_size = to_2tuple(self.model_tester.patch_size) - - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - reshaped_hidden_states = outputs.reshaped_hidden_states - self.assertEqual(len(reshaped_hidden_states), expected_num_layers) - - batch_size, num_channels, height, width = reshaped_hidden_states[0].shape - reshaped_hidden_states = ( - reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) - ) - self.assertListEqual( - list(reshaped_hidden_states.shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + image_size = to_2tuple(self.model_tester.image_size) + for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True - check_hidden_states_output(inputs_dict, config, model_class) + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + def test_hidden_states_output_with_padding(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.patch_size = 3 + + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(config.patch_size) + + padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) + padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/swin/test_modeling_tf_swin.py b/tests/models/swin/test_modeling_tf_swin.py new file mode 100644 index 0000000000..88323d7fd7 --- /dev/null +++ b/tests/models/swin/test_modeling_tf_swin.py @@ -0,0 +1,376 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TF 2.0 Swin model. """ + + +import inspect +import unittest + +import numpy as np + +from transformers import SwinConfig +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.swin.modeling_tf_swin import ( + TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, + TFSwinForImageClassification, + TFSwinForMaskedImageModeling, + TFSwinModel, + to_2tuple, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoFeatureExtractor + + +class TFSwinModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + patch_size=2, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 2, 4], + window_size=2, + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + ) -> None: + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return SwinConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + depths=self.depths, + num_heads=self.num_heads, + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + use_absolute_embeddings=self.use_absolute_embeddings, + path_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFSwinModel(config=config) + result = model(pixel_values) + + expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = TFSwinForImageClassification(config) + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFSwinModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFSwinModel, + TFSwinForImageClassification, + TFSwinForMaskedImageModeling, + ) + if is_tf_available() + else () + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFSwinModelTester(self) + self.config_tester = ConfigTester(self, config_class=SwinConfig, embed_dim=37) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Swin does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), tf.keras.layers.Layer) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, tf.keras.layers.Dense)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + expected_num_attentions = len(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + window_size_squared = config.window_size**2 + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), expected_num_attentions) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + else: + # also another +1 for reshaped_hidden_states + added_hidden_states = 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Swin has a different seq_length + patch_size = to_2tuple(config.patch_size) + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + + reshaped_hidden_states = tf.reshape(reshaped_hidden_states[0], (batch_size, num_channels, height * width)) + reshaped_hidden_states = tf.transpose(reshaped_hidden_states, (0, 2, 1)) + + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = to_2tuple(self.model_tester.image_size) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + def test_inputs_requiring_padding(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.patch_size = 3 + + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(config.patch_size) + + padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) + padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFSwinModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_vision +@require_tf +class TFSwinModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224") + feature_extractor = self.default_feature_extractor + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = feature_extractor(images=image, return_tensors="tf") + + # forward pass + outputs = model(inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = tf.constant([-0.0948, -0.6454, -0.0921]) + self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))