From f8afb2b2ec5b2fe33667d0a9b7655501d5d2e19a Mon Sep 17 00:00:00 2001 From: Andi Powers Holmes Date: Thu, 2 Nov 2023 02:09:55 +1100 Subject: [PATCH] Add TensorFlow implementation of ConvNeXTv2 (#25558) * Add type annotations to TFConvNextDropPath * Use tf.debugging.assert_equal for TFConvNextEmbeddings shape check * Add TensorFlow implementation of ConvNeXTV2 * check_docstrings: add TFConvNextV2Model to exclusions TFConvNextV2Model and TFConvNextV2ForImageClassification have docstrings which are equivalent to their PyTorch cousins, but a parsing issue prevents them from passing the test. Adding exclusions for these two classes as discussed in #25558. --- docs/source/en/index.md | 2 +- docs/source/en/model_doc/convnextv2.md | 13 +- src/transformers/__init__.py | 12 + .../models/auto/modeling_tf_auto.py | 2 + .../models/convnext/modeling_tf_convnext.py | 49 +- .../models/convnextv2/__init__.py | 24 + .../convnextv2/modeling_tf_convnextv2.py | 595 ++++++++++++++++++ .../modeling_tf_efficientformer.py | 4 +- .../models/segformer/modeling_tf_segformer.py | 4 +- src/transformers/utils/dummy_tf_objects.py | 21 + .../convnextv2/test_modeling_tf_convnextv2.py | 308 +++++++++ utils/check_docstrings.py | 2 + 12 files changed, 1012 insertions(+), 24 deletions(-) create mode 100644 src/transformers/models/convnextv2/modeling_tf_convnextv2.py create mode 100644 tests/models/convnextv2/test_modeling_tf_convnextv2.py diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 5a76935b71..8aa372391d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -97,7 +97,7 @@ Flax), PyTorch, and/or TensorFlow. | [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ | | [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ | | [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ | -| [ConvNeXTV2](model_doc/convnextv2) | ✅ | ❌ | ❌ | +| [ConvNeXTV2](model_doc/convnextv2) | ✅ | ✅ | ❌ | | [CPM](model_doc/cpm) | ✅ | ✅ | ✅ | | [CPM-Ant](model_doc/cpmant) | ✅ | ❌ | ❌ | | [CTRL](model_doc/ctrl) | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/convnextv2.md b/docs/source/en/model_doc/convnextv2.md index 9479cdd56f..af08128c45 100644 --- a/docs/source/en/model_doc/convnextv2.md +++ b/docs/source/en/model_doc/convnextv2.md @@ -58,4 +58,15 @@ If you're interested in submitting a resource to be included here, please feel f ## ConvNextV2ForImageClassification [[autodoc]] ConvNextV2ForImageClassification - - forward \ No newline at end of file + - forward + +## TFConvNextV2Model + +[[autodoc]] TFConvNextV2Model + - call + + +## TFConvNextV2ForImageClassification + +[[autodoc]] TFConvNextV2ForImageClassification + - call diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 01127a5651..cb3e7b0353 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3415,6 +3415,13 @@ else: "TFConvNextPreTrainedModel", ] ) + _import_structure["models.convnextv2"].extend( + [ + "TFConvNextV2ForImageClassification", + "TFConvNextV2Model", + "TFConvNextV2PreTrainedModel", + ] + ) _import_structure["models.ctrl"].extend( [ "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -7127,6 +7134,11 @@ if TYPE_CHECKING: TFConvBertPreTrainedModel, ) from .models.convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel + from .models.convnextv2 import ( + TFConvNextV2ForImageClassification, + TFConvNextV2Model, + TFConvNextV2PreTrainedModel, + ) from .models.ctrl import ( TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, TFCTRLForSequenceClassification, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index b334dd3091..e79922f928 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -39,6 +39,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict( ("clip", "TFCLIPModel"), ("convbert", "TFConvBertModel"), ("convnext", "TFConvNextModel"), + ("convnextv2", "TFConvNextV2Model"), ("ctrl", "TFCTRLModel"), ("cvt", "TFCvtModel"), ("data2vec-vision", "TFData2VecVisionModel"), @@ -200,6 +201,7 @@ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Image-classsification ("convnext", "TFConvNextForImageClassification"), + ("convnextv2", "TFConvNextV2ForImageClassification"), ("cvt", "TFCvtForImageClassification"), ("data2vec-vision", "TFData2VecVisionForImageClassification"), ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")), diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 1629988900..59a36b3983 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import tensorflow as tf @@ -50,11 +50,11 @@ class TFConvNextDropPath(tf.keras.layers.Layer): (1) github.com:rwightman/pytorch-image-models """ - def __init__(self, drop_path, **kwargs): + def __init__(self, drop_path: float, **kwargs): super().__init__(**kwargs) self.drop_path = drop_path - def call(self, x, training=None): + def call(self, x: tf.Tensor, training=None): if training: keep_prob = 1 - self.drop_path shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) @@ -69,7 +69,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): found in src/transformers/models/swin/modeling_swin.py. """ - def __init__(self, config, **kwargs): + def __init__(self, config: ConvNextConfig, **kwargs): super().__init__(**kwargs) self.patch_embeddings = tf.keras.layers.Conv2D( filters=config.hidden_sizes[0], @@ -77,7 +77,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): strides=config.patch_size, name="patch_embeddings", kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", + bias_initializer=tf.keras.initializers.Zeros(), ) self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.num_channels = config.num_channels @@ -86,15 +86,15 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): if isinstance(pixel_values, dict): pixel_values = pixel_values["pixel_values"] - num_channels = shape_list(pixel_values)[1] - if tf.executing_eagerly() and num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - ) + tf.debugging.assert_equal( + shape_list(pixel_values)[1], + self.num_channels, + message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", + ) # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. - # shape = (batch_size, in_height, in_width, in_channels=num_channels) + # shape = (batch_size, in_height, in_width, in_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) embeddings = self.patch_embeddings(pixel_values) @@ -188,15 +188,28 @@ class TFConvNextStage(tf.keras.layers.Layer): """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks. Args: - config ([`ConvNextConfig`]): Model configuration class. - in_channels (`int`): Number of input channels. - out_channels (`int`): Number of output channels. - depth (`int`): Number of residual blocks. - drop_path_rates(`List[float]`): Stochastic depth rates for each layer. + config (`ConvNextV2Config`): + Model configuration class. + in_channels (`int`): + Number of input channels. + out_channels (`int`): + Number of output channels. + depth (`int`): + Number of residual blocks. + drop_path_rates(`List[float]`): + Stochastic depth rates for each layer. """ def __init__( - self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs + self, + config: ConvNextConfig, + in_channels: int, + out_channels: int, + kernel_size: int = 2, + stride: int = 2, + depth: int = 2, + drop_path_rates: Optional[List[float]] = None, + **kwargs, ): super().__init__(**kwargs) if in_channels != out_channels or stride > 1: @@ -215,7 +228,7 @@ class TFConvNextStage(tf.keras.layers.Layer): kernel_size=kernel_size, strides=stride, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer="zeros", + bias_initializer=tf.keras.initializers.Zeros(), name="downsampling_layer.1", ), ] diff --git a/src/transformers/models/convnextv2/__init__.py b/src/transformers/models/convnextv2/__init__.py index 9bfd6b26e0..d2a484b9b8 100644 --- a/src/transformers/models/convnextv2/__init__.py +++ b/src/transformers/models/convnextv2/__init__.py @@ -22,6 +22,7 @@ from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, is_torch_available, + is_tf_available, ) @@ -46,6 +47,17 @@ else: "ConvNextV2Backbone", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_convnextv2"] = [ + "TFConvNextV2ForImageClassification", + "TFConvNextV2Model", + "TFConvNextV2PreTrainedModel", + ] if TYPE_CHECKING: from .configuration_convnextv2 import ( @@ -67,6 +79,18 @@ if TYPE_CHECKING: ConvNextV2PreTrainedModel, ) + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_convnextv2 import ( + TFConvNextV2ForImageClassification, + TFConvNextV2Model, + TFConvNextV2PreTrainedModel, + ) + else: import sys diff --git a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py new file mode 100644 index 0000000000..863e59406f --- /dev/null +++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py @@ -0,0 +1,595 @@ +# coding=utf-8 +# Copyright 2023 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 ConvNextV2 model.""" + + +from __future__ import annotations + +from typing import List, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithNoAttention, + TFBaseModelOutputWithPooling, + TFBaseModelOutputWithPoolingAndNoAttention, + TFImageClassifierOutputWithNoAttention, +) +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_convnextv2 import ConvNextV2Config + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ConvNextV2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224" +_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + +CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/convnextv2-tiny-1k-224", + # See all ConvNextV2 models at https://huggingface.co/models?filter=convnextv2 +] + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2 +class TFConvNextV2DropPath(tf.keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path: float, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x: tf.Tensor, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFConvNextV2GRN(tf.keras.layers.Layer): + """GRN (Global Response Normalization) layer""" + + def __init__(self, config: ConvNextV2Config, dim: int, **kwargs): + super().__init__(**kwargs) + self.dim = dim + + def build(self, input_shape: tf.TensorShape = None): + # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa) + self.weight = self.add_weight( + name="weight", + shape=(1, 1, 1, self.dim), + initializer=tf.keras.initializers.Zeros(), + ) + self.bias = self.add_weight( + name="bias", + shape=(1, 1, 1, self.dim), + initializer=tf.keras.initializers.Zeros(), + ) + return super().build(input_shape) + + def call(self, hidden_states: tf.Tensor): + global_features = tf.norm(hidden_states, ord="euclidean", axis=(1, 2), keepdims=True) + norm_features = global_features / (tf.reduce_mean(global_features, axis=-1, keepdims=True) + 1e-6) + hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states + return hidden_states + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2 +class TFConvNextV2Embeddings(tf.keras.layers.Layer): + """This class is comparable to (and inspired by) the SwinEmbeddings class + found in src/transformers/models/swin/modeling_swin.py. + """ + + def __init__(self, config: ConvNextV2Config, **kwargs): + super().__init__(**kwargs) + self.patch_embeddings = tf.keras.layers.Conv2D( + filters=config.hidden_sizes[0], + kernel_size=config.patch_size, + strides=config.patch_size, + name="patch_embeddings", + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer=tf.keras.initializers.Zeros(), + ) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") + self.num_channels = config.num_channels + + def call(self, pixel_values): + if isinstance(pixel_values, dict): + pixel_values = pixel_values["pixel_values"] + + tf.debugging.assert_equal( + shape_list(pixel_values)[1], + self.num_channels, + message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", + ) + + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + embeddings = self.patch_embeddings(pixel_values) + embeddings = self.layernorm(embeddings) + return embeddings + + +class TFConvNextV2Layer(tf.keras.layers.Layer): + """This corresponds to the `Block` class in the original implementation. + + There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C, + H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back + + The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow + NHWC ordering, we can just apply the operations straight-away without the permutation. + + Args: + config (`ConvNextV2Config`): + Model configuration class. + dim (`int`): + Number of input channels. + drop_path (`float`, defaults to 0.0): + Stochastic depth rate. + """ + + def __init__(self, config: ConvNextV2Config, dim: int, drop_path: float = 0.0, **kwargs): + super().__init__(**kwargs) + self.dim = dim + self.config = config + self.dwconv = tf.keras.layers.Conv2D( + filters=dim, + kernel_size=7, + padding="same", + groups=dim, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer=tf.keras.initializers.Zeros(), + name="dwconv", + ) # depthwise conv + self.layernorm = tf.keras.layers.LayerNormalization( + epsilon=1e-6, + name="layernorm", + ) + self.pwconv1 = tf.keras.layers.Dense( + units=4 * dim, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer=tf.keras.initializers.Zeros(), + name="pwconv1", + ) # pointwise/1x1 convs, implemented with linear layers + self.act = get_tf_activation(config.hidden_act) + self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn") + self.pwconv2 = tf.keras.layers.Dense( + units=dim, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer=tf.keras.initializers.Zeros(), + name="pwconv2", + ) + # Using `layers.Activation` instead of `tf.identity` to better control `training` + # behaviour. + self.drop_path = ( + TFConvNextV2DropPath(drop_path, name="drop_path") + if drop_path > 0.0 + else tf.keras.layers.Activation("linear", name="drop_path") + ) + + def call(self, hidden_states, training=False): + input = hidden_states + x = self.dwconv(hidden_states) + x = self.layernorm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.grn(x) + x = self.pwconv2(x) + x = self.drop_path(x, training=training) + x = input + x + return x + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2 +class TFConvNextV2Stage(tf.keras.layers.Layer): + """ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks. + + Args: + config (`ConvNextV2V2Config`): + Model configuration class. + in_channels (`int`): + Number of input channels. + out_channels (`int`): + Number of output channels. + depth (`int`): + Number of residual blocks. + drop_path_rates(`List[float]`): + Stochastic depth rates for each layer. + """ + + def __init__( + self, + config: ConvNextV2Config, + in_channels: int, + out_channels: int, + kernel_size: int = 2, + stride: int = 2, + depth: int = 2, + drop_path_rates: Optional[List[float]] = None, + **kwargs, + ): + super().__init__(**kwargs) + if in_channels != out_channels or stride > 1: + self.downsampling_layer = [ + tf.keras.layers.LayerNormalization( + epsilon=1e-6, + name="downsampling_layer.0", + ), + # Inputs to this layer will follow NHWC format since we + # transposed the inputs from NCHW to NHWC in the `TFConvNextV2Embeddings` + # layer. All the outputs throughout the model will be in NHWC + # from this point on until the output where we again change to + # NCHW. + tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=kernel_size, + strides=stride, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer=tf.keras.initializers.Zeros(), + name="downsampling_layer.1", + ), + ] + else: + self.downsampling_layer = [tf.identity] + + drop_path_rates = drop_path_rates or [0.0] * depth + self.layers = [ + TFConvNextV2Layer( + config, + dim=out_channels, + drop_path=drop_path_rates[j], + name=f"layers.{j}", + ) + for j in range(depth) + ] + + def call(self, hidden_states): + for layer in self.downsampling_layer: + hidden_states = layer(hidden_states) + for layer in self.layers: + hidden_states = layer(hidden_states) + return hidden_states + + +class TFConvNextV2Encoder(tf.keras.layers.Layer): + def __init__(self, config: ConvNextV2Config, **kwargs): + super().__init__(**kwargs) + self.stages = [] + drop_path_rates = tf.linspace(0.0, config.drop_path_rate, sum(config.depths)) + drop_path_rates = tf.split(drop_path_rates, config.depths) + drop_path_rates = [x.numpy().tolist() for x in drop_path_rates] + prev_chs = config.hidden_sizes[0] + for i in range(config.num_stages): + out_chs = config.hidden_sizes[i] + stage = TFConvNextV2Stage( + config, + in_channels=prev_chs, + out_channels=out_chs, + stride=2 if i > 0 else 1, + depth=config.depths[i], + drop_path_rates=drop_path_rates[i], + name=f"stages.{i}", + ) + self.stages.append(stage) + prev_chs = out_chs + + def call( + self, + hidden_states: tf.Tensor, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, TFBaseModelOutputWithNoAttention]: + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.stages): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = layer_module(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + + +@keras_serializable +class TFConvNextV2MainLayer(tf.keras.layers.Layer): + config_class = ConvNextV2Config + + def __init__(self, config: ConvNextV2Config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embeddings = TFConvNextV2Embeddings(config, name="embeddings") + self.encoder = TFConvNextV2Encoder(config, name="encoder") + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + # We are setting the `data_format` like so because from here on we will revert to the + # NCHW output format + self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_last") + + @unpack_inputs + def call( + self, + pixel_values: TFModelInputType | None = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values, training=training) + + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + last_hidden_state = encoder_outputs[0] + + # Change to NCHW output format have uniformity in the modules + pooled_output = self.pooler(last_hidden_state) + last_hidden_state = tf.transpose(last_hidden_state, perm=(0, 3, 1, 2)) + pooled_output = self.layernorm(pooled_output) + + # Change the other hidden state outputs to NCHW as well + if output_hidden_states: + hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]]) + + if not return_dict: + hidden_states = hidden_states if output_hidden_states else () + return (last_hidden_state, pooled_output) + hidden_states + + return TFBaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, + ) + + +class TFConvNextV2PreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ConvNextV2Config + base_model_prefix = "convnextv2" + main_input_name = "pixel_values" + + +CONVNEXTV2_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`ConvNextV2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONVNEXTV2_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`ConvNextImageProcessor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to `True`. +""" + + +@add_start_docstrings( + "The bare ConvNextV2 model outputting raw features without any specific head on top.", + CONVNEXTV2_START_DOCSTRING, +) +class TFConvNextV2Model(TFConvNextV2PreTrainedModel): + def __init__(self, config: ConvNextV2Config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def call( + self, + pixel_values: TFModelInputType | None = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPoolingAndNoAttention, Tuple[tf.Tensor]]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + outputs = self.convnextv2( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return outputs[:] + + return TFBaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=outputs.last_hidden_state, + pooler_output=outputs.pooler_output, + hidden_states=outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + CONVNEXTV2_START_DOCSTRING, +) +class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: ConvNextV2Config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2") + + # Classifier head + self.classifier = tf.keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer=tf.keras.initializers.Zeros(), + name="classifier", + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: TFModelInputType | None = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + outputs = self.convnextv2( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFImageClassifierOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py index 1907af388f..c44a153428 100644 --- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py @@ -337,11 +337,11 @@ class TFEfficientFormerDropPath(tf.keras.layers.Layer): (1) github.com:rwightman/pytorch-image-models """ - def __init__(self, drop_path, **kwargs): + def __init__(self, drop_path: float, **kwargs): super().__init__(**kwargs) self.drop_path = drop_path - def call(self, x, training=None): + def call(self, x: tf.Tensor, training=None): if training: keep_prob = 1 - self.drop_path shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 9d6132928a..b7fd4d2258 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -62,11 +62,11 @@ class TFSegformerDropPath(tf.keras.layers.Layer): (1) github.com:rwightman/pytorch-image-models """ - def __init__(self, drop_path, **kwargs): + def __init__(self, drop_path: float, **kwargs): super().__init__(**kwargs) self.drop_path = drop_path - def call(self, x, training=None): + def call(self, x: tf.Tensor, training=None): if training: keep_prob = 1 - self.drop_path shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 972ab49c0f..5bc238f542 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -836,6 +836,27 @@ class TFConvNextPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["tf"]) +class TFConvNextV2ForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFConvNextV2Model(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFConvNextV2PreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/convnextv2/test_modeling_tf_convnextv2.py b/tests/models/convnextv2/test_modeling_tf_convnextv2.py new file mode 100644 index 0000000000..5f0bd2f29d --- /dev/null +++ b/tests/models/convnextv2/test_modeling_tf_convnextv2.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow ConvNext model. """ + +from __future__ import annotations + +import inspect +import unittest +from typing import List, Tuple + +import numpy as np + +from transformers import ConvNextV2Config +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFConvNextV2ForImageClassification, TFConvNextV2Model + + +if is_vision_available(): + from PIL import Image + + from transformers import ConvNextImageProcessor + + +class TFConvNextV2ModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + num_stages=4, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_stages = num_stages + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ConvNextV2Config( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_stages=self.num_stages, + hidden_act=self.hidden_act, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFConvNextV2Model(config=config) + result = model(pixel_values, training=False) + # expected last hidden states: batch_size, channels, height // 32, width // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = TFConvNextV2ForImageClassification(config) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFConvNextV2ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (TFConvNextV2Model, TFConvNextV2ForImageClassification) if is_tf_available() else () + pipeline_model_mapping = ( + {"feature-extraction": TFConvNextV2Model, "image-classification": TFConvNextV2ForImageClassification} + if is_tf_available() + else {} + ) + + test_pruning = False + test_onnx = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = TFConvNextV2ModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=ConvNextV2Config, + has_text_modality=False, + hidden_size=37, + ) + + @unittest.skip(reason="ConvNext does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + @slow + def test_keras_fit(self): + super().test_keras_fit() + + @unittest.skip(reason="ConvNext does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + def test_dataset_conversion(self): + super().test_dataset_conversion() + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Since ConvNext does not have any attention we need to rewrite this test. + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}" + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-1k-224") + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFConvNextV2ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + ConvNextImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = TFConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224") + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = np.array([0.9996, 0.1966, -0.4386]) + + self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4)) diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 3d8a2881bf..cddd04ac65 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -533,6 +533,8 @@ OBJECTS_TO_IGNORE = [ "TFConvBertModel", "TFConvNextForImageClassification", "TFConvNextModel", + "TFConvNextV2Model", # Parsing issue. Equivalent to PT ConvNextV2Model, see PR #25558 + "TFConvNextV2ForImageClassification", "TFCvtForImageClassification", "TFCvtModel", "TFDPRReader",