Add TensorFlow implementation of ConvNeXTv2 (#25558)

* Add type annotations to TFConvNextDropPath * Use tf.debugging.assert_equal for TFConvNextEmbeddings shape check * Add TensorFlow implementation of ConvNeXTV2 * check_docstrings: add TFConvNextV2Model to exclusions TFConvNextV2Model and TFConvNextV2ForImageClassification have docstrings which are equivalent to their PyTorch cousins, but a parsing issue prevents them from passing the test. Adding exclusions for these two classes as discussed in #25558.
2023-11-02 02:09:55 +11:00
parent 391d14e810
commit f8afb2b2ec
12 changed files with 1012 additions and 24 deletions
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -97,7 +97,7 @@ Flax), PyTorch, and/or TensorFlow.
 |              [Conditional DETR](model_doc/conditional_detr)              |       ✅        |         ❌         |      ❌      |
 |                      [ConvBERT](model_doc/convbert)                      |       ✅        |         ✅         |      ❌      |
 |                      [ConvNeXT](model_doc/convnext)                      |       ✅        |         ✅         |      ❌      |
-|                    [ConvNeXTV2](model_doc/convnextv2)                    |       ✅        |         ❌         |      ❌      |
+|                    [ConvNeXTV2](model_doc/convnextv2)                    |       ✅        |         ✅         |      ❌      |
 |                           [CPM](model_doc/cpm)                           |       ✅        |         ✅         |      ✅      |
 |                       [CPM-Ant](model_doc/cpmant)                        |       ✅        |         ❌         |      ❌      |
 |                          [CTRL](model_doc/ctrl)                          |       ✅        |         ✅         |      ❌      |
--- a/docs/source/en/model_doc/convnextv2.md
+++ b/docs/source/en/model_doc/convnextv2.md
@@ -59,3 +59,14 @@ If you're interested in submitting a resource to be included here, please feel f

 [[autodoc]] ConvNextV2ForImageClassification
    - forward
+
+## TFConvNextV2Model
+
+[[autodoc]] TFConvNextV2Model
+    - call
+
+
+## TFConvNextV2ForImageClassification
+
+[[autodoc]] TFConvNextV2ForImageClassification
+    - call
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -3415,6 +3415,13 @@ else:
            "TFConvNextPreTrainedModel",
        ]
    )
+    _import_structure["models.convnextv2"].extend(
+        [
+            "TFConvNextV2ForImageClassification",
+            "TFConvNextV2Model",
+            "TFConvNextV2PreTrainedModel",
+        ]
+    )
    _import_structure["models.ctrl"].extend(
        [
            "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -7127,6 +7134,11 @@ if TYPE_CHECKING:
            TFConvBertPreTrainedModel,
        )
        from .models.convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel
+        from .models.convnextv2 import (
+            TFConvNextV2ForImageClassification,
+            TFConvNextV2Model,
+            TFConvNextV2PreTrainedModel,
+        )
        from .models.ctrl import (
            TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFCTRLForSequenceClassification,
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -39,6 +39,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
        ("clip", "TFCLIPModel"),
        ("convbert", "TFConvBertModel"),
        ("convnext", "TFConvNextModel"),
+        ("convnextv2", "TFConvNextV2Model"),
        ("ctrl", "TFCTRLModel"),
        ("cvt", "TFCvtModel"),
        ("data2vec-vision", "TFData2VecVisionModel"),
@@ -200,6 +201,7 @@ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
    [
        # Model for Image-classsification
        ("convnext", "TFConvNextForImageClassification"),
+        ("convnextv2", "TFConvNextV2ForImageClassification"),
        ("cvt", "TFCvtForImageClassification"),
        ("data2vec-vision", "TFData2VecVisionForImageClassification"),
        ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -17,7 +17,7 @@

 from __future__ import annotations

-from typing import Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import tensorflow as tf
@@ -50,11 +50,11 @@ class TFConvNextDropPath(tf.keras.layers.Layer):
        (1) github.com:rwightman/pytorch-image-models
    """

-    def __init__(self, drop_path, **kwargs):
+    def __init__(self, drop_path: float, **kwargs):
        super().__init__(**kwargs)
        self.drop_path = drop_path

-    def call(self, x, training=None):
+    def call(self, x: tf.Tensor, training=None):
        if training:
            keep_prob = 1 - self.drop_path
            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
@@ -69,7 +69,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
    found in src/transformers/models/swin/modeling_swin.py.
    """

-    def __init__(self, config, **kwargs):
+    def __init__(self, config: ConvNextConfig, **kwargs):
        super().__init__(**kwargs)
        self.patch_embeddings = tf.keras.layers.Conv2D(
            filters=config.hidden_sizes[0],
@@ -77,7 +77,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
            strides=config.patch_size,
            name="patch_embeddings",
            kernel_initializer=get_initializer(config.initializer_range),
-            bias_initializer="zeros",
+            bias_initializer=tf.keras.initializers.Zeros(),
        )
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
        self.num_channels = config.num_channels
@@ -86,15 +86,15 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
        if isinstance(pixel_values, dict):
            pixel_values = pixel_values["pixel_values"]

-        num_channels = shape_list(pixel_values)[1]
-        if tf.executing_eagerly() and num_channels != self.num_channels:
-            raise ValueError(
-                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+        tf.debugging.assert_equal(
+            shape_list(pixel_values)[1],
+            self.num_channels,
+            message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
        )

        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
-        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        # shape = (batch_size, in_height, in_width, in_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        embeddings = self.patch_embeddings(pixel_values)
@@ -188,15 +188,28 @@ class TFConvNextStage(tf.keras.layers.Layer):
    """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
-        config ([`ConvNextConfig`]): Model configuration class.
-        in_channels (`int`): Number of input channels.
-        out_channels (`int`): Number of output channels.
-        depth (`int`): Number of residual blocks.
-        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
+        config (`ConvNextV2Config`):
+            Model configuration class.
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`):
+            Number of output channels.
+        depth (`int`):
+            Number of residual blocks.
+        drop_path_rates(`List[float]`):
+            Stochastic depth rates for each layer.
    """

    def __init__(
-        self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None, **kwargs
+        self,
+        config: ConvNextConfig,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 2,
+        stride: int = 2,
+        depth: int = 2,
+        drop_path_rates: Optional[List[float]] = None,
+        **kwargs,
    ):
        super().__init__(**kwargs)
        if in_channels != out_channels or stride > 1:
@@ -215,7 +228,7 @@ class TFConvNextStage(tf.keras.layers.Layer):
                    kernel_size=kernel_size,
                    strides=stride,
                    kernel_initializer=get_initializer(config.initializer_range),
-                    bias_initializer="zeros",
+                    bias_initializer=tf.keras.initializers.Zeros(),
                    name="downsampling_layer.1",
                ),
            ]
--- a/src/transformers/models/convnextv2/init.py
+++ b/src/transformers/models/convnextv2/init.py
@@ -22,6 +22,7 @@ from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
+    is_tf_available,
 )


@@ -46,6 +47,17 @@ else:
        "ConvNextV2Backbone",
    ]

+try:
+    if not is_tf_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_tf_convnextv2"] = [
+        "TFConvNextV2ForImageClassification",
+        "TFConvNextV2Model",
+        "TFConvNextV2PreTrainedModel",
+    ]

 if TYPE_CHECKING:
    from .configuration_convnextv2 import (
@@ -67,6 +79,18 @@ if TYPE_CHECKING:
            ConvNextV2PreTrainedModel,
        )

+    try:
+        if not is_tf_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_tf_convnextv2 import (
+            TFConvNextV2ForImageClassification,
+            TFConvNextV2Model,
+            TFConvNextV2PreTrainedModel,
+        )
+
 else:
    import sys

--- a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
+++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
@@ -0,0 +1,595 @@
+# coding=utf-8
+# Copyright 2023 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 ConvNextV2 model."""
+
+
+from __future__ import annotations
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithNoAttention,
+    TFBaseModelOutputWithPooling,
+    TFBaseModelOutputWithPoolingAndNoAttention,
+    TFImageClassifierOutputWithNoAttention,
+)
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import shape_list
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_convnextv2 import ConvNextV2Config
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "ConvNextV2Config"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "facebook/convnextv2-tiny-1k-224",
+    # See all ConvNextV2 models at https://huggingface.co/models?filter=convnextv2
+]
+
+
+# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2
+class TFConvNextV2DropPath(tf.keras.layers.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    References:
+        (1) github.com:rwightman/pytorch-image-models
+    """
+
+    def __init__(self, drop_path: float, **kwargs):
+        super().__init__(**kwargs)
+        self.drop_path = drop_path
+
+    def call(self, x: tf.Tensor, training=None):
+        if training:
+            keep_prob = 1 - self.drop_path
+            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+            random_tensor = tf.floor(random_tensor)
+            return (x / keep_prob) * random_tensor
+        return x
+
+
+class TFConvNextV2GRN(tf.keras.layers.Layer):
+    """GRN (Global Response Normalization) layer"""
+
+    def __init__(self, config: ConvNextV2Config, dim: int, **kwargs):
+        super().__init__(**kwargs)
+        self.dim = dim
+
+    def build(self, input_shape: tf.TensorShape = None):
+        # PT's `nn.Parameters` must be mapped to a TF layer weight to inherit the same name hierarchy (and vice-versa)
+        self.weight = self.add_weight(
+            name="weight",
+            shape=(1, 1, 1, self.dim),
+            initializer=tf.keras.initializers.Zeros(),
+        )
+        self.bias = self.add_weight(
+            name="bias",
+            shape=(1, 1, 1, self.dim),
+            initializer=tf.keras.initializers.Zeros(),
+        )
+        return super().build(input_shape)
+
+    def call(self, hidden_states: tf.Tensor):
+        global_features = tf.norm(hidden_states, ord="euclidean", axis=(1, 2), keepdims=True)
+        norm_features = global_features / (tf.reduce_mean(global_features, axis=-1, keepdims=True) + 1e-6)
+        hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states
+        return hidden_states
+
+
+# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2
+class TFConvNextV2Embeddings(tf.keras.layers.Layer):
+    """This class is comparable to (and inspired by) the SwinEmbeddings class
+    found in src/transformers/models/swin/modeling_swin.py.
+    """
+
+    def __init__(self, config: ConvNextV2Config, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_embeddings = tf.keras.layers.Conv2D(
+            filters=config.hidden_sizes[0],
+            kernel_size=config.patch_size,
+            strides=config.patch_size,
+            name="patch_embeddings",
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer=tf.keras.initializers.Zeros(),
+        )
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
+        self.num_channels = config.num_channels
+
+    def call(self, pixel_values):
+        if isinstance(pixel_values, dict):
+            pixel_values = pixel_values["pixel_values"]
+
+        tf.debugging.assert_equal(
+            shape_list(pixel_values)[1],
+            self.num_channels,
+            message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
+        )
+
+        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        embeddings = self.patch_embeddings(pixel_values)
+        embeddings = self.layernorm(embeddings)
+        return embeddings
+
+
+class TFConvNextV2Layer(tf.keras.layers.Layer):
+    """This corresponds to the `Block` class in the original implementation.
+
+    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
+    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
+
+    The authors used (2) as they find it slightly faster in PyTorch. Since we already permuted the inputs to follow
+    NHWC ordering, we can just apply the operations straight-away without the permutation.
+
+    Args:
+        config (`ConvNextV2Config`):
+            Model configuration class.
+        dim (`int`):
+            Number of input channels.
+        drop_path (`float`, defaults to 0.0):
+            Stochastic depth rate.
+    """
+
+    def __init__(self, config: ConvNextV2Config, dim: int, drop_path: float = 0.0, **kwargs):
+        super().__init__(**kwargs)
+        self.dim = dim
+        self.config = config
+        self.dwconv = tf.keras.layers.Conv2D(
+            filters=dim,
+            kernel_size=7,
+            padding="same",
+            groups=dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer=tf.keras.initializers.Zeros(),
+            name="dwconv",
+        )  # depthwise conv
+        self.layernorm = tf.keras.layers.LayerNormalization(
+            epsilon=1e-6,
+            name="layernorm",
+        )
+        self.pwconv1 = tf.keras.layers.Dense(
+            units=4 * dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer=tf.keras.initializers.Zeros(),
+            name="pwconv1",
+        )  # pointwise/1x1 convs, implemented with linear layers
+        self.act = get_tf_activation(config.hidden_act)
+        self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn")
+        self.pwconv2 = tf.keras.layers.Dense(
+            units=dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer=tf.keras.initializers.Zeros(),
+            name="pwconv2",
+        )
+        # Using `layers.Activation` instead of `tf.identity` to better control `training`
+        # behaviour.
+        self.drop_path = (
+            TFConvNextV2DropPath(drop_path, name="drop_path")
+            if drop_path > 0.0
+            else tf.keras.layers.Activation("linear", name="drop_path")
+        )
+
+    def call(self, hidden_states, training=False):
+        input = hidden_states
+        x = self.dwconv(hidden_states)
+        x = self.layernorm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.grn(x)
+        x = self.pwconv2(x)
+        x = self.drop_path(x, training=training)
+        x = input + x
+        return x
+
+
+# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
+class TFConvNextV2Stage(tf.keras.layers.Layer):
+    """ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks.
+
+    Args:
+        config (`ConvNextV2V2Config`):
+            Model configuration class.
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`):
+            Number of output channels.
+        depth (`int`):
+            Number of residual blocks.
+        drop_path_rates(`List[float]`):
+            Stochastic depth rates for each layer.
+    """
+
+    def __init__(
+        self,
+        config: ConvNextV2Config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 2,
+        stride: int = 2,
+        depth: int = 2,
+        drop_path_rates: Optional[List[float]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if in_channels != out_channels or stride > 1:
+            self.downsampling_layer = [
+                tf.keras.layers.LayerNormalization(
+                    epsilon=1e-6,
+                    name="downsampling_layer.0",
+                ),
+                # Inputs to this layer will follow NHWC format since we
+                # transposed the inputs from NCHW to NHWC in the `TFConvNextV2Embeddings`
+                # layer. All the outputs throughout the model will be in NHWC
+                # from this point on until the output where we again change to
+                # NCHW.
+                tf.keras.layers.Conv2D(
+                    filters=out_channels,
+                    kernel_size=kernel_size,
+                    strides=stride,
+                    kernel_initializer=get_initializer(config.initializer_range),
+                    bias_initializer=tf.keras.initializers.Zeros(),
+                    name="downsampling_layer.1",
+                ),
+            ]
+        else:
+            self.downsampling_layer = [tf.identity]
+
+        drop_path_rates = drop_path_rates or [0.0] * depth
+        self.layers = [
+            TFConvNextV2Layer(
+                config,
+                dim=out_channels,
+                drop_path=drop_path_rates[j],
+                name=f"layers.{j}",
+            )
+            for j in range(depth)
+        ]
+
+    def call(self, hidden_states):
+        for layer in self.downsampling_layer:
+            hidden_states = layer(hidden_states)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class TFConvNextV2Encoder(tf.keras.layers.Layer):
+    def __init__(self, config: ConvNextV2Config, **kwargs):
+        super().__init__(**kwargs)
+        self.stages = []
+        drop_path_rates = tf.linspace(0.0, config.drop_path_rate, sum(config.depths))
+        drop_path_rates = tf.split(drop_path_rates, config.depths)
+        drop_path_rates = [x.numpy().tolist() for x in drop_path_rates]
+        prev_chs = config.hidden_sizes[0]
+        for i in range(config.num_stages):
+            out_chs = config.hidden_sizes[i]
+            stage = TFConvNextV2Stage(
+                config,
+                in_channels=prev_chs,
+                out_channels=out_chs,
+                stride=2 if i > 0 else 1,
+                depth=config.depths[i],
+                drop_path_rates=drop_path_rates[i],
+                name=f"stages.{i}",
+            )
+            self.stages.append(stage)
+            prev_chs = out_chs
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[Tuple, TFBaseModelOutputWithNoAttention]:
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, layer_module in enumerate(self.stages):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            hidden_states = layer_module(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+
+        return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
+
+
+@keras_serializable
+class TFConvNextV2MainLayer(tf.keras.layers.Layer):
+    config_class = ConvNextV2Config
+
+    def __init__(self, config: ConvNextV2Config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.embeddings = TFConvNextV2Embeddings(config, name="embeddings")
+        self.encoder = TFConvNextV2Encoder(config, name="encoder")
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        # We are setting the `data_format` like so because from here on we will revert to the
+        # NCHW output format
+        self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_last")
+
+    @unpack_inputs
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values, training=training)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+
+        # Change to NCHW output format have uniformity in the modules
+        pooled_output = self.pooler(last_hidden_state)
+        last_hidden_state = tf.transpose(last_hidden_state, perm=(0, 3, 1, 2))
+        pooled_output = self.layernorm(pooled_output)
+
+        # Change the other hidden state outputs to NCHW as well
+        if output_hidden_states:
+            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
+
+        if not return_dict:
+            hidden_states = hidden_states if output_hidden_states else ()
+            return (last_hidden_state, pooled_output) + hidden_states
+
+        return TFBaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
+        )
+
+
+class TFConvNextV2PreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = ConvNextV2Config
+    base_model_prefix = "convnextv2"
+    main_input_name = "pixel_values"
+
+
+CONVNEXTV2_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`ConvNextV2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CONVNEXTV2_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`ConvNextImageProcessor.__call__`] for details.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to `True`.
+"""
+
+
+@add_start_docstrings(
+    "The bare ConvNextV2 model outputting raw features without any specific head on top.",
+    CONVNEXTV2_START_DOCSTRING,
+)
+class TFConvNextV2Model(TFConvNextV2PreTrainedModel):
+    def __init__(self, config: ConvNextV2Config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPoolingAndNoAttention,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutputWithPoolingAndNoAttention, Tuple[tf.Tensor]]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        outputs = self.convnextv2(
+            pixel_values=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if not return_dict:
+            return outputs[:]
+
+        return TFBaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=outputs.last_hidden_state,
+            pooler_output=outputs.pooler_output,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """
+    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """,
+    CONVNEXTV2_START_DOCSTRING,
+)
+class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: ConvNextV2Config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+        self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2")
+
+        # Classifier head
+        self.classifier = tf.keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer=tf.keras.initializers.Zeros(),
+            name="classifier",
+        )
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=TFImageClassifierOutputWithNoAttention,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: Optional[bool] = False,
+    ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        outputs = self.convnextv2(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+        logits = self.classifier(pooled_output)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFImageClassifierOutputWithNoAttention(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
--- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py
+++ b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py
@@ -337,11 +337,11 @@ class TFEfficientFormerDropPath(tf.keras.layers.Layer):
        (1) github.com:rwightman/pytorch-image-models
    """

-    def __init__(self, drop_path, **kwargs):
+    def __init__(self, drop_path: float, **kwargs):
        super().__init__(**kwargs)
        self.drop_path = drop_path

-    def call(self, x, training=None):
+    def call(self, x: tf.Tensor, training=None):
        if training:
            keep_prob = 1 - self.drop_path
            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
--- a/src/transformers/models/segformer/modeling_tf_segformer.py
+++ b/src/transformers/models/segformer/modeling_tf_segformer.py
@@ -62,11 +62,11 @@ class TFSegformerDropPath(tf.keras.layers.Layer):
        (1) github.com:rwightman/pytorch-image-models
    """

-    def __init__(self, drop_path, **kwargs):
+    def __init__(self, drop_path: float, **kwargs):
        super().__init__(**kwargs)
        self.drop_path = drop_path

-    def call(self, x, training=None):
+    def call(self, x: tf.Tensor, training=None):
        if training:
            keep_prob = 1 - self.drop_path
            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -836,6 +836,27 @@ class TFConvNextPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["tf"])


+class TFConvNextV2ForImageClassification(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFConvNextV2Model(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFConvNextV2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None


--- a/tests/models/convnextv2/test_modeling_tf_convnextv2.py
+++ b/tests/models/convnextv2/test_modeling_tf_convnextv2.py
@@ -0,0 +1,308 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the TensorFlow ConvNext model. """
+
+from __future__ import annotations
+
+import inspect
+import unittest
+from typing import List, Tuple
+
+import numpy as np
+
+from transformers import ConvNextV2Config
+from transformers.testing_utils import require_tf, require_vision, slow
+from transformers.utils import cached_property, is_tf_available, is_vision_available
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import TFConvNextV2ForImageClassification, TFConvNextV2Model
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import ConvNextImageProcessor
+
+
+class TFConvNextV2ModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        image_size=32,
+        num_channels=3,
+        num_stages=4,
+        hidden_sizes=[10, 20, 30, 40],
+        depths=[2, 2, 3, 2],
+        is_training=True,
+        use_labels=True,
+        intermediate_size=37,
+        hidden_act="gelu",
+        type_sequence_label_size=10,
+        initializer_range=0.02,
+        num_labels=3,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.num_channels = num_channels
+        self.num_stages = num_stages
+        self.hidden_sizes = hidden_sizes
+        self.depths = depths
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+
+        labels = None
+        if self.use_labels:
+            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+
+        config = self.get_config()
+
+        return config, pixel_values, labels
+
+    def get_config(self):
+        return ConvNextV2Config(
+            num_channels=self.num_channels,
+            hidden_sizes=self.hidden_sizes,
+            depths=self.depths,
+            num_stages=self.num_stages,
+            hidden_act=self.hidden_act,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, pixel_values, labels):
+        model = TFConvNextV2Model(config=config)
+        result = model(pixel_values, training=False)
+        # expected last hidden states: batch_size, channels, height // 32, width // 32
+        self.parent.assertEqual(
+            result.last_hidden_state.shape,
+            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
+        )
+
+    def create_and_check_for_image_classification(self, config, pixel_values, labels):
+        config.num_labels = self.type_sequence_label_size
+        model = TFConvNextV2ForImageClassification(config)
+        result = model(pixel_values, labels=labels, training=False)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values, labels = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_tf
+class TFConvNextV2ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds,
+    attention_mask and seq_length.
+    """
+
+    all_model_classes = (TFConvNextV2Model, TFConvNextV2ForImageClassification) if is_tf_available() else ()
+    pipeline_model_mapping = (
+        {"feature-extraction": TFConvNextV2Model, "image-classification": TFConvNextV2ForImageClassification}
+        if is_tf_available()
+        else {}
+    )
+
+    test_pruning = False
+    test_onnx = False
+    test_resize_embeddings = False
+    test_head_masking = False
+    has_attentions = False
+
+    def setUp(self):
+        self.model_tester = TFConvNextV2ModelTester(self)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=ConvNextV2Config,
+            has_text_modality=False,
+            hidden_size=37,
+        )
+
+    @unittest.skip(reason="ConvNext does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skipIf(
+        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
+        reason="TF does not support backprop for grouped convolutions on CPU.",
+    )
+    @slow
+    def test_keras_fit(self):
+        super().test_keras_fit()
+
+    @unittest.skip(reason="ConvNext does not support input and output embeddings")
+    def test_model_common_attributes(self):
+        pass
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.call)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skipIf(
+        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
+        reason="TF does not support backprop for grouped convolutions on CPU.",
+    )
+    def test_dataset_conversion(self):
+        super().test_dataset_conversion()
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+            expected_num_stages = self.model_tester.num_stages
+            self.assertEqual(len(hidden_states), expected_num_stages + 1)
+
+            # ConvNext's feature maps are of shape (batch_size, num_channels, height, width)
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [self.model_tester.image_size // 4, self.model_tester.image_size // 4],
+            )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    # Since ConvNext does not have any attention we need to rewrite this test.
+    def test_model_outputs_equivalence(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs)
+            dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
+
+            def recursive_check(tuple_object, dict_object):
+                if isinstance(tuple_object, (List, Tuple)):
+                    for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                        recursive_check(tuple_iterable_value, dict_iterable_value)
+                elif tuple_object is None:
+                    return
+                else:
+                    self.assertTrue(
+                        all(tf.equal(tuple_object, dict_object)),
+                        msg=(
+                            "Tuple and dict output are not equal. Difference:"
+                            f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
+                        ),
+                    )
+
+                recursive_check(tuple_output, dict_output)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
+
+    def test_for_image_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        model = TFConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-1k-224")
+        self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_tf
+@require_vision
+class TFConvNextV2ModelIntegrationTest(unittest.TestCase):
+    @cached_property
+    def default_image_processor(self):
+        return (
+            ConvNextImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
+            if is_vision_available()
+            else None
+        )
+
+    @slow
+    def test_inference_image_classification_head(self):
+        model = TFConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224")
+
+        image_processor = self.default_image_processor
+        image = prepare_img()
+        inputs = image_processor(images=image, return_tensors="tf")
+
+        # forward pass
+        outputs = model(**inputs)
+
+        # verify the logits
+        expected_shape = tf.TensorShape((1, 1000))
+        self.assertEqual(outputs.logits.shape, expected_shape)
+
+        expected_slice = np.array([0.9996, 0.1966, -0.4386])
+
+        self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4))
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -533,6 +533,8 @@ OBJECTS_TO_IGNORE = [
    "TFConvBertModel",
    "TFConvNextForImageClassification",
    "TFConvNextModel",
+    "TFConvNextV2Model",  # Parsing issue. Equivalent to PT ConvNextV2Model, see PR #25558
+    "TFConvNextV2ForImageClassification",
    "TFCvtForImageClassification",
    "TFCvtModel",
    "TFDPRReader",