From 8f2cc1c3ab9fac0b49c6c9372e6557532a607024 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Thu, 23 Dec 2021 17:19:44 +0100
Subject: [PATCH] Add TFCLIPModel (#13967)

* Start the work for TFCLIPModel

* Convert to TF code (TODO: loss + doc)

* Clean up

* Fix pooled_output for TFCLIPTextTransformer - using tf.gather_nd

* assert -> raise error

* Expose TFCLIPModel

* Deal with dummy_inputs

* Add tests

* Fix all tests. TODO: manual check weight loading + add more comments

* Fix pt tf equivalence test

* fixes

* update TFCLIPVisionEmbeddings's Conv2D

* Fix loss + overwrite test_pt_tf_model_equivalence from common

* Add a comment about the change about MainLayer in test_keras_save_load

* Set return_loss=True in TFCLIPModelTester + make tests pass

* overwrite test_pt_tf_model_equivalence from tf common

* fix base_model_prefix

* Fix examples

* remove unused

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply review suggestions

* change self.pre_layrnorm to self.pre_layernorm

* apply more review suggestions

* return attention probs before dropout (to align with PT)

* fix weight init

* fix

* build doc

* fix missing doc

* fix for test

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/index.mdx                         |    2 +-
 docs/source/model_doc/clip.mdx                |   17 +
 src/transformers/__init__.py                  |   16 +
 src/transformers/activations_tf.py            |    7 +
 .../models/auto/modeling_tf_auto.py           |    1 +
 src/transformers/models/clip/__init__.py      |   19 +
 .../models/clip/modeling_tf_clip.py           | 1506 +++++++++++++++++
 src/transformers/utils/dummy_tf_objects.py    |   51 +
 tests/test_modeling_clip.py                   |  143 ++
 tests/test_modeling_tf_clip.py                |  659 ++++++++
 tests/test_modeling_tf_common.py              |   25 +-
 tests/test_modeling_tf_vit.py                 |    4 +-
 utils/check_repo.py                           |    2 +
 13 files changed, 2448 insertions(+), 4 deletions(-)
 create mode 100644 src/transformers/models/clip/modeling_tf_clip.py
 create mode 100644 tests/test_modeling_tf_clip.py

diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 1609228aa0..112e84b825 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -202,7 +202,7 @@ Flax), PyTorch, and/or TensorFlow.
 |       BlenderbotSmall       |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |           Canine            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
+|            CLIP             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |           DeBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
diff --git a/docs/source/model_doc/clip.mdx b/docs/source/model_doc/clip.mdx
index 35685c040a..2bbc6f1f8e 100644
--- a/docs/source/model_doc/clip.mdx
+++ b/docs/source/model_doc/clip.mdx
@@ -125,6 +125,23 @@ This model was contributed by [valhalla](https://huggingface.co/valhalla). The o
 [[autodoc]] CLIPVisionModel
     - forward
 
+## TFCLIPModel
+
+[[autodoc]] TFCLIPModel
+    - call
+    - get_text_features
+    - get_image_features
+
+## TFCLIPTextModel
+
+[[autodoc]] TFCLIPTextModel
+    - call
+
+## TFCLIPVisionModel
+
+[[autodoc]] TFCLIPVisionModel
+    - call
+
 ## FlaxCLIPModel
 
 [[autodoc]] FlaxCLIPModel
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 80b077a648..a0044f849c 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1549,6 +1549,15 @@ if is_tf_available():
             "TFCamembertModel",
         ]
     )
+    _import_structure["models.clip"].extend(
+        [
+            "TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFCLIPModel",
+            "TFCLIPPreTrainedModel",
+            "TFCLIPTextModel",
+            "TFCLIPVisionModel",
+        ]
+    )
     _import_structure["models.convbert"].extend(
         [
             "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3394,6 +3403,13 @@ if TYPE_CHECKING:
             TFCamembertForTokenClassification,
             TFCamembertModel,
         )
+        from .models.clip import (
+            TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFCLIPModel,
+            TFCLIPPreTrainedModel,
+            TFCLIPTextModel,
+            TFCLIPVisionModel,
+        )
         from .models.convbert import (
             TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFConvBertForMaskedLM,
diff --git a/src/transformers/activations_tf.py b/src/transformers/activations_tf.py
index e0cefc323c..5c40803f25 100644
--- a/src/transformers/activations_tf.py
+++ b/src/transformers/activations_tf.py
@@ -63,6 +63,12 @@ def gelu_fast(x):
     return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))
 
 
+def quick_gelu(x):
+    x = tf.convert_to_tensor(x)
+    coeff = tf.cast(1.702, x.dtype)
+    return x * tf.math.sigmoid(coeff * x)
+
+
 if version.parse(tf.version.VERSION) >= version.parse("2.4"):
 
     def approximate_gelu_wrap(x):
@@ -84,6 +90,7 @@ ACT2FN = {
     "mish": mish,
     "tanh": tf.keras.activations.tanh,
     "gelu_fast": gelu_fast,
+    "quick_gelu": quick_gelu,
 }
 
 
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index d32abfdd8a..12bcd2e296 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -29,6 +29,7 @@ logger = logging.get_logger(__name__)
 TF_MODEL_MAPPING_NAMES = OrderedDict(
     [
         # Base model mapping
+        ("clip", "TFCLIPModel"),
         ("deberta-v2", "TFDebertaV2Model"),
         ("deberta", "TFDebertaModel"),
         ("rembert", "TFRemBertModel"),
diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py
index 78b7d8a599..310f22cfc6 100644
--- a/src/transformers/models/clip/__init__.py
+++ b/src/transformers/models/clip/__init__.py
@@ -20,6 +20,7 @@ from typing import TYPE_CHECKING
 from ...file_utils import (
     _LazyModule,
     is_flax_available,
+    is_tf_available,
     is_tokenizers_available,
     is_torch_available,
     is_vision_available,
@@ -47,6 +48,15 @@ if is_torch_available():
         "CLIPVisionModel",
     ]
 
+if is_tf_available():
+    _import_structure["modeling_tf_clip"] = [
+        "TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "TFCLIPModel",
+        "TFCLIPPreTrainedModel",
+        "TFCLIPTextModel",
+        "TFCLIPVisionModel",
+    ]
+
 if is_flax_available():
     _import_structure["modeling_flax_clip"] = [
         "FlaxCLIPModel",
@@ -78,6 +88,15 @@ if TYPE_CHECKING:
             CLIPVisionModel,
         )
 
+    if is_tf_available():
+        from .modeling_tf_clip import (
+            TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFCLIPModel,
+            TFCLIPPreTrainedModel,
+            TFCLIPTextModel,
+            TFCLIPVisionModel,
+        )
+
     if is_flax_available():
         from .modeling_flax_clip import (
             FlaxCLIPModel,
diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py
new file mode 100644
index 0000000000..918ecdd1be
--- /dev/null
+++ b/src/transformers/models/clip/modeling_tf_clip.py
@@ -0,0 +1,1506 @@
+# coding=utf-8
+# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 CLIP model. """
+
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...file_utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
+
+# Public API
+from ...modeling_tf_utils import (
+    DUMMY_INPUTS,
+    TFModelInputType,
+    TFPreTrainedModel,
+    get_initializer,
+    input_processing,
+    keras_serializable,
+    shape_list,
+)
+from ...utils import logging
+from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"
+
+TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "openai/clip-vit-base-patch32",
+    # See all CLIP models at https://huggingface.co/models?filter=clip
+]
+
+
+LARGE_NEGATIVE = -1e8
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
+def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    src_len = shape_list(mask)[1]
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    one_cst = tf.constant(1.0)
+    mask = tf.cast(mask, dtype=one_cst.dtype)
+    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
+
+    return (one_cst - expanded_mask) * LARGE_NEGATIVE
+
+
+# contrastive loss function, adapted from
+# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
+def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
+    return tf.math.reduce_mean(
+        tf.keras.metrics.sparse_categorical_crossentropy(
+            y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
+        )
+    )
+
+
+def clip_loss(similarity: tf.Tensor) -> tf.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(tf.transpose(similarity))
+    return (caption_loss + image_loss) / 2.0
+
+
+@dataclass
+class TFCLIPOutput(ModelOutput):
+    """
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+            Contrastive loss for image-text similarity.
+        logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
+            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the
+            image-text similarity scores.
+        logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
+            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the
+            text-image similarity scores.
+        text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`):
+            The text embeddings obtained by applying the projection layer to the pooled output of
+            [`TFCLIPTextModel`].
+        image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`):
+            The image embeddings obtained by applying the projection layer to the pooled output of
+            [`TFCLIPVisionModel`].
+        text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
+            The output of the [`TFCLIPTextModel`].
+        vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
+            The output of the [`TFCLIPVisionModel`].
+    """
+
+    loss: Optional[tf.Tensor] = None
+    logits_per_image: tf.Tensor = None
+    logits_per_text: tf.Tensor = None
+    text_embeds: tf.Tensor = None
+    image_embeds: tf.Tensor = None
+    text_model_output: TFBaseModelOutputWithPooling = None
+    vision_model_output: TFBaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> Tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, config: CLIPVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+
+        self.config = config
+
+        self.patch_embedding = tf.keras.layers.Conv2D(
+            filters=self.embed_dim,
+            kernel_size=self.patch_size,
+            strides=self.patch_size,
+            padding="valid",
+            data_format="channels_last",
+            use_bias=False,
+            kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor),
+            name="patch_embedding",
+        )
+
+    def build(self, input_shape: tf.TensorShape):
+
+        factor = self.config.initializer_factor
+
+        self.class_embedding = self.add_weight(
+            shape=(self.embed_dim,),
+            initializer=get_initializer(self.embed_dim ** -0.5 * factor),
+            trainable=True,
+            name="class_embedding",
+        )
+
+        with tf.name_scope("position_embedding"):
+            self.position_embedding = self.add_weight(
+                shape=(self.num_positions, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_range * factor),
+                trainable=True,
+                name="embeddings",
+            )
+
+        super().build(input_shape)
+
+    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+        """`pixel_values` is expected to be of NCHW format."""
+
+        batch_size, num_channels, height, width = shape_list(pixel_values)
+
+        # When running on CPU, `tf.nn.conv2d` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        patch_embeds = self.patch_embedding(pixel_values)
+
+        # Change the 2D spatial dimensions to a single temporal dimension.
+        # shape = (batch_size, num_patches, out_channels=embed_dim)
+        patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1))
+
+        # add the [CLS] token to the embedded patch tokens
+        class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim))
+        embeddings = tf.concat((class_embeds, patch_embeds), axis=1)
+
+        embeddings = embeddings + self.position_embedding
+
+        return embeddings
+
+
+class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, config: CLIPTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.vocab_size = config.vocab_size
+
+        self.config = config
+
+    def build(self, input_shape: tf.TensorShape):
+
+        with tf.name_scope("token_embedding"):
+            self.weight = self.add_weight(
+                shape=(self.vocab_size, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+                trainable=True,
+                name="weight",
+            )
+
+        with tf.name_scope("position_embedding"):
+            self.position_embedding = self.add_weight(
+                shape=(self.config.max_position_embeddings, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+                trainable=True,
+                name="embeddings",
+            )
+
+        super().build(input_shape)
+
+    def call(
+        self,
+        input_ids: tf.Tensor = None,
+        position_ids: tf.Tensor = None,
+        inputs_embeds: tf.Tensor = None,
+    ) -> tf.Tensor:
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
+        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
+        final_embeddings = inputs_embeds + position_embeds
+
+        return final_embeddings
+
+
+class TFCLIPAttention(tf.keras.layers.Layer):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: CLIPConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = self.embed_dim // self.num_attention_heads
+        if self.attention_head_size * self.num_attention_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_attention_heads})."
+            )
+
+        factor = config.initializer_factor
+        in_proj_std = (self.embed_dim ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
+        out_proj_std = (self.embed_dim ** -0.5) * factor
+
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+        self.q_proj = tf.keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
+        )
+        self.k_proj = tf.keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
+        )
+        self.v_proj = tf.keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
+        )
+
+        self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout)
+
+        self.out_proj = tf.keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
+        )
+
+    # copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        causal_attention_mask: tf.Tensor,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> Tuple[tf.Tensor]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size = shape_list(hidden_states)[0]
+        mixed_query_layer = self.q_proj(inputs=hidden_states)
+        mixed_key_layer = self.k_proj(inputs=hidden_states)
+        mixed_value_layer = self.v_proj(inputs=hidden_states)
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
+        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+        attention_scores = tf.divide(attention_scores, dk)
+
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            # Apply the causal attention mask (precomputed for all layers in TFCLIPModel call() function)
+            attention_scores = tf.add(attention_scores, causal_attention_mask)
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in TFCLIPModel call() function)
+            attention_scores = tf.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        _attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=_attention_probs, training=training)
+
+        attention_output = tf.matmul(attention_probs, value_layer)
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+        # (batch_size, seq_len_q, embed_dim)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
+
+        attention_output = self.out_proj(attention_output, training=training)
+        # In TFBert, attention weights are returned after dropout.
+        # However, in CLIP, they are returned before dropout.
+        outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
+
+        return outputs
+
+
+class TFCLIPMLP(tf.keras.layers.Layer):
+    def __init__(self, config: CLIPConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.activation_fn = get_tf_activation(config.hidden_act)
+
+        factor = config.initializer_factor
+        in_proj_std = (config.hidden_size ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
+        fc_std = (2 * config.hidden_size) ** -0.5 * factor
+
+        self.fc1 = tf.keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
+        )
+        self.fc2 = tf.keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
+        )
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+
+        hidden_states = self.fc1(inputs=hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(inputs=hidden_states)
+        return hidden_states
+
+
+class TFCLIPEncoderLayer(tf.keras.layers.Layer):
+    def __init__(self, config: CLIPConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = TFCLIPAttention(config, name="self_attn")
+        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
+        self.mlp = TFCLIPMLP(config, name="mlp")
+        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        causal_attention_mask: tf.Tensor,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> Tuple[tf.Tensor]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            causal_attention_mask (`tf.Tensor`): causal attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`):
+                Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
+                tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(inputs=hidden_states)
+        attention_outputs = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        hidden_states = attention_outputs[0]
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(inputs=hidden_states)
+        hidden_states = self.mlp(hidden_states=hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,) + attention_outputs[1:]  # add attentions if we output them
+
+        return outputs
+
+
+class TFCLIPEncoder(tf.keras.layers.Layer):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`TFCLIPEncoderLayer`].
+
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(self, config: CLIPConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        causal_attention_mask: tf.Tensor,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                causal_attention_mask=causal_attention_mask,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
+
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+
+
+class TFCLIPTextTransformer(tf.keras.layers.Layer):
+    def __init__(self, config: CLIPTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
+        self.encoder = TFCLIPEncoder(config, name="encoder")
+        self.final_layer_norm = tf.keras.layers.LayerNormalization(
+            epsilon=config.layer_norm_eps, name="final_layer_norm"
+        )
+
+    def call(
+        self,
+        input_ids: TFModelInputType,
+        attention_mask: tf.Tensor,
+        position_ids: tf.Tensor,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        input_shape = shape_list(input_ids)
+
+        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        batch_size, seq_length = input_shape
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)
+
+        # check attention mask and invert
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        attention_mask = _expand_mask(attention_mask)
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.final_layer_norm(inputs=sequence_output)
+
+        # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        pooled_output = tf.gather_nd(
+            params=sequence_output,
+            indices=tf.stack(
+                values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
+            ),
+        )
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
+
+        diag = tf.constant(0.0, shape=(seq_length,), dtype=dtype)
+
+        # set an additive 2D attention mask with all places being masked
+        to_mask = tf.constant(-10000.0, shape=(seq_length, seq_length), dtype=dtype)
+
+        # set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
+        # TIP: think the 2D matrix as the space of (query_seq, key_seq)
+        to_mask = tf.linalg.band_part(to_mask, 0, -1)
+        # to_mask = tf.linalg.band_part(to_mask, -1, 0)
+        to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)
+
+        return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
+
+
+@keras_serializable
+class TFCLIPTextMainLayer(tf.keras.layers.Layer):
+    config_class = CLIPTextConfig
+
+    def __init__(self, config: CLIPTextConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.text_model = TFCLIPTextTransformer(config, name="text_model")
+
+    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+        return self.text_model.embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        self.text_model.embeddings.weight = value
+        self.text_model.embeddings.vocab_size = shape_list(value)[0]
+
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if inputs["input_ids"] is None:
+            raise ValueError("You have to specify either input_ids")
+
+        input_shape = shape_list(inputs["input_ids"])
+
+        if inputs["attention_mask"] is None:
+            inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
+
+        text_model_outputs = self.text_model(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            position_ids=inputs["position_ids"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        return text_model_outputs
+
+
+class TFCLIPVisionTransformer(tf.keras.layers.Layer):
+    def __init__(self, config: CLIPVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
+        self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
+        self.encoder = TFCLIPEncoder(config, name="encoder")
+        self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
+
+    def call(
+        self,
+        pixel_values: TFModelInputType,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+
+        embedding_output = self.embeddings(pixel_values=pixel_values)
+        embedding_output = self.pre_layernorm(inputs=embedding_output)
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=None,
+            causal_attention_mask=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = sequence_output[:, 0, :]
+        pooled_output = self.post_layernorm(inputs=pooled_output)
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@keras_serializable
+class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
+    config_class = CLIPVisionConfig
+
+    def __init__(self, config: CLIPVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")
+
+    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+        return self.vision_model.embeddings
+
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
+        vision_model_outputs = self.vision_model(
+            pixel_values=inputs["pixel_values"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        return vision_model_outputs
+
+
+@keras_serializable
+class TFCLIPMainLayer(tf.keras.layers.Layer):
+    config_class = CLIPConfig
+
+    def __init__(self, config: CLIPConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        if not isinstance(config.text_config, CLIPTextConfig):
+            raise ValueError(
+                f"config.text_config is expected to be of type CLIPTextConfig but is of type {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise ValueError(
+                f"config.vision_config is expected to be of type CLIPVisionConfig but is of type {type(config.vision_config)}."
+            )
+
+        self.config = config
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+
+        self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
+        self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")
+
+        self.visual_projection = tf.keras.layers.Dense(
+            units=self.projection_dim,
+            kernel_initializer=get_initializer(vision_config.hidden_size ** -0.5 * self.config.initializer_factor),
+            use_bias=False,
+            name="visual_projection",
+        )
+
+        self.text_projection = tf.keras.layers.Dense(
+            units=self.projection_dim,
+            kernel_initializer=get_initializer(text_config.hidden_size ** -0.5 * self.config.initializer_factor),
+            use_bias=False,
+            name="text_projection",
+        )
+
+    def build(self, input_shape: tf.TensorShape):
+
+        self.logit_scale = self.add_weight(
+            shape=(1,),
+            initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
+            trainable=True,
+            name="logit_scale",
+        )
+
+        super().build(input_shape)
+
+    def get_text_features(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> tf.Tensor:
+        inputs = input_processing(
+            func=self.get_text_features,
+            config=self.config,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if inputs["input_ids"] is None:
+            raise ValueError("You have to specify either input_ids")
+
+        input_shape = shape_list(inputs["input_ids"])
+
+        if inputs["attention_mask"] is None:
+            inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
+
+        text_outputs = self.text_model(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            position_ids=inputs["position_ids"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        pooled_output = text_outputs[1]
+        text_features = self.text_projection(inputs=pooled_output)
+
+        return text_features
+
+    def get_image_features(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> tf.Tensor:
+        inputs = input_processing(
+            func=self.get_image_features,
+            config=self.config,
+            input_ids=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
+        vision_outputs = self.vision_model(
+            pixel_values=inputs["pixel_values"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.visual_projection(inputs=pooled_output)
+
+        return image_features
+
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        pixel_values: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            return_loss=return_loss,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if inputs["input_ids"] is None:
+            raise ValueError("You have to specify either input_ids")
+        if inputs["pixel_values"] is None:
+            raise ValueError("You have to specify pixel_values")
+
+        input_shape = shape_list(inputs["input_ids"])
+
+        if inputs["attention_mask"] is None:
+            inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
+
+        vision_outputs = self.vision_model(
+            pixel_values=inputs["pixel_values"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        text_outputs = self.text_model(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            position_ids=inputs["position_ids"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        image_embeds = vision_outputs[1]
+        image_embeds = self.visual_projection(inputs=image_embeds)
+
+        text_embeds = text_outputs[1]
+        text_embeds = self.text_projection(inputs=text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True)
+        text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True)
+
+        # cosine similarity as logits
+        logit_scale = tf.math.exp(self.logit_scale)
+        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
+        logits_per_image = tf.transpose(logits_per_text)
+
+        loss = None
+        if inputs["return_loss"]:
+            loss = clip_loss(logits_per_text)
+
+        if not inputs["return_dict"]:
+            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+            return (loss,) + output if loss is not None else output
+
+        return TFCLIPOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+class TFCLIPPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = CLIPConfig
+    base_model_prefix = "clip"
+
+
+CLIP_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the
+    generic methods the library implements for all its model (such as downloading or saving, resizing the input
+    embeddings, pruning heads etc.)
+
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
+    it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
+    and behavior.
+
+    <Tip>
+
+    TF 2.0 models accepts two formats as inputs:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional arguments.
+
+    This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all
+    the tensors in the first argument of the model call function: `model(inputs)`.
+
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+    the first positional argument :
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    </Tip>
+
+    Args:
+        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the
+            model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
+            argument can be used in eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False``):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
+            argument can be used in eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False``):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
+            [`CLIPFeatureExtractor.__call__`] for details.
+        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
+            argument can be used in eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False``):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+class TFCLIPTextModel(TFCLIPPreTrainedModel):
+    config_class = CLIPTextConfig
+
+    def __init__(self, config: CLIPTextConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.clip = TFCLIPTextMainLayer(config, name="clip")
+
+    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPTextConfig)
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, TFCLIPTextModel
+
+        >>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="tf")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+        ```"""
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+        outputs = self.clip(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            position_ids=inputs["position_ids"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        return outputs
+
+    def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
+        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=output.last_hidden_state,
+            pooler_output=output.pooler_output,
+            hidden_states=hs,
+            attentions=attns,
+        )
+
+
+class TFCLIPVisionModel(TFCLIPPreTrainedModel):
+    config_class = CLIPVisionConfig
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.clip = TFCLIPVisionMainLayer(config, name="clip")
+
+    @property
+    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            `Dict[str, tf.Tensor]`: The dummy inputs.
+        """
+        VISION_DUMMY_INPUTS = tf.random.uniform(
+            shape=(len(DUMMY_INPUTS), 3, self.config.image_size, self.config.image_size), dtype=tf.float32
+        )
+        return {"pixel_values": VISION_DUMMY_INPUTS}
+
+    @tf.function(
+        input_signature=[
+            {
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+            }
+        ]
+    )
+    def serving(self, inputs):
+        """
+        Method used for serving the model.
+
+        Args:
+            inputs (`Dict[str, tf.Tensor]`):
+                The input of the saved model as a dictionary of tensors.
+        """
+        output = self.call(inputs)
+
+        return self.serving_output(output)
+
+    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+    def call(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, TFCLIPVisionModel
+
+        >>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="tf")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output # pooled CLS states
+        ```"""
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        outputs = self.clip(
+            pixel_values=inputs["pixel_values"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        return outputs
+
+    def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
+        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=output.last_hidden_state,
+            pooler_output=output.pooler_output,
+            hidden_states=hs,
+            attentions=attns,
+        )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class TFCLIPModel(TFCLIPPreTrainedModel):
+    config_class = CLIPConfig
+
+    def __init__(self, config: CLIPConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.clip = TFCLIPMainLayer(config, name="clip")
+
+    @property
+    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            `Dict[str, tf.Tensor]`: The dummy inputs.
+        """
+        VISION_DUMMY_INPUTS = tf.random.uniform(
+            shape=(len(DUMMY_INPUTS), 3, self.config.vision_config.image_size, self.config.vision_config.image_size),
+            dtype=tf.float32,
+        )
+        return {
+            "input_ids": tf.constant(DUMMY_INPUTS, dtype=tf.int32),
+            "pixel_values": VISION_DUMMY_INPUTS,
+        }
+
+    @tf.function(
+        input_signature=[
+            {
+                "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+                "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
+            }
+        ]
+    )
+    def serving(self, inputs):
+        """
+        Method used for serving the model.
+
+        Args:
+            inputs (`Dict[str, tf.Tensor]`):
+                The input of the saved model as a dictionary of tensors.
+        """
+        output = self.call(inputs)
+
+        return self.serving_output(output)
+
+    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def get_text_features(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> tf.Tensor:
+        r"""
+        Returns:
+            text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`TFCLIPTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, TFCLIPModel
+
+        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="tf")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        inputs = input_processing(
+            func=self.get_text_features,
+            config=self.config,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        text_features = self.clip.get_text_features(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            position_ids=inputs["position_ids"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+        )
+
+        return text_features
+
+    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+    def get_image_features(
+        self,
+        pixel_values: Optional[TFModelInputType] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> tf.Tensor:
+        r"""
+        Returns:
+            image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+            applying the projection layer to the pooled output of [`TFCLIPVisionModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, TFCLIPModel
+
+        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="tf")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```"""
+        inputs = input_processing(
+            func=self.get_image_features,
+            config=self.config,
+            input_ids=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if "input_ids" in inputs:
+            inputs["pixel_values"] = inputs.pop("input_ids")
+
+        image_features = self.clip.get_image_features(
+            pixel_values=inputs["pixel_values"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+        )
+
+        return image_features
+
+    @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFCLIPOutput, config_class=CLIPConfig)
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        pixel_values: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, TFCLIPModel
+
+        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True)
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+        >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+        ```"""
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            return_loss=return_loss,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        outputs = self.clip(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            attention_mask=inputs["attention_mask"],
+            position_ids=inputs["position_ids"],
+            return_loss=inputs["return_loss"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+        )
+
+        return outputs
+
+    def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
+        return output
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index ca564457a2..2e1d43ec40 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -704,6 +704,57 @@ class TFCamembertModel:
         requires_backends(self, ["tf"])
 
 
+TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TFCLIPModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["tf"])
+
+    def call(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFCLIPPreTrainedModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["tf"])
+
+    def call(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFCLIPTextModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["tf"])
+
+    def call(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFCLIPVisionModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["tf"])
+
+    def call(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
 TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/tests/test_modeling_clip.py b/tests/test_modeling_clip.py
index b5e960bea2..e7de75fc8e 100644
--- a/tests/test_modeling_clip.py
+++ b/tests/test_modeling_clip.py
@@ -29,6 +29,7 @@ from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import (
     is_flax_available,
     is_pt_flax_cross_test,
+    is_pt_tf_cross_test,
     require_torch,
     require_vision,
     slow,
@@ -581,6 +582,148 @@ class CLIPModelTest(ModelTesterMixin, unittest.TestCase):
 
             self.assertTrue(models_equal)
 
+    # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput
+    @is_pt_tf_cross_test
+    def test_pt_tf_model_equivalence(self):
+        import numpy as np
+        import tensorflow as tf
+
+        import transformers
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            tf_model_class_name = "TF" + model_class.__name__  # Add the "TF" at the beginning
+
+            if not hasattr(transformers, tf_model_class_name):
+                # transformers does not have TF version yet
+                return
+
+            tf_model_class = getattr(transformers, tf_model_class_name)
+
+            config.output_hidden_states = True
+
+            tf_model = tf_model_class(config)
+            pt_model = model_class(config)
+
+            # make sure only tf inputs are forward that actually exist in function args
+            tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys())
+
+            # remove all head masks
+            tf_input_keys.discard("head_mask")
+            tf_input_keys.discard("cross_attn_head_mask")
+            tf_input_keys.discard("decoder_head_mask")
+
+            pt_inputs = self._prepare_for_class(inputs_dict, model_class)
+            pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys}
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            tf_inputs_dict = {}
+            for key, tensor in pt_inputs.items():
+                # skip key that does not exist in tf
+                if type(tensor) == bool:
+                    tf_inputs_dict[key] = tensor
+                elif key == "input_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
+                elif key == "pixel_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
+                else:
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+            tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict)
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # need to rename encoder-decoder "inputs" for PyTorch
+            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs)
+            tfo = tf_model(tf_inputs_dict, training=False)
+
+            self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
+            for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
+
+                if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
+                    continue
+
+                tf_out = tf_output.numpy()
+                pt_out = pt_output.numpy()
+
+                self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
+
+                if len(tf_out.shape) > 0:
+
+                    tf_nans = np.copy(np.isnan(tf_out))
+                    pt_nans = np.copy(np.isnan(pt_out))
+
+                    pt_out[tf_nans] = 0
+                    tf_out[tf_nans] = 0
+                    pt_out[pt_nans] = 0
+                    tf_out[pt_nans] = 0
+
+                max_diff = np.amax(np.abs(tf_out - pt_out))
+                self.assertLessEqual(max_diff, 4e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            tf_inputs_dict = {}
+            for key, tensor in pt_inputs.items():
+                # skip key that does not exist in tf
+                if type(tensor) == bool:
+                    tensor = np.array(tensor, dtype=bool)
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32)
+                elif key == "input_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
+                elif key == "pixel_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
+                else:
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
+
+            # need to rename encoder-decoder "inputs" for PyTorch
+            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs)
+
+            tfo = tf_model(tf_inputs_dict)
+
+            self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
+            for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
+
+                if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
+                    continue
+
+                tf_out = tf_output.numpy()
+                pt_out = pt_output.numpy()
+
+                self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
+
+                if len(tf_out.shape) > 0:
+                    tf_nans = np.copy(np.isnan(tf_out))
+                    pt_nans = np.copy(np.isnan(pt_out))
+
+                    pt_out[tf_nans] = 0
+                    tf_out[tf_nans] = 0
+                    pt_out[pt_nans] = 0
+                    tf_out[pt_nans] = 0
+
+                max_diff = np.amax(np.abs(tf_out - pt_out))
+                self.assertLessEqual(max_diff, 4e-2)
+
     # overwrite from common since FlaxCLIPModel returns nested output
     # which is not supported in the common test
     @is_pt_flax_cross_test
diff --git a/tests/test_modeling_tf_clip.py b/tests/test_modeling_tf_clip.py
new file mode 100644
index 0000000000..59ef2c49d0
--- /dev/null
+++ b/tests/test_modeling_tf_clip.py
@@ -0,0 +1,659 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the TensorFlow CLIP model. """
+
+
+import inspect
+import os
+import tempfile
+import unittest
+from importlib import import_module
+
+import requests
+from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+from transformers.file_utils import is_tf_available, is_vision_available
+from transformers.testing_utils import is_pt_tf_cross_test, require_tf, require_vision, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+
+
+if is_tf_available():
+    import numpy as np
+    import tensorflow as tf
+
+    from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings
+    from transformers.models.clip.modeling_tf_clip import TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import CLIPProcessor
+
+
+class TFCLIPVisionModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=30,
+        patch_size=2,
+        num_channels=3,
+        is_training=True,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def get_config(self):
+        return CLIPVisionConfig(
+            image_size=self.image_size,
+            patch_size=self.patch_size,
+            num_channels=self.num_channels,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, pixel_values):
+        model = TFCLIPVisionModel(config=config)
+        result = model(pixel_values, training=False)
+        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_tf
+class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
+    attention_mask and seq_length.
+    """
+
+    all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else ()
+
+    test_pruning = False
+    test_resize_embeddings = False
+    test_head_masking = False
+    test_onnx = False
+
+    def setUp(self):
+        self.model_tester = TFCLIPVisionModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_inputs_embeds(self):
+        # CLIP does not use inputs_embeds
+        pass
+
+    def test_graph_mode_with_inputs_embeds(self):
+        # CLIP does not use inputs_embeds
+        pass
+
+    def test_model_common_attributes(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.call)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.model_tester.image_size, self.model_tester.image_size)
+        patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        seq_len = num_patches + 1
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class(config)
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            out_len = len(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+
+            added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs.attentions
+
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, seq_len, seq_len],
+            )
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+            expected_num_layers = getattr(
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+            )
+            self.assertEqual(len(hidden_states), expected_num_layers)
+
+            # CLIP has a different seq_length
+            image_size = (self.model_tester.image_size, self.model_tester.image_size)
+            patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
+            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+            seq_length = num_patches + 1
+
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [seq_length, self.model_tester.hidden_size],
+            )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFCLIPVisionModel.from_pretrained(model_name, from_pt=True)
+            self.assertIsNotNone(model)
+
+
+class TFCLIPTextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        config = self.get_config()
+
+        return config, input_ids, input_mask
+
+    def get_config(self):
+        return CLIPTextConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            max_position_embeddings=self.max_position_embeddings,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, input_ids, input_mask):
+        model = TFCLIPTextModel(config=config)
+        result = model(input_ids, attention_mask=input_mask, training=False)
+        result = model(input_ids, training=False)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, input_mask = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_tf
+class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
+
+    all_model_classes = (TFCLIPTextModel,) if is_tf_available() else ()
+    test_pruning = False
+    test_head_masking = False
+    test_onnx = False
+
+    def setUp(self):
+        self.model_tester = TFCLIPTextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_inputs_embeds(self):
+        # CLIP does not use inputs_embeds
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFCLIPTextModel.from_pretrained(model_name, from_pt=True)
+            self.assertIsNotNone(model)
+
+
+class TFCLIPModelTester:
+    def __init__(self, parent, is_training=True):
+        self.parent = parent
+        self.text_model_tester = TFCLIPTextModelTester(parent)
+        self.vision_model_tester = TFCLIPVisionModelTester(parent)
+        self.is_training = is_training
+
+    def prepare_config_and_inputs(self):
+        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
+        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
+
+        config = self.get_config()
+
+        return config, input_ids, attention_mask, pixel_values
+
+    def get_config(self):
+        return CLIPConfig.from_text_vision_configs(
+            self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
+        )
+
+    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
+        model = TFCLIPModel(config)
+        result = model(input_ids, pixel_values, attention_mask, training=False)
+        self.parent.assertEqual(
+            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
+        )
+        self.parent.assertEqual(
+            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, pixel_values = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": pixel_values,
+            "return_loss": True,
+        }
+        return config, inputs_dict
+
+
+@require_tf
+class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase):
+    all_model_classes = (TFCLIPModel,) if is_tf_available() else ()
+    test_head_masking = False
+    test_pruning = False
+    test_resize_embeddings = False
+    test_attention_outputs = False
+    test_onnx = False
+
+    def setUp(self):
+        self.model_tester = TFCLIPModelTester(self)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    # hidden_states are tested in individual model tests
+    def test_hidden_states_output(self):
+        pass
+
+    # input_embeds are tested in individual model tests
+    def test_inputs_embeds(self):
+        pass
+
+    # CLIPModel does not have input/output embeddings
+    def test_model_common_attributes(self):
+        pass
+
+    # overwrite from common since `TFCLIPModelTester` set `return_loss` to `True` and causes the preparation of
+    # `symbolic_inputs` failed.
+    def test_keras_save_load(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # remove `return_loss` to make code work
+        if self.__class__.__name__ == "TFCLIPModelTest":
+            inputs_dict.pop("return_loss", None)
+
+        tf_main_layer_classes = set(
+            module_member
+            for model_class in self.all_model_classes
+            for module in (import_module(model_class.__module__),)
+            for module_member_name in dir(module)
+            if module_member_name.endswith("MainLayer")
+            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
+            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
+            for module_member in (getattr(module, module_member_name),)
+            if isinstance(module_member, type)
+            and tf.keras.layers.Layer in module_member.__bases__
+            and getattr(module_member, "_keras_serializable", False)
+        )
+        for main_layer_class in tf_main_layer_classes:
+            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
+            if "T5" in main_layer_class.__name__:
+                # Take the same values than in TFT5ModelTester for this shared layer
+                shared = TFSharedEmbeddings(99, 32, name="shared")
+                config.use_cache = inputs_dict.pop("use_cache", None)
+                main_layer = main_layer_class(config, embed_tokens=shared)
+            else:
+                main_layer = main_layer_class(config)
+
+            symbolic_inputs = {
+                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
+            }
+
+            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
+            outputs = model(inputs_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                filepath = os.path.join(tmpdirname, "keras_model.h5")
+                model.save(filepath)
+                if "T5" in main_layer_class.__name__:
+                    model = tf.keras.models.load_model(
+                        filepath,
+                        custom_objects={
+                            main_layer_class.__name__: main_layer_class,
+                            "TFSharedEmbeddings": TFSharedEmbeddings,
+                        },
+                    )
+                else:
+                    model = tf.keras.models.load_model(
+                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+                    )
+                assert isinstance(model, tf.keras.Model)
+                after_outputs = model(inputs_dict)
+                self.assert_outputs_same(after_outputs, outputs)
+
+    # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput
+    @is_pt_tf_cross_test
+    def test_pt_tf_model_equivalence(self):
+        import torch
+
+        import transformers
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
+            pt_model_class = getattr(transformers, pt_model_class_name)
+
+            config.output_hidden_states = True
+
+            tf_model = model_class(config)
+            pt_model = pt_model_class(config)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+
+            tf_model = transformers.load_pytorch_model_in_tf2_model(
+                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
+            )
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    pt_inputs_dict[name] = key
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "pixel_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+
+            # need to rename encoder-decoder "inputs" for PyTorch
+            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
+
+            self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
+            for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
+
+                if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
+                    continue
+
+                tf_out = tf_output.numpy()
+                pt_out = pt_output.numpy()
+
+                self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
+
+                if len(tf_out.shape) > 0:
+
+                    tf_nans = np.copy(np.isnan(tf_out))
+                    pt_nans = np.copy(np.isnan(pt_out))
+
+                    pt_out[tf_nans] = 0
+                    tf_out[tf_nans] = 0
+                    pt_out[pt_nans] = 0
+                    tf_out[pt_nans] = 0
+
+                max_diff = np.amax(np.abs(tf_out - pt_out))
+                self.assertLessEqual(max_diff, 4e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    key = np.array(key, dtype=bool)
+                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                elif name == "pixel_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+            # need to rename encoder-decoder "inputs" for PyTorch
+            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
+
+            self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
+            for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
+
+                if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
+                    continue
+
+                tf_out = tf_output.numpy()
+                pt_out = pt_output.numpy()
+
+                self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")
+
+                if len(tf_out.shape) > 0:
+                    tf_nans = np.copy(np.isnan(tf_out))
+                    pt_nans = np.copy(np.isnan(pt_out))
+
+                    pt_out[tf_nans] = 0
+                    tf_out[tf_nans] = 0
+                    pt_out[pt_nans] = 0
+                    tf_out[pt_nans] = 0
+
+                max_diff = np.amax(np.abs(tf_out - pt_out))
+                self.assertLessEqual(max_diff, 4e-2)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFCLIPModel.from_pretrained(model_name, from_pt=True)
+            self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    return im
+
+
+@require_vision
+class TFCLIPModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference(self):
+        model_name = "openai/clip-vit-base-patch32"
+        model = TFCLIPModel.from_pretrained(model_name, from_pt=True)
+        processor = CLIPProcessor.from_pretrained(model_name)
+
+        image = prepare_img()
+        inputs = processor(
+            text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
+        )
+
+        outputs = model(**inputs, training=False)
+
+        # verify the logits
+        self.assertEqual(
+            outputs.logits_per_image.shape,
+            tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+        )
+        self.assertEqual(
+            outputs.logits_per_text.shape,
+            tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+        )
+
+        expected_logits = tf.constant([[24.5701, 19.3049]])
+
+        tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index fd7a6f9069..074e8174d2 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -282,6 +282,8 @@ class TFModelTesterMixin:
             for module in (import_module(model_class.__module__),)
             for module_member_name in dir(module)
             if module_member_name.endswith("MainLayer")
+            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
+            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
             for module_member in (getattr(module, module_member_name),)
             if isinstance(module_member, type)
             and tf.keras.layers.Layer in module_member.__bases__
@@ -458,7 +460,7 @@ class TFModelTesterMixin:
                     "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
             # TODO: A better way to handle vision models
-            elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification"]:
+            elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification", "TFCLIPVisionModel"]:
                 inputs = tf.keras.Input(
                     batch_shape=(
                         3,
@@ -469,6 +471,20 @@ class TFModelTesterMixin:
                     name="pixel_values",
                     dtype="float32",
                 )
+            elif model_class.__name__ in ["TFCLIPModel"]:
+                inputs = {
+                    "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
+                    "pixel_values": tf.keras.Input(
+                        batch_shape=(
+                            3,
+                            self.model_tester.vision_model_tester.num_channels,
+                            self.model_tester.vision_model_tester.image_size,
+                            self.model_tester.vision_model_tester.image_size,
+                        ),
+                        name="pixel_values",
+                        dtype="float32",
+                    ),
+                }
             elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                 inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
             else:
@@ -1244,6 +1260,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     return output
 
 
+def random_attention_mask(shape, rng=None, name=None, dtype=None):
+    attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype)
+    # make sure that at least one token is attended to for each batch
+    attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1)
+    return attn_mask
+
+
 def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None):
     """Creates a random float32 tensor"""
     if rng is None:
diff --git a/tests/test_modeling_tf_vit.py b/tests/test_modeling_tf_vit.py
index 538b5563c8..ea493fc593 100644
--- a/tests/test_modeling_tf_vit.py
+++ b/tests/test_modeling_tf_vit.py
@@ -22,7 +22,7 @@ import unittest
 
 from transformers import ViTConfig
 from transformers.file_utils import cached_property, is_tf_available, is_vision_available
-from transformers.testing_utils import require_tf, require_vision, slow
+from transformers.testing_utils import require_tf, require_vision, slow, tooslow
 
 from .test_configuration_common import ConfigTester
 from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
@@ -200,7 +200,7 @@ class TFViTModelTest(TFModelTesterMixin, unittest.TestCase):
 
     # overwrite from common since `encoder_seq_length` and `encoder_key_length` are calculated
     # in a different way than in text models.
-    @slow
+    @tooslow
     def test_saved_model_creation_extended(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = True
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 86cf587864..a64d2495d1 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -111,6 +111,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "BeitForMaskedImageModeling",
     "CLIPTextModel",
     "CLIPVisionModel",
+    "TFCLIPTextModel",
+    "TFCLIPVisionModel",
     "FlaxCLIPTextModel",
     "FlaxCLIPVisionModel",
     "FlaxWav2Vec2ForCTC",