From 8f2cc1c3ab9fac0b49c6c9372e6557532a607024 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 23 Dec 2021 17:19:44 +0100 Subject: [PATCH] Add TFCLIPModel (#13967) * Start the work for TFCLIPModel * Convert to TF code (TODO: loss + doc) * Clean up * Fix pooled_output for TFCLIPTextTransformer - using tf.gather_nd * assert -> raise error * Expose TFCLIPModel * Deal with dummy_inputs * Add tests * Fix all tests. TODO: manual check weight loading + add more comments * Fix pt tf equivalence test * fixes * update TFCLIPVisionEmbeddings's Conv2D * Fix loss + overwrite test_pt_tf_model_equivalence from common * Add a comment about the change about MainLayer in test_keras_save_load * Set return_loss=True in TFCLIPModelTester + make tests pass * overwrite test_pt_tf_model_equivalence from tf common * fix base_model_prefix * Fix examples * remove unused * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * apply review suggestions * change self.pre_layrnorm to self.pre_layernorm * apply more review suggestions * return attention probs before dropout (to align with PT) * fix weight init * fix * build doc * fix missing doc * fix for test Co-authored-by: ydshieh Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/index.mdx | 2 +- docs/source/model_doc/clip.mdx | 17 + src/transformers/__init__.py | 16 + src/transformers/activations_tf.py | 7 + .../models/auto/modeling_tf_auto.py | 1 + src/transformers/models/clip/__init__.py | 19 + .../models/clip/modeling_tf_clip.py | 1506 +++++++++++++++++ src/transformers/utils/dummy_tf_objects.py | 51 + tests/test_modeling_clip.py | 143 ++ tests/test_modeling_tf_clip.py | 659 ++++++++ tests/test_modeling_tf_common.py | 25 +- tests/test_modeling_tf_vit.py | 4 +- utils/check_repo.py | 2 + 13 files changed, 2448 insertions(+), 4 deletions(-) create mode 100644 src/transformers/models/clip/modeling_tf_clip.py create mode 100644 tests/test_modeling_tf_clip.py diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 1609228aa0..112e84b825 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -202,7 +202,7 @@ Flax), PyTorch, and/or TensorFlow. | BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | Canine | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ❌ | ✅ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | | ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | | DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/model_doc/clip.mdx b/docs/source/model_doc/clip.mdx index 35685c040a..2bbc6f1f8e 100644 --- a/docs/source/model_doc/clip.mdx +++ b/docs/source/model_doc/clip.mdx @@ -125,6 +125,23 @@ This model was contributed by [valhalla](https://huggingface.co/valhalla). The o [[autodoc]] CLIPVisionModel - forward +## TFCLIPModel + +[[autodoc]] TFCLIPModel + - call + - get_text_features + - get_image_features + +## TFCLIPTextModel + +[[autodoc]] TFCLIPTextModel + - call + +## TFCLIPVisionModel + +[[autodoc]] TFCLIPVisionModel + - call + ## FlaxCLIPModel [[autodoc]] FlaxCLIPModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 80b077a648..a0044f849c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1549,6 +1549,15 @@ if is_tf_available(): "TFCamembertModel", ] ) + _import_structure["models.clip"].extend( + [ + "TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFCLIPModel", + "TFCLIPPreTrainedModel", + "TFCLIPTextModel", + "TFCLIPVisionModel", + ] + ) _import_structure["models.convbert"].extend( [ "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3394,6 +3403,13 @@ if TYPE_CHECKING: TFCamembertForTokenClassification, TFCamembertModel, ) + from .models.clip import ( + TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCLIPModel, + TFCLIPPreTrainedModel, + TFCLIPTextModel, + TFCLIPVisionModel, + ) from .models.convbert import ( TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFConvBertForMaskedLM, diff --git a/src/transformers/activations_tf.py b/src/transformers/activations_tf.py index e0cefc323c..5c40803f25 100644 --- a/src/transformers/activations_tf.py +++ b/src/transformers/activations_tf.py @@ -63,6 +63,12 @@ def gelu_fast(x): return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x))) +def quick_gelu(x): + x = tf.convert_to_tensor(x) + coeff = tf.cast(1.702, x.dtype) + return x * tf.math.sigmoid(coeff * x) + + if version.parse(tf.version.VERSION) >= version.parse("2.4"): def approximate_gelu_wrap(x): @@ -84,6 +90,7 @@ ACT2FN = { "mish": mish, "tanh": tf.keras.activations.tanh, "gelu_fast": gelu_fast, + "quick_gelu": quick_gelu, } diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index d32abfdd8a..12bcd2e296 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -29,6 +29,7 @@ logger = logging.get_logger(__name__) TF_MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping + ("clip", "TFCLIPModel"), ("deberta-v2", "TFDebertaV2Model"), ("deberta", "TFDebertaModel"), ("rembert", "TFRemBertModel"), diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index 78b7d8a599..310f22cfc6 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -20,6 +20,7 @@ from typing import TYPE_CHECKING from ...file_utils import ( _LazyModule, is_flax_available, + is_tf_available, is_tokenizers_available, is_torch_available, is_vision_available, @@ -47,6 +48,15 @@ if is_torch_available(): "CLIPVisionModel", ] +if is_tf_available(): + _import_structure["modeling_tf_clip"] = [ + "TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFCLIPModel", + "TFCLIPPreTrainedModel", + "TFCLIPTextModel", + "TFCLIPVisionModel", + ] + if is_flax_available(): _import_structure["modeling_flax_clip"] = [ "FlaxCLIPModel", @@ -78,6 +88,15 @@ if TYPE_CHECKING: CLIPVisionModel, ) + if is_tf_available(): + from .modeling_tf_clip import ( + TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCLIPModel, + TFCLIPPreTrainedModel, + TFCLIPTextModel, + TFCLIPVisionModel, + ) + if is_flax_available(): from .modeling_flax_clip import ( FlaxCLIPModel, diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py new file mode 100644 index 0000000000..918ecdd1be --- /dev/null +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -0,0 +1,1506 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 CLIP model. """ + + +import math +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling + +# Public API +from ...modeling_tf_utils import ( + DUMMY_INPUTS, + TFModelInputType, + TFPreTrainedModel, + get_initializer, + input_processing, + keras_serializable, + shape_list, +) +from ...utils import logging +from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32" + +TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "openai/clip-vit-base-patch32", + # See all CLIP models at https://huggingface.co/models?filter=clip +] + + +LARGE_NEGATIVE = -1e8 + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: + return tf.math.reduce_mean( + tf.keras.metrics.sparse_categorical_crossentropy( + y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True + ) + ) + + +def clip_loss(similarity: tf.Tensor) -> tf.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(tf.transpose(similarity)) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class TFCLIPOutput(ModelOutput): + """ + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the + image-text similarity scores. + logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the + text-image similarity scores. + text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of + [`TFCLIPTextModel`]. + image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`TFCLIPVisionModel`]. + text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]): + The output of the [`TFCLIPTextModel`]. + vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]): + The output of the [`TFCLIPVisionModel`]. + """ + + loss: Optional[tf.Tensor] = None + logits_per_image: tf.Tensor = None + logits_per_text: tf.Tensor = None + text_embeds: tf.Tensor = None + image_embeds: tf.Tensor = None + text_model_output: TFBaseModelOutputWithPooling = None + vision_model_output: TFBaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class TFCLIPVisionEmbeddings(tf.keras.layers.Layer): + def __init__(self, config: CLIPVisionConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.config = config + + self.patch_embedding = tf.keras.layers.Conv2D( + filters=self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + padding="valid", + data_format="channels_last", + use_bias=False, + kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor), + name="patch_embedding", + ) + + def build(self, input_shape: tf.TensorShape): + + factor = self.config.initializer_factor + + self.class_embedding = self.add_weight( + shape=(self.embed_dim,), + initializer=get_initializer(self.embed_dim ** -0.5 * factor), + trainable=True, + name="class_embedding", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.num_positions, self.embed_dim), + initializer=get_initializer(self.config.initializer_range * factor), + trainable=True, + name="embeddings", + ) + + super().build(input_shape) + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + """`pixel_values` is expected to be of NCHW format.""" + + batch_size, num_channels, height, width = shape_list(pixel_values) + + # When running on CPU, `tf.nn.conv2d` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + patch_embeds = self.patch_embedding(pixel_values) + + # Change the 2D spatial dimensions to a single temporal dimension. + # shape = (batch_size, num_patches, out_channels=embed_dim) + patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1)) + + # add the [CLS] token to the embedded patch tokens + class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim)) + embeddings = tf.concat((class_embeds, patch_embeds), axis=1) + + embeddings = embeddings + self.position_embedding + + return embeddings + + +class TFCLIPTextEmbeddings(tf.keras.layers.Layer): + def __init__(self, config: CLIPTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.vocab_size = config.vocab_size + + self.config = config + + def build(self, input_shape: tf.TensorShape): + + with tf.name_scope("token_embedding"): + self.weight = self.add_weight( + shape=(self.vocab_size, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="weight", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.config.max_position_embeddings, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="embeddings", + ) + + super().build(input_shape) + + def call( + self, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + final_embeddings = inputs_embeds + position_embeds + + return final_embeddings + + +class TFCLIPAttention(tf.keras.layers.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = self.embed_dim // self.num_attention_heads + if self.attention_head_size * self.num_attention_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_attention_heads})." + ) + + factor = config.initializer_factor + in_proj_std = (self.embed_dim ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (self.embed_dim ** -0.5) * factor + + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.q_proj = tf.keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj" + ) + self.k_proj = tf.keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj" + ) + self.v_proj = tf.keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj" + ) + + self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout) + + self.out_proj = tf.keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj" + ) + + # copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + causal_attention_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + """Input shape: Batch x Time x Channel""" + + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.q_proj(inputs=hidden_states) + mixed_key_layer = self.k_proj(inputs=hidden_states) + mixed_value_layer = self.v_proj(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + # Apply the causal attention mask (precomputed for all layers in TFCLIPModel call() function) + attention_scores = tf.add(attention_scores, causal_attention_mask) + + if attention_mask is not None: + # Apply the attention mask (precomputed for all layers in TFCLIPModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + _attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(inputs=_attention_probs, training=training) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, embed_dim) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim)) + + attention_output = self.out_proj(attention_output, training=training) + # In TFBert, attention weights are returned after dropout. + # However, in CLIP, they are returned before dropout. + outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,) + + return outputs + + +class TFCLIPMLP(tf.keras.layers.Layer): + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.activation_fn = get_tf_activation(config.hidden_act) + + factor = config.initializer_factor + in_proj_std = (config.hidden_size ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * config.hidden_size) ** -0.5 * factor + + self.fc1 = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" + ) + self.fc2 = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" + ) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + + hidden_states = self.fc1(inputs=hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(inputs=hidden_states) + return hidden_states + + +class TFCLIPEncoderLayer(tf.keras.layers.Layer): + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.self_attn = TFCLIPAttention(config, name="self_attn") + self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.mlp = TFCLIPMLP(config, name="mlp") + self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + causal_attention_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + causal_attention_mask (`tf.Tensor`): causal attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`): + Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned + tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(inputs=hidden_states) + attention_outputs = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = attention_outputs[0] + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(inputs=hidden_states) + hidden_states = self.mlp(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + +class TFCLIPEncoder(tf.keras.layers.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`TFCLIPEncoderLayer`]. + + Args: + config: CLIPConfig + """ + + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + causal_attention_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class TFCLIPTextTransformer(tf.keras.layers.Layer): + def __init__(self, config: CLIPTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings") + self.encoder = TFCLIPEncoder(config, name="encoder") + self.final_layer_norm = tf.keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="final_layer_norm" + ) + + def call( + self, + input_ids: TFModelInputType, + attention_mask: tf.Tensor, + position_ids: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + input_shape = shape_list(input_ids) + + embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + batch_size, seq_length = input_shape + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype) + + # check attention mask and invert + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.final_layer_norm(inputs=sequence_output) + + # text_embeds.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + pooled_output = tf.gather_nd( + params=sequence_output, + indices=tf.stack( + values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1 + ), + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32): + + diag = tf.constant(0.0, shape=(seq_length,), dtype=dtype) + + # set an additive 2D attention mask with all places being masked + to_mask = tf.constant(-10000.0, shape=(seq_length, seq_length), dtype=dtype) + + # set diagonal & lower triangular parts to 0 (i.e. the places not to be masked) + # TIP: think the 2D matrix as the space of (query_seq, key_seq) + to_mask = tf.linalg.band_part(to_mask, 0, -1) + # to_mask = tf.linalg.band_part(to_mask, -1, 0) + to_mask = tf.linalg.set_diag(to_mask, diagonal=diag) + + return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length)) + + +@keras_serializable +class TFCLIPTextMainLayer(tf.keras.layers.Layer): + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.text_model = TFCLIPTextTransformer(config, name="text_model") + + def get_input_embeddings(self) -> tf.keras.layers.Layer: + return self.text_model.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.text_model.embeddings.weight = value + self.text_model.embeddings.vocab_size = shape_list(value)[0] + + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is None: + raise ValueError("You have to specify either input_ids") + + input_shape = shape_list(inputs["input_ids"]) + + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) + + text_model_outputs = self.text_model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + position_ids=inputs["position_ids"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return text_model_outputs + + +class TFCLIPVisionTransformer(tf.keras.layers.Layer): + def __init__(self, config: CLIPVisionConfig, **kwargs): + super().__init__(**kwargs) + + self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings") + self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm") + self.encoder = TFCLIPEncoder(config, name="encoder") + self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + + def call( + self, + pixel_values: TFModelInputType, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + + embedding_output = self.embeddings(pixel_values=pixel_values) + embedding_output = self.pre_layernorm(inputs=embedding_output) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=None, + causal_attention_mask=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + pooled_output = self.post_layernorm(inputs=pooled_output) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@keras_serializable +class TFCLIPVisionMainLayer(tf.keras.layers.Layer): + config_class = CLIPVisionConfig + + def __init__(self, config: CLIPVisionConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.vision_model = TFCLIPVisionTransformer(config, name="vision_model") + + def get_input_embeddings(self) -> tf.keras.layers.Layer: + return self.vision_model.embeddings + + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + + vision_model_outputs = self.vision_model( + pixel_values=inputs["pixel_values"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return vision_model_outputs + + +@keras_serializable +class TFCLIPMainLayer(tf.keras.layers.Layer): + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + if not isinstance(config.text_config, CLIPTextConfig): + raise ValueError( + f"config.text_config is expected to be of type CLIPTextConfig but is of type {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, CLIPVisionConfig): + raise ValueError( + f"config.vision_config is expected to be of type CLIPVisionConfig but is of type {type(config.vision_config)}." + ) + + self.config = config + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + + self.text_model = TFCLIPTextTransformer(text_config, name="text_model") + self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model") + + self.visual_projection = tf.keras.layers.Dense( + units=self.projection_dim, + kernel_initializer=get_initializer(vision_config.hidden_size ** -0.5 * self.config.initializer_factor), + use_bias=False, + name="visual_projection", + ) + + self.text_projection = tf.keras.layers.Dense( + units=self.projection_dim, + kernel_initializer=get_initializer(text_config.hidden_size ** -0.5 * self.config.initializer_factor), + use_bias=False, + name="text_projection", + ) + + def build(self, input_shape: tf.TensorShape): + + self.logit_scale = self.add_weight( + shape=(1,), + initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), + trainable=True, + name="logit_scale", + ) + + super().build(input_shape) + + def get_text_features( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> tf.Tensor: + inputs = input_processing( + func=self.get_text_features, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is None: + raise ValueError("You have to specify either input_ids") + + input_shape = shape_list(inputs["input_ids"]) + + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) + + text_outputs = self.text_model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + position_ids=inputs["position_ids"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(inputs=pooled_output) + + return text_features + + def get_image_features( + self, + pixel_values: Optional[TFModelInputType] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> tf.Tensor: + inputs = input_processing( + func=self.get_image_features, + config=self.config, + input_ids=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + + vision_outputs = self.vision_model( + pixel_values=inputs["pixel_values"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(inputs=pooled_output) + + return image_features + + def call( + self, + input_ids: Optional[TFModelInputType] = None, + pixel_values: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]: + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is None: + raise ValueError("You have to specify either input_ids") + if inputs["pixel_values"] is None: + raise ValueError("You have to specify pixel_values") + + input_shape = shape_list(inputs["input_ids"]) + + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) + + vision_outputs = self.vision_model( + pixel_values=inputs["pixel_values"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + text_outputs = self.text_model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + position_ids=inputs["position_ids"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(inputs=image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(inputs=text_embeds) + + # normalized features + image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True) + text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True) + + # cosine similarity as logits + logit_scale = tf.math.exp(self.logit_scale) + logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale + logits_per_image = tf.transpose(logits_per_text) + + loss = None + if inputs["return_loss"]: + loss = clip_loss(logits_per_text) + + if not inputs["return_dict"]: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return (loss,) + output if loss is not None else output + + return TFCLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class TFCLIPPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CLIPConfig + base_model_prefix = "clip" + + +CLIP_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the + generic methods the library implements for all its model (such as downloading or saving, resizing the input + embeddings, pruning heads etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use + it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage + and behavior. + + + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all + the tensors in the first argument of the model call function: `model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in + the first positional argument : + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + + + Args: + config ([`CLIPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the + model weights. +""" + +CLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for + details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This + argument can be used in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + +CLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This + argument can be used in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + +CLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for + details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See + [`CLIPFeatureExtractor.__call__`] for details. + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This + argument can be used in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +class TFCLIPTextModel(TFCLIPPreTrainedModel): + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.clip = TFCLIPTextMainLayer(config, name="clip") + + @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPTextConfig) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import CLIPTokenizer, TFCLIPTextModel + + >>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + outputs = self.clip( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + position_ids=inputs["position_ids"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return outputs + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, + ) + + +class TFCLIPVisionModel(TFCLIPPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.clip = TFCLIPVisionMainLayer(config, name="clip") + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(len(DUMMY_INPUTS), 3, self.config.image_size, self.config.image_size), dtype=tf.float32 + ) + return {"pixel_values": VISION_DUMMY_INPUTS} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. + + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + + return self.serving_output(output) + + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig) + def call( + self, + pixel_values: Optional[TFModelInputType] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, TFCLIPVisionModel + + >>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + outputs = self.clip( + pixel_values=inputs["pixel_values"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return outputs + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, + ) + + +@add_start_docstrings(CLIP_START_DOCSTRING) +class TFCLIPModel(TFCLIPPreTrainedModel): + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.clip = TFCLIPMainLayer(config, name="clip") + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(len(DUMMY_INPUTS), 3, self.config.vision_config.image_size, self.config.vision_config.image_size), + dtype=tf.float32, + ) + return { + "input_ids": tf.constant(DUMMY_INPUTS, dtype=tf.int32), + "pixel_values": VISION_DUMMY_INPUTS, + } + + @tf.function( + input_signature=[ + { + "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. + + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + + return self.serving_output(output) + + @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def get_text_features( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> tf.Tensor: + r""" + Returns: + text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`TFCLIPTextModel`]. + + Examples: + + ```python + >>> from transformers import CLIPTokenizer, TFCLIPModel + + >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + >>> text_features = model.get_text_features(**inputs) + ```""" + inputs = input_processing( + func=self.get_text_features, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + text_features = self.clip.get_text_features( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + position_ids=inputs["position_ids"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + ) + + return text_features + + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[TFModelInputType] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> tf.Tensor: + r""" + Returns: + image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`TFCLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, TFCLIPModel + + >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> image_features = model.get_image_features(**inputs) + ```""" + inputs = input_processing( + func=self.get_image_features, + config=self.config, + input_ids=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if "input_ids" in inputs: + inputs["pixel_values"] = inputs.pop("input_ids") + + image_features = self.clip.get_image_features( + pixel_values=inputs["pixel_values"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + ) + + return image_features + + @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFCLIPOutput, config_class=CLIPConfig) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + pixel_values: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, TFCLIPModel + + >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ```""" + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + outputs = self.clip( + input_ids=inputs["input_ids"], + pixel_values=inputs["pixel_values"], + attention_mask=inputs["attention_mask"], + position_ids=inputs["position_ids"], + return_loss=inputs["return_loss"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + ) + + return outputs + + def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput: + return output diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index ca564457a2..2e1d43ec40 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -704,6 +704,57 @@ class TFCamembertModel: requires_backends(self, ["tf"]) +TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFCLIPModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFCLIPPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFCLIPTextModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFCLIPVisionModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/test_modeling_clip.py b/tests/test_modeling_clip.py index b5e960bea2..e7de75fc8e 100644 --- a/tests/test_modeling_clip.py +++ b/tests/test_modeling_clip.py @@ -29,6 +29,7 @@ from transformers.file_utils import is_torch_available, is_vision_available from transformers.testing_utils import ( is_flax_available, is_pt_flax_cross_test, + is_pt_tf_cross_test, require_torch, require_vision, slow, @@ -581,6 +582,148 @@ class CLIPModelTest(ModelTesterMixin, unittest.TestCase): self.assertTrue(models_equal) + # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput + @is_pt_tf_cross_test + def test_pt_tf_model_equivalence(self): + import numpy as np + import tensorflow as tf + + import transformers + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning + + if not hasattr(transformers, tf_model_class_name): + # transformers does not have TF version yet + return + + tf_model_class = getattr(transformers, tf_model_class_name) + + config.output_hidden_states = True + + tf_model = tf_model_class(config) + pt_model = model_class(config) + + # make sure only tf inputs are forward that actually exist in function args + tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys()) + + # remove all head masks + tf_input_keys.discard("head_mask") + tf_input_keys.discard("cross_attn_head_mask") + tf_input_keys.discard("decoder_head_mask") + + pt_inputs = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys} + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + tf_inputs_dict = {} + for key, tensor in pt_inputs.items(): + # skip key that does not exist in tf + if type(tensor) == bool: + tf_inputs_dict[key] = tensor + elif key == "input_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + elif key == "pixel_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + else: + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) + + # Check we can load pt model in tf and vice-versa with model => model functions + tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + + # need to rename encoder-decoder "inputs" for PyTorch + # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + + with torch.no_grad(): + pto = pt_model(**pt_inputs) + tfo = tf_model(tf_inputs_dict, training=False) + + self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") + for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): + + if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): + continue + + tf_out = tf_output.numpy() + pt_out = pt_output.numpy() + + self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") + + if len(tf_out.shape) > 0: + + tf_nans = np.copy(np.isnan(tf_out)) + pt_nans = np.copy(np.isnan(pt_out)) + + pt_out[tf_nans] = 0 + tf_out[tf_nans] = 0 + pt_out[pt_nans] = 0 + tf_out[pt_nans] = 0 + + max_diff = np.amax(np.abs(tf_out - pt_out)) + self.assertLessEqual(max_diff, 4e-2) + + # Check we can load pt model in tf and vice-versa with checkpoint => model functions + with tempfile.TemporaryDirectory() as tmpdirname: + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") + torch.save(pt_model.state_dict(), pt_checkpoint_path) + tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) + + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") + tf_model.save_weights(tf_checkpoint_path) + pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + tf_inputs_dict = {} + for key, tensor in pt_inputs.items(): + # skip key that does not exist in tf + if type(tensor) == bool: + tensor = np.array(tensor, dtype=bool) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32) + elif key == "input_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + elif key == "pixel_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + else: + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) + + # need to rename encoder-decoder "inputs" for PyTorch + # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + + with torch.no_grad(): + pto = pt_model(**pt_inputs) + + tfo = tf_model(tf_inputs_dict) + + self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") + for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): + + if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): + continue + + tf_out = tf_output.numpy() + pt_out = pt_output.numpy() + + self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") + + if len(tf_out.shape) > 0: + tf_nans = np.copy(np.isnan(tf_out)) + pt_nans = np.copy(np.isnan(pt_out)) + + pt_out[tf_nans] = 0 + tf_out[tf_nans] = 0 + pt_out[pt_nans] = 0 + tf_out[pt_nans] = 0 + + max_diff = np.amax(np.abs(tf_out - pt_out)) + self.assertLessEqual(max_diff, 4e-2) + # overwrite from common since FlaxCLIPModel returns nested output # which is not supported in the common test @is_pt_flax_cross_test diff --git a/tests/test_modeling_tf_clip.py b/tests/test_modeling_tf_clip.py new file mode 100644 index 0000000000..59ef2c49d0 --- /dev/null +++ b/tests/test_modeling_tf_clip.py @@ -0,0 +1,659 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow CLIP model. """ + + +import inspect +import os +import tempfile +import unittest +from importlib import import_module + +import requests +from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig +from transformers.file_utils import is_tf_available, is_vision_available +from transformers.testing_utils import is_pt_tf_cross_test, require_tf, require_vision, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings + from transformers.models.clip.modeling_tf_clip import TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class TFCLIPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return CLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = TFCLIPVisionModel(config=config) + result = model(pixel_values, training=False) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else () + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCLIPVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + def test_graph_mode_with_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # CLIP has a different seq_length + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFCLIPVisionModel.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + + +class TFCLIPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return CLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = TFCLIPTextModel(config=config) + result = model(input_ids, attention_mask=input_mask, training=False) + result = model(input_ids, training=False) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = (TFCLIPTextModel,) if is_tf_available() else () + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCLIPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + @slow + def test_model_from_pretrained(self): + for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFCLIPTextModel.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + + +class TFCLIPModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = TFCLIPTextModelTester(parent) + self.vision_model_tester = TFCLIPVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return CLIPConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = TFCLIPModel(config) + result = model(input_ids, pixel_values, attention_mask, training=False) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_tf +class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFCLIPModel,) if is_tf_available() else () + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCLIPModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + # hidden_states are tested in individual model tests + def test_hidden_states_output(self): + pass + + # input_embeds are tested in individual model tests + def test_inputs_embeds(self): + pass + + # CLIPModel does not have input/output embeddings + def test_model_common_attributes(self): + pass + + # overwrite from common since `TFCLIPModelTester` set `return_loss` to `True` and causes the preparation of + # `symbolic_inputs` failed. + def test_keras_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # remove `return_loss` to make code work + if self.__class__.__name__ == "TFCLIPModelTest": + inputs_dict.pop("return_loss", None) + + tf_main_layer_classes = set( + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. + and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and tf.keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + ) + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + config.use_cache = inputs_dict.pop("use_cache", None) + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } + + model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = tf.keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = tf.keras.models.load_model( + filepath, custom_objects={main_layer_class.__name__: main_layer_class} + ) + assert isinstance(model, tf.keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput + @is_pt_tf_cross_test + def test_pt_tf_model_equivalence(self): + import torch + + import transformers + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + config.output_hidden_states = True + + tf_model = model_class(config) + pt_model = pt_model_class(config) + + # Check we can load pt model in tf and vice-versa with model => model functions + + tf_model = transformers.load_pytorch_model_in_tf2_model( + tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) + ) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = {} + for name, key in self._prepare_for_class(inputs_dict, model_class).items(): + if type(key) == bool: + pt_inputs_dict[name] = key + elif name == "input_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "pixel_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + else: + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + + # need to rename encoder-decoder "inputs" for PyTorch + if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) + + self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") + for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): + + if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): + continue + + tf_out = tf_output.numpy() + pt_out = pt_output.numpy() + + self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") + + if len(tf_out.shape) > 0: + + tf_nans = np.copy(np.isnan(tf_out)) + pt_nans = np.copy(np.isnan(pt_out)) + + pt_out[tf_nans] = 0 + tf_out[tf_nans] = 0 + pt_out[pt_nans] = 0 + tf_out[pt_nans] = 0 + + max_diff = np.amax(np.abs(tf_out - pt_out)) + self.assertLessEqual(max_diff, 4e-2) + + # Check we can load pt model in tf and vice-versa with checkpoint => model functions + with tempfile.TemporaryDirectory() as tmpdirname: + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") + torch.save(pt_model.state_dict(), pt_checkpoint_path) + tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) + + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") + tf_model.save_weights(tf_checkpoint_path) + pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + + # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences + pt_model.eval() + pt_inputs_dict = {} + for name, key in self._prepare_for_class(inputs_dict, model_class).items(): + if type(key) == bool: + key = np.array(key, dtype=bool) + pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long) + elif name == "input_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + elif name == "pixel_values": + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) + else: + pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) + # need to rename encoder-decoder "inputs" for PyTorch + if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + + with torch.no_grad(): + pto = pt_model(**pt_inputs_dict) + tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") + for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): + + if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): + continue + + tf_out = tf_output.numpy() + pt_out = pt_output.numpy() + + self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") + + if len(tf_out.shape) > 0: + tf_nans = np.copy(np.isnan(tf_out)) + pt_nans = np.copy(np.isnan(pt_out)) + + pt_out[tf_nans] = 0 + tf_out[tf_nans] = 0 + pt_out[pt_nans] = 0 + tf_out[pt_nans] = 0 + + max_diff = np.amax(np.abs(tf_out - pt_out)) + self.assertLessEqual(max_diff, 4e-2) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFCLIPModel.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +class TFCLIPModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "openai/clip-vit-base-patch32" + model = TFCLIPModel.from_pretrained(model_name, from_pt=True) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf" + ) + + outputs = model(**inputs, training=False) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = tf.constant([[24.5701, 19.3049]]) + + tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index fd7a6f9069..074e8174d2 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -282,6 +282,8 @@ class TFModelTesterMixin: for module in (import_module(model_class.__module__),) for module_member_name in dir(module) if module_member_name.endswith("MainLayer") + # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. + and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) and tf.keras.layers.Layer in module_member.__bases__ @@ -458,7 +460,7 @@ class TFModelTesterMixin: "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } # TODO: A better way to handle vision models - elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification"]: + elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification", "TFCLIPVisionModel"]: inputs = tf.keras.Input( batch_shape=( 3, @@ -469,6 +471,20 @@ class TFModelTesterMixin: name="pixel_values", dtype="float32", ) + elif model_class.__name__ in ["TFCLIPModel"]: + inputs = { + "input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"), + "pixel_values": tf.keras.Input( + batch_shape=( + 3, + self.model_tester.vision_model_tester.num_channels, + self.model_tester.vision_model_tester.image_size, + self.model_tester.vision_model_tester.image_size, + ), + name="pixel_values", + dtype="float32", + ), + } elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") else: @@ -1244,6 +1260,13 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): return output +def random_attention_mask(shape, rng=None, name=None, dtype=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None, dtype=dtype) + # make sure that at least one token is attended to for each batch + attn_mask = tf.concat([tf.constant(value=1, shape=(shape[0], 1), dtype=dtype), attn_mask[:, 1:]], axis=1) + return attn_mask + + def floats_tensor(shape, scale=1.0, rng=None, name=None, dtype=None): """Creates a random float32 tensor""" if rng is None: diff --git a/tests/test_modeling_tf_vit.py b/tests/test_modeling_tf_vit.py index 538b5563c8..ea493fc593 100644 --- a/tests/test_modeling_tf_vit.py +++ b/tests/test_modeling_tf_vit.py @@ -22,7 +22,7 @@ import unittest from transformers import ViTConfig from transformers.file_utils import cached_property, is_tf_available, is_vision_available -from transformers.testing_utils import require_tf, require_vision, slow +from transformers.testing_utils import require_tf, require_vision, slow, tooslow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor @@ -200,7 +200,7 @@ class TFViTModelTest(TFModelTesterMixin, unittest.TestCase): # overwrite from common since `encoder_seq_length` and `encoder_key_length` are calculated # in a different way than in text models. - @slow + @tooslow def test_saved_model_creation_extended(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = True diff --git a/utils/check_repo.py b/utils/check_repo.py index 86cf587864..a64d2495d1 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -111,6 +111,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "BeitForMaskedImageModeling", "CLIPTextModel", "CLIPVisionModel", + "TFCLIPTextModel", + "TFCLIPVisionModel", "FlaxCLIPTextModel", "FlaxCLIPVisionModel", "FlaxWav2Vec2ForCTC",