Add TFVisionTextDualEncoder (#21873)
* Temporary commit to stash everything so far * Temporary commit to stash everything so far * stash commit * Refactor from_pretrained * Fix final test, make fixup * Update dummies * Add model to TEST_FILES_WITH_NO_COMMON_TESTS * Update src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Add TFVisionTextDualEncoder to utils/documentation_tests.txt * make fixup --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -397,7 +397,7 @@ Flax), PyTorch, and/or TensorFlow.
|
|||||||
| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||||
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ |
|
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||||
| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
|
| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||||
| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ |
|
| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
|
|||||||
@@ -41,3 +41,8 @@ new zero-shot vision tasks such as image classification or retrieval.
|
|||||||
|
|
||||||
[[autodoc]] FlaxVisionTextDualEncoderModel
|
[[autodoc]] FlaxVisionTextDualEncoderModel
|
||||||
- __call__
|
- __call__
|
||||||
|
|
||||||
|
## TFVisionTextDualEncoderModel
|
||||||
|
|
||||||
|
[[autodoc]] TFVisionTextDualEncoderModel
|
||||||
|
- call
|
||||||
|
|||||||
@@ -3275,6 +3275,7 @@ else:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
_import_structure["models.vision_encoder_decoder"].extend(["TFVisionEncoderDecoderModel"])
|
_import_structure["models.vision_encoder_decoder"].extend(["TFVisionEncoderDecoderModel"])
|
||||||
|
_import_structure["models.vision_text_dual_encoder"].extend(["TFVisionTextDualEncoderModel"])
|
||||||
_import_structure["models.vit"].extend(
|
_import_structure["models.vit"].extend(
|
||||||
[
|
[
|
||||||
"TFViTForImageClassification",
|
"TFViTForImageClassification",
|
||||||
@@ -6335,6 +6336,7 @@ if TYPE_CHECKING:
|
|||||||
TFTransfoXLPreTrainedModel,
|
TFTransfoXLPreTrainedModel,
|
||||||
)
|
)
|
||||||
from .models.vision_encoder_decoder import TFVisionEncoderDecoderModel
|
from .models.vision_encoder_decoder import TFVisionEncoderDecoderModel
|
||||||
|
from .models.vision_text_dual_encoder import TFVisionTextDualEncoderModel
|
||||||
from .models.vit import TFViTForImageClassification, TFViTModel, TFViTPreTrainedModel
|
from .models.vit import TFViTForImageClassification, TFViTModel, TFViTPreTrainedModel
|
||||||
from .models.vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel
|
from .models.vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel
|
||||||
from .models.wav2vec2 import (
|
from .models.wav2vec2 import (
|
||||||
|
|||||||
@@ -892,8 +892,6 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
|
|||||||
|
|
||||||
|
|
||||||
def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
|
def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None):
|
||||||
missing_layers = []
|
|
||||||
unexpected_layers = []
|
|
||||||
mismatched_layers = []
|
mismatched_layers = []
|
||||||
|
|
||||||
# Read the H5 file
|
# Read the H5 file
|
||||||
|
|||||||
@@ -81,6 +81,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
|
|||||||
("t5", "TFT5Model"),
|
("t5", "TFT5Model"),
|
||||||
("tapas", "TFTapasModel"),
|
("tapas", "TFTapasModel"),
|
||||||
("transfo-xl", "TFTransfoXLModel"),
|
("transfo-xl", "TFTransfoXLModel"),
|
||||||
|
("vision-text-dual-encoder", "TFVisionTextDualEncoderModel"),
|
||||||
("vit", "TFViTModel"),
|
("vit", "TFViTModel"),
|
||||||
("vit_mae", "TFViTMAEModel"),
|
("vit_mae", "TFViTMAEModel"),
|
||||||
("wav2vec2", "TFWav2Vec2Model"),
|
("wav2vec2", "TFWav2Vec2Model"),
|
||||||
|
|||||||
@@ -900,6 +900,8 @@ class TFCLIPPreTrainedModel(TFPreTrainedModel):
|
|||||||
|
|
||||||
config_class = CLIPConfig
|
config_class = CLIPConfig
|
||||||
base_model_prefix = "clip"
|
base_model_prefix = "clip"
|
||||||
|
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||||
|
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
|
||||||
|
|
||||||
|
|
||||||
CLIP_START_DOCSTRING = r"""
|
CLIP_START_DOCSTRING = r"""
|
||||||
|
|||||||
@@ -13,7 +13,13 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
|
from ...utils import (
|
||||||
|
OptionalDependencyNotAvailable,
|
||||||
|
_LazyModule,
|
||||||
|
is_flax_available,
|
||||||
|
is_tf_available,
|
||||||
|
is_torch_available,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_import_structure = {
|
_import_structure = {
|
||||||
@@ -39,10 +45,18 @@ except OptionalDependencyNotAvailable:
|
|||||||
else:
|
else:
|
||||||
_import_structure["modeling_flax_vision_text_dual_encoder"] = ["FlaxVisionTextDualEncoderModel"]
|
_import_structure["modeling_flax_vision_text_dual_encoder"] = ["FlaxVisionTextDualEncoderModel"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not is_tf_available():
|
||||||
|
raise OptionalDependencyNotAvailable()
|
||||||
|
except OptionalDependencyNotAvailable:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
_import_structure["modeling_tf_vision_text_dual_encoder"] = ["TFVisionTextDualEncoderModel"]
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .configuration_vision_text_dual_encoder import VisionTextDualEncoderConfig
|
from .configuration_vision_text_dual_encoder import VisionTextDualEncoderConfig
|
||||||
from .processing_visiotn_text_dual_encoder import VisionTextDualEncoderProcessor
|
from .processing_vision_text_dual_encoder import VisionTextDualEncoderProcessor
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
@@ -58,7 +72,15 @@ if TYPE_CHECKING:
|
|||||||
except OptionalDependencyNotAvailable:
|
except OptionalDependencyNotAvailable:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
from .modeling_vision_text_dual_encoder import FlaxVisionTextDualEncoderModel
|
from .modeling_flax_vision_text_dual_encoder import FlaxVisionTextDualEncoderModel
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not is_tf_available():
|
||||||
|
raise OptionalDependencyNotAvailable()
|
||||||
|
except OptionalDependencyNotAvailable:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
from .modeling_tf_vision_text_dual_encoder import TFVisionTextDualEncoderModel
|
||||||
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -0,0 +1,614 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""TensorFlow VisionTextDualEncoder model."""
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.keras.layers import Dense
|
||||||
|
|
||||||
|
from ...configuration_utils import PretrainedConfig
|
||||||
|
from ...modeling_tf_utils import TFPreTrainedModel, unpack_inputs
|
||||||
|
from ...tf_utils import shape_list
|
||||||
|
from ...utils import (
|
||||||
|
DUMMY_INPUTS,
|
||||||
|
add_start_docstrings,
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
logging,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
|
from ..auto.configuration_auto import AutoConfig
|
||||||
|
from ..auto.modeling_tf_auto import TFAutoModel
|
||||||
|
from ..clip.modeling_tf_clip import CLIPVisionConfig, TFCLIPOutput, TFCLIPVisionModel
|
||||||
|
from .configuration_vision_text_dual_encoder import VisionTextDualEncoderConfig
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
_CONFIG_FOR_DOC = "VisionTextDualEncoderConfig"
|
||||||
|
|
||||||
|
# Class-level documentation for the TF vision-text dual encoder; it is attached to the model class through
# `add_start_docstrings`.
VISION_TEXT_DUAL_ENCODER_START_DOCSTRING = r"""
    This class can be used to initialize a vision-text dual encoder model with any pretrained vision autoencoding model
    as the vision encoder and any pretrained text model as the text encoder. The vision and text encoders are loaded
    via the [`~TFAutoModel.from_pretrained`] method. The projection layers are automatically added to the model and
    should be fine-tuned on a downstream task, like contrastive image-text modeling.

    In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how
    leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant
    improvement on new zero-shot vision tasks such as image classification or retrieval.

    After such a Vision-Text-Dual-Encoder model has been trained/fine-tuned, it can be saved/loaded just like any other
    models (see the examples for more information).

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a Keras [Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a
    regular Keras Model and refer to the TF documentation for all matters related to general usage and behavior.

    Parameters:
        config ([`VisionTextDualEncoderConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
|
||||||
|
|
||||||
|
|
||||||
|
VISION_TEXT_DUAL_ENCODER_TEXT_INPUTS_DOCSTRING = r"""
|
||||||
|
Args:
|
||||||
|
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
|
||||||
|
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||||
|
it.
|
||||||
|
|
||||||
|
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||||
|
[`PreTrainedTokenizer.__call__`] for details.
|
||||||
|
|
||||||
|
[What are input IDs?](../glossary#input-ids)
|
||||||
|
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
|
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||||
|
|
||||||
|
- 1 for tokens that are **not masked**,
|
||||||
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
|
position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||||
|
config.max_position_embeddings - 1]`.
|
||||||
|
|
||||||
|
[What are position IDs?](../glossary#position-ids)
|
||||||
|
output_attentions (`bool`, *optional*):
|
||||||
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||||
|
tensors for more detail.
|
||||||
|
output_hidden_states (`bool`, *optional*):
|
||||||
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||||
|
more detail.
|
||||||
|
return_dict (`bool`, *optional*):
|
||||||
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||||
|
"""
|
||||||
|
|
||||||
|
VISION_TEXT_DUAL_ENCODER_VISION_INPUTS_DOCSTRING = r"""
|
||||||
|
Args:
|
||||||
|
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
|
||||||
|
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
|
||||||
|
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
|
||||||
|
output_attentions (`bool`, *optional*):
|
||||||
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||||
|
tensors for more detail.
|
||||||
|
output_hidden_states (`bool`, *optional*):
|
||||||
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||||
|
more detail.
|
||||||
|
return_dict (`bool`, *optional*):
|
||||||
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||||
|
"""
|
||||||
|
|
||||||
|
VISION_TEXT_DUAL_ENCODER_INPUTS_DOCSTRING = r"""
|
||||||
|
Args:
|
||||||
|
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
|
||||||
|
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||||
|
it.
|
||||||
|
|
||||||
|
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||||
|
[`PreTrainedTokenizer.__call__`] for details.
|
||||||
|
|
||||||
|
[What are input IDs?](../glossary#input-ids)
|
||||||
|
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
|
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||||
|
|
||||||
|
- 1 for tokens that are **not masked**,
|
||||||
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
|
position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||||
|
config.max_position_embeddings - 1]`.
|
||||||
|
|
||||||
|
[What are position IDs?](../glossary#position-ids)
|
||||||
|
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
|
||||||
|
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
|
||||||
|
an image processor (e.g. if you use ViT as the encoder, you should use [`AutoImageProcessor`]). See
|
||||||
|
[`ViTImageProcessor.__call__`] for details.
|
||||||
|
return_loss (`bool`, *optional*):
|
||||||
|
Whether or not to return the contrastive loss.
|
||||||
|
output_attentions (`bool`, *optional*):
|
||||||
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||||
|
tensors for more detail.
|
||||||
|
output_hidden_states (`bool`, *optional*):
|
||||||
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||||
|
more detail.
|
||||||
|
return_dict (`bool`, *optional*):
|
||||||
|
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss
|
||||||
|
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
    """
    Mean sparse categorical cross-entropy of `logits` against the targets `0, 1, ..., n - 1`, where `n` is the size
    of the first dimension of `logits` (i.e. row `i` is scored against class `i`).
    """
    num_rows = shape_list(logits)[0]
    targets = tf.range(num_rows)
    per_row_loss = tf.keras.metrics.sparse_categorical_crossentropy(
        y_true=targets, y_pred=logits, from_logits=True
    )
    return tf.math.reduce_mean(per_row_loss)
|
||||||
|
|
||||||
|
|
||||||
|
# Copied from transformers.models.clip.modeling_tf_clip.clip_loss
|
||||||
|
def clip_loss(similarity: tf.Tensor) -> tf.Tensor:
    """
    Symmetric contrastive loss: the average of `contrastive_loss` applied to `similarity` and to its transpose.
    """
    loss_over_rows = contrastive_loss(similarity)
    loss_over_columns = contrastive_loss(tf.transpose(similarity))
    return (loss_over_rows + loss_over_columns) / 2.0
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings(VISION_TEXT_DUAL_ENCODER_START_DOCSTRING)
|
||||||
|
class TFVisionTextDualEncoderModel(TFPreTrainedModel):
|
||||||
|
config_class = VisionTextDualEncoderConfig
|
||||||
|
base_model_prefix = "vision_text_dual_encoder"
|
||||||
|
load_weight_prefix = "tf_vision_text_dual_encoder_model"
|
||||||
|
|
||||||
|
def __init__(
    self,
    config: Optional[VisionTextDualEncoderConfig] = None,
    vision_model: Optional[TFPreTrainedModel] = None,
    text_model: Optional[TFPreTrainedModel] = None,
):
    """
    Assemble the dual encoder from a composite config and/or ready-made encoders.

    Args:
        config: Composite configuration. When omitted, it is derived from the configs of `vision_model` and
            `text_model`, which must then both be given.
        vision_model: Vision encoder to wrap. When omitted, one is instantiated from `config.vision_config`.
        text_model: Text encoder to wrap. When omitted, one is instantiated from `config.text_config`.

    Raises:
        ValueError: If neither a config nor both encoders are supplied, or if `config` is not an instance of
            `config_class`.
    """
    both_encoders_given = vision_model is not None and text_model is not None
    if config is None and not both_encoders_given:
        raise ValueError("Either a configuration or an vision and a text model has to be provided")

    if config is None:
        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_model.config, text_model.config)
    elif not isinstance(config, self.config_class):
        raise ValueError(f"config: {config} has to be of type {self.config_class}")

    # The parent constructor must run before any sub-layer is attached to `self`.
    super().__init__(config)

    if vision_model is None:
        # CLIP vision configs are routed to the dedicated CLIP vision class; everything else goes through the
        # auto class.
        vision_cls = TFCLIPVisionModel if isinstance(config.vision_config, CLIPVisionConfig) else TFAutoModel
        vision_model = vision_cls.from_config(config.vision_config, name="vision_model")

    if text_model is None:
        text_model = TFAutoModel.from_config(config.text_config, name="text_model")

    self.vision_model = vision_model
    self.text_model = text_model

    # Point both encoders at the sub-configs held by the composite config, so that later updates to the
    # shared config stay in sync with the encoders.
    self.vision_model.config = self.config.vision_config
    self.text_model.config = self.config.text_config

    self.vision_embed_dim = config.vision_config.hidden_size
    self.text_embed_dim = config.text_config.hidden_size
    self.projection_dim = config.projection_dim

    # Bias-free linear heads mapping each encoder's output to `projection_dim` units.
    self.visual_projection = Dense(self.projection_dim, use_bias=False, name="visual_projection")
    self.text_projection = Dense(self.projection_dim, use_bias=False, name="text_projection")

    # Placeholder only: the actual weight is created in `build()`.
    self.logit_scale = None
|
||||||
|
|
||||||
|
def build(self, input_shape=None):
    """
    Create the `logit_scale` weight: shape `(1,)`, initialised to the constant `config.logit_scale_init_value`.

    The weight is created here rather than in `__init__` to make sure the names are right.
    """
    self.logit_scale = self.add_weight(
        shape=(1,),
        initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
        name="logit_scale",
    )
|
||||||
|
|
||||||
|
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
    """
    Load the model, adding a weight-name translation hook when the `from_pt` keyword is truthy.

    In that case a `tf_to_pt_weight_rename` callable is placed into `kwargs` before delegating to the parent
    implementation; otherwise the call is forwarded unchanged.
    """
    # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT
    # models (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up
    # as normal. However, the name of that extra layer is the name of the MainLayer in the base model.
    if kwargs.get("from_pt", False):

        def tf_to_pt_weight_rename(tf_weight):
            """Remove the extra name segment that follows the encoder prefix in a TF weight name."""
            if "vision_model" in tf_weight:
                occurrences = tf_weight.count("vision_model")
                if occurrences == 1:
                    return re.sub(r"vision_model\..*?\.", "vision_model.", tf_weight)
                if occurrences == 2:
                    return re.sub(r"vision_model\..*?\.vision_model", "vision_model.vision_model", tf_weight)
                raise ValueError(
                    f"Unexpected weight name {tf_weight}. Please file an issue on the"
                    " Transformers repo to let us know about this error!"
                )
            if "text_model" in tf_weight:
                return re.sub(r"text_model\..*?\.", "text_model.", tf_weight)
            # Anything outside the two encoders keeps its name as is.
            return tf_weight

        kwargs["tf_to_pt_weight_rename"] = tf_to_pt_weight_rename

    return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
|
@add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_TEXT_INPUTS_DOCSTRING)
def get_text_features(
    self,
    input_ids=None,
    attention_mask=None,
    position_ids=None,
    token_type_ids=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    Returns:
        text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
        the projection layer to the pooled output of the text model.

    Examples:

    ```python
    >>> from transformers import TFVisionTextDualEncoderModel, AutoTokenizer

    >>> model = TFVisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
    >>> tokenizer = AutoTokenizer.from_pretrained("clip-italian/clip-italian")

    >>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="np")
    >>> text_features = model.get_text_features(**inputs)
    ```"""
    # All arguments are handed to the wrapped text encoder unchanged.
    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        token_type_ids=token_type_ids,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # The element at index 1 of the encoder output is used as the pooled representation.
    pooled_output = text_outputs[1]
    text_features = self.text_projection(pooled_output)

    return text_features
|
||||||
|
|
||||||
|
@add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_VISION_INPUTS_DOCSTRING)
def get_image_features(
    self,
    pixel_values=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    Returns:
        image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by applying
        the projection layer to the pooled output of the vision model.

    Examples:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import TFVisionTextDualEncoderModel, AutoImageProcessor

    >>> model = TFVisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
    >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = image_processor(images=image, return_tensors="np")

    >>> image_features = model.get_image_features(**inputs)
    ```"""
    # All arguments are handed to the wrapped vision encoder unchanged.
    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    pooled_output = vision_outputs[1]  # pooled_output
    image_features = self.visual_projection(pooled_output)

    return image_features
|
||||||
|
|
||||||
|
@unpack_inputs
@add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFCLIPOutput, config_class=_CONFIG_FOR_DOC)
def call(
    self,
    input_ids: Optional[tf.Tensor] = None,
    pixel_values: Optional[tf.Tensor] = None,
    attention_mask: Optional[tf.Tensor] = None,
    position_ids: Optional[tf.Tensor] = None,
    return_loss: Optional[bool] = None,
    token_type_ids: Optional[tf.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    training: bool = False,
) -> Union[Tuple[tf.Tensor], TFCLIPOutput]:
    r"""
    Returns:

    Examples:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> import tensorflow as tf
    >>> from transformers import (
    ...     TFVisionTextDualEncoderModel,
    ...     VisionTextDualEncoderProcessor,
    ...     AutoImageProcessor,
    ...     AutoTokenizer,
    ... )

    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
    >>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
    >>> model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
    ...     "google/vit-base-patch16-224", "bert-base-uncased"
    ... )

    >>> # contrastive training
    >>> urls = [
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ...     "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
    ... ]
    >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
    >>> inputs = processor(
    ...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True
    ... )
    >>> outputs = model(
    ...     input_ids=inputs.input_ids,
    ...     attention_mask=inputs.attention_mask,
    ...     pixel_values=inputs.pixel_values,
    ...     return_loss=True,
    ... )
    >>> loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score

    >>> # save and load from pretrained
    >>> model.save_pretrained("vit-bert")
    >>> model = TFVisionTextDualEncoderModel.from_pretrained("vit-bert")

    >>> # inference
    >>> outputs = model(**inputs)
    >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    >>> probs = tf.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
    ```"""
    # Fall back to the config's setting when the caller does not choose an output format.
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )

    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        training=training,
    )

    image_embeds = vision_outputs[1]  # pooler_output
    image_embeds = self.visual_projection(image_embeds)

    text_embeds = text_outputs[1]  # pooler_output
    text_embeds = self.text_projection(text_embeds)

    # normalized features: divide each embedding by its norm along the last axis
    image_embeds = image_embeds / tf.norm(image_embeds, axis=-1, keepdims=True)
    text_embeds = text_embeds / tf.norm(text_embeds, axis=-1, keepdims=True)

    # cosine similarity as logits; the stored `logit_scale` weight is exponentiated before use
    logit_scale = tf.math.exp(self.logit_scale)
    logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
    logits_per_image = tf.transpose(logits_per_text)

    # The loss is only computed on request.
    loss = None
    if return_loss:
        loss = clip_loss(logits_per_text)

    if not return_dict:
        # Tuple output: the loss, when computed, is prepended to the other results.
        output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
        return ((loss,) + output) if loss is not None else output

    return TFCLIPOutput(
        loss=loss,
        logits_per_image=logits_per_image,
        logits_per_text=logits_per_text,
        text_embeds=text_embeds,
        image_embeds=image_embeds,
        text_model_output=text_outputs,
        vision_model_output=vision_outputs,
    )
|
||||||
|
|
||||||
|
@classmethod
def from_vision_text_pretrained(
    cls,
    vision_model_name_or_path: str = None,
    text_model_name_or_path: str = None,
    *model_args,
    **kwargs,
) -> TFPreTrainedModel:
    """
    Instantiate a dual-encoder model from a vision model and a text model that are loaded (or passed in)
    separately. The projection layers and the logit scale belong to neither sub-model and are newly initialized.

    Params:
        vision_model_name_or_path (`str`, *optional*, defaults to `None`):
            Information necessary to initiate the vision model. Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing model weights saved using
                  [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
                  should be set to `True` and a configuration object should be provided as `config` argument.

        text_model_name_or_path (`str`, *optional*):
            Information necessary to initiate the text model. Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing model weights saved using
                  [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
                  should be set to `True` and a configuration object should be provided as `config` argument.

        model_args (remaining positional arguments, *optional*):
            All remaining positional arguments will be passed to the underlying model's `__init__` method.

        kwargs (remaining dictionary of keyword arguments, *optional*):
            Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
            `output_attentions=True`).

            - To update the text configuration, use the prefix *text_* for each configuration parameter.
            - To update the vision configuration, use the prefix *vision_* for each configuration parameter.
            - To update the parent model configuration, do not use a prefix for each configuration parameter.
            - Already instantiated sub-models can be passed as `vision_model=...` / `text_model=...`; the
              corresponding `*_name_or_path` argument is then not needed.

            Behaves differently depending on whether a `config` is provided or automatically loaded.

    Returns:
        The dual-encoder model wrapping the two sub-models.

    Raises:
        ValueError: If a sub-model is neither passed in nor identified by a name/path, or if a sub-model was not
            created with the name `vision_model` / `text_model` respectively.

    Example:

    ```python
    >>> from transformers import TFVisionTextDualEncoderModel

    >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
    >>> model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
    ...     "google/vit-base-patch16-224", "bert-base-uncased"
    ... )
    >>> # saving model after fine-tuning
    >>> model.save_pretrained("./vit-bert")
    >>> # load fine-tuned model
    >>> model = TFVisionTextDualEncoderModel.from_pretrained("./vit-bert")
    ```"""
    # Split the prefixed keyword arguments into one dictionary per sub-model, stripping the prefix.
    kwargs_vision = {
        argument[len("vision_") :]: value for argument, value in kwargs.items() if argument.startswith("vision_")
    }
    kwargs_text = {
        argument[len("text_") :]: value for argument, value in kwargs.items() if argument.startswith("text_")
    }

    # remove vision, text kwargs from kwargs so that only parent-level arguments remain
    for key in kwargs_vision:
        del kwargs["vision_" + key]
    for key in kwargs_text:
        del kwargs["text_" + key]

    # Load and initialize the vision model unless an instance was passed as `vision_model=...`
    vision_model = kwargs_vision.pop("model", None)
    if vision_model is None:
        if vision_model_name_or_path is None:
            raise ValueError(
                "If `vision_model` is not defined as an argument, a `vision_model_name_or_path` has to be defined"
            )
        kwargs_vision["name"] = "vision_model"
        kwargs_vision["load_weight_prefix"] = cls.load_weight_prefix

        vision_config_dict, _ = PretrainedConfig.get_config_dict(vision_model_name_or_path, **kwargs)
        if vision_config_dict.get("model_type", None) == "clip_vision_model":
            vision_config = CLIPVisionConfig.from_dict(vision_config_dict)
        else:
            vision_config = AutoConfig.from_pretrained(vision_model_name_or_path)

        # NOTE(review): a caller-supplied `vision_config=...` is overwritten here, whereas the text branch
        # below keeps a caller-supplied `text_config=...` -- confirm whether this asymmetry is intended.
        model_type = vision_config.model_type
        if model_type == "clip":
            # A full CLIP checkpoint: only its vision half is used.
            kwargs_vision["config"] = vision_config.vision_config
        else:
            kwargs_vision["config"] = vision_config
        vision_class = TFCLIPVisionModel if model_type in ("clip", "clip_vision_model") else TFAutoModel
        vision_model = vision_class.from_pretrained(vision_model_name_or_path, *model_args, **kwargs_vision)

    # Load and initialize the text model unless an instance was passed as `text_model=...`
    text_model = kwargs_text.pop("model", None)
    if text_model is None:
        if text_model_name_or_path is None:
            raise ValueError(
                "If `text_model` is not defined as an argument, a `text_model_name_or_path` has to be defined"
            )
        kwargs_text["name"] = "text_model"
        kwargs_text["load_weight_prefix"] = cls.load_weight_prefix

        if "config" not in kwargs_text:
            text_config = AutoConfig.from_pretrained(text_model_name_or_path)
            kwargs_text["config"] = text_config

        text_model = TFAutoModel.from_pretrained(text_model_name_or_path, *model_args, **kwargs_text)

    # Fail fast: reject wrongly named sub-models before the combined model is built and before the
    # "newly initialized" warning is logged.
    if vision_model.name != "vision_model":
        raise ValueError("vision model must be created with the name `vision_model`.")
    if text_model.name != "text_model":
        raise ValueError("text model must be created with the name `text_model`.")

    # instantiate config with corresponding kwargs
    config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_model.config, text_model.config, **kwargs)

    # init model
    model = cls(config=config, vision_model=vision_model, text_model=text_model)

    # the projection layers are always newly initialized when loading the model
    # using pre-trained vision and text model.
    logger.warning(
        "The projection layer and logit scale weights `['visual_projection.weight', 'text_projection.weight',"
        " 'logit_scale']` are newly initialized. You should probably TRAIN this model on a down-stream task to be"
        " able to use it for predictions and inference."
    )

    return model
|
||||||
|
|
||||||
|
@property
def dummy_inputs(self):
    """
    Dummy inputs to build the network.

    The text input is `DUMMY_INPUTS` as an int32 tensor; the image input is random float32 data laid out as
    `(batch_size, num_channels, image_size, image_size)`, with the sizes read from `self.config.vision_config`.

    Returns:
        `Dict[str, tf.Tensor]`: The dummy inputs.
    """
    input_ids = tf.constant(DUMMY_INPUTS, dtype=tf.int32)
    # Both modalities share the batch dimension of the dummy text input.
    batch_size = input_ids.shape[0]

    vision_config = self.config.vision_config
    pixel_values = tf.random.uniform(
        shape=(
            batch_size,
            vision_config.num_channels,
            vision_config.image_size,
            vision_config.image_size,
        ),
        dtype=tf.float32,
    )
    return {"pixel_values": pixel_values, "input_ids": input_ids}
|
||||||
@@ -316,7 +316,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
|
|||||||
... VisionTextDualEncoderModel,
|
... VisionTextDualEncoderModel,
|
||||||
... VisionTextDualEncoderProcessor,
|
... VisionTextDualEncoderProcessor,
|
||||||
... AutoImageProcessor,
|
... AutoImageProcessor,
|
||||||
... Autookenizer,
|
... AutoTokenizer,
|
||||||
... )
|
... )
|
||||||
|
|
||||||
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||||
@@ -428,7 +428,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
|
|||||||
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
||||||
user or organization name, like `dbmdz/bert-base-german-cased`.
|
user or organization name, like `dbmdz/bert-base-german-cased`.
|
||||||
- A path to a *directory* containing model weights saved using
|
- A path to a *directory* containing model weights saved using
|
||||||
[`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
|
[`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
|
||||||
- A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
|
- A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
|
||||||
should be set to `True` and a configuration object should be provided as `config` argument. This
|
should be set to `True` and a configuration object should be provided as `config` argument. This
|
||||||
loading path is slower than converting the PyTorch checkpoint in a Flax model using the provided
|
loading path is slower than converting the PyTorch checkpoint in a Flax model using the provided
|
||||||
@@ -441,7 +441,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
|
|||||||
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
||||||
user or organization name, like `dbmdz/bert-base-german-cased`.
|
user or organization name, like `dbmdz/bert-base-german-cased`.
|
||||||
- A path to a *directory* containing model weights saved using
|
- A path to a *directory* containing model weights saved using
|
||||||
[`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
|
[`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
|
||||||
- A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
|
- A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
|
||||||
should be set to `True` and a configuration object should be provided as `config` argument. This
|
should be set to `True` and a configuration object should be provided as `config` argument. This
|
||||||
loading path is slower than converting the PyTorch checkpoint in a Flax model using the provided
|
loading path is slower than converting the PyTorch checkpoint in a Flax model using the provided
|
||||||
|
|||||||
@@ -2469,6 +2469,13 @@ class TFVisionEncoderDecoderModel(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["tf"])
|
requires_backends(self, ["tf"])
|
||||||
|
|
||||||
|
|
||||||
|
class TFVisionTextDualEncoderModel(metaclass=DummyObject):
    # Placeholder exported under the real model's name; it follows the same pattern as the
    # neighbouring dummy classes in this file.
    # Backends this placeholder is tied to.
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        # Arguments are accepted but ignored; construction only performs the backend check.
        # NOTE(review): presumably `requires_backends` raises when TensorFlow is missing -- confirm.
        requires_backends(self, ["tf"])
|
||||||
|
|
||||||
|
|
||||||
class TFViTForImageClassification(metaclass=DummyObject):
|
class TFViTForImageClassification(metaclass=DummyObject):
|
||||||
_backends = ["tf"]
|
_backends = ["tf"]
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,419 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Testing suite for the TensorFlow VisionTextDualEncoder model. """
|
||||||
|
|
||||||
|
|
||||||
|
import collections
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from transformers.testing_utils import require_tf, require_vision, slow
|
||||||
|
from transformers.utils import is_tf_available, is_vision_available
|
||||||
|
|
||||||
|
from ...test_modeling_tf_common import floats_tensor, ids_tensor, random_attention_mask
|
||||||
|
from ..bert.test_modeling_tf_bert import TFBertModelTester
|
||||||
|
from ..clip.test_modeling_tf_clip import TFCLIPVisionModelTester
|
||||||
|
from ..deit.test_modeling_tf_deit import TFDeiTModelTester
|
||||||
|
from ..roberta.test_modeling_tf_roberta import TFRobertaModelTester
|
||||||
|
from ..vit.test_modeling_tf_vit import TFViTModelTester
|
||||||
|
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
from transformers import (
|
||||||
|
TFBertModel,
|
||||||
|
TFCLIPVisionModel,
|
||||||
|
TFDeiTModel,
|
||||||
|
TFRobertaModel,
|
||||||
|
TFVisionTextDualEncoderModel,
|
||||||
|
TFViTModel,
|
||||||
|
VisionTextDualEncoderConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from transformers import VisionTextDualEncoderProcessor
|
||||||
|
|
||||||
|
|
||||||
|
# Inspired by
|
||||||
|
# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
|
||||||
|
# From PyTorch internals
|
||||||
|
def to_2tuple(x):
    """Return `x` untouched when it is iterable; otherwise duplicate it into the pair `(x, x)`."""
    return x if isinstance(x, collections.abc.Iterable) else (x, x)
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
class TFVisionTextDualEncoderMixin:
    """
    Shared checks for `TFVisionTextDualEncoderModel` built from different vision/text model pairs.

    Concrete test cases override the three hook methods below. The `test_*` methods then call the matching
    `check_*` method with the dictionary returned by `prepare_config_and_inputs` expanded as keyword arguments;
    entries a check does not name are absorbed by its `**kwargs`.
    """

    def get_vision_text_model(self, config, text_config):
        """Hook: return a `(vision_model, text_model)` pair built from the vision and text configs."""
        pass

    def prepare_config_and_inputs(self):
        """Hook: return the dictionary of keyword arguments handed to the `check_*` methods."""
        pass

    def get_pretrained_model_and_inputs(self):
        """Hook: return `(model, inputs)`, where `inputs` holds the keyword arguments for calling `model`."""
        pass

    def _assert_embed_shapes(self, output, input_ids, pixel_values, projection_dim):
        """Assert that both embedding outputs have shape `(batch, projection_dim)` for their own batch size."""
        self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], projection_dim))
        self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], projection_dim))

    def _assert_save_load_round_trip(self, model, inputs):
        """Assert that the first output of `model` changes by at most 1e-5 after a save/reload cycle."""
        before = model(**inputs)[0].numpy()

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            reloaded = TFVisionTextDualEncoderModel.from_pretrained(tmpdirname)

            after = reloaded(**inputs)[0].numpy()
            max_diff = np.amax(np.abs(after - before))
            self.assertLessEqual(max_diff, 1e-5)

    def check_model_from_pretrained_configs(
        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
    ):
        """Build the model from a combined config only and check the embedding shapes."""
        config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)

        model = TFVisionTextDualEncoderModel(config)

        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)

        self._assert_embed_shapes(output, input_ids, pixel_values, config.projection_dim)

    def check_vision_text_dual_encoder_model(
        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
    ):
        """Build the model from two instantiated sub-models and check the embedding shapes."""
        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)

        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)

        self._assert_embed_shapes(output, input_ids, pixel_values, model.config.projection_dim)

    def check_vision_text_dual_encoder_from_pretrained(
        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
    ):
        """Build the model through `from_vision_text_pretrained` with sub-model instances and check shapes."""
        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
        kwargs = {"vision_model": vision_model, "text_model": text_model}
        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs)

        output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)

        self._assert_embed_shapes(output, input_ids, pixel_values, model.config.projection_dim)

    def check_save_load(self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs):
        """Check that a model built from sub-models gives the same first output after save/reload."""
        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)

        inputs = {"input_ids": input_ids, "pixel_values": pixel_values, "attention_mask": attention_mask}
        self._assert_save_load_round_trip(model, inputs)

    def check_vision_text_output_attention(
        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
    ):
        """Check the number and the trailing shape of the attention maps returned by both sub-models."""
        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)

        output = model(
            input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True
        )

        vision_attentions = output.vision_model_output.attentions
        self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers)

        # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(vision_model.config.image_size)
        patch_size = to_2tuple(vision_model.config.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        seq_len = num_patches + 1
        self.assertEqual(vision_attentions[0].shape[-3:], (vision_config.num_attention_heads, seq_len, seq_len))

        text_attentions = output.text_model_output.attentions
        self.assertEqual(len(text_attentions), text_config.num_hidden_layers)

        self.assertEqual(
            text_attentions[0].shape[-3:],
            (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]),
        )

    def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
        """Assert that the largest absolute element-wise difference between `a` and `b` is at most `tol`."""
        diff = np.abs((a - b)).max()
        self.assertLessEqual(diff, tol, f"Difference between the two arrays is {diff} (> {tol}).")

    def test_vision_text_dual_encoder_model(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_vision_text_dual_encoder_model(**inputs_dict)

    def test_model_from_pretrained_configs(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_model_from_pretrained_configs(**inputs_dict)

    def test_vision_text_dual_encoder_from_pretrained(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_vision_text_dual_encoder_from_pretrained(**inputs_dict)

    def test_save_load(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_save_load(**inputs_dict)

    def test_vision_text_output_attention(self):
        inputs_dict = self.prepare_config_and_inputs()
        self.check_vision_text_output_attention(**inputs_dict)

    @slow
    def test_real_model_save_load_from_pretrained(self):
        model, inputs = self.get_pretrained_model_and_inputs()
        self._assert_save_load_round_trip(model, inputs)
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
class TFViTBertModelTest(TFVisionTextDualEncoderMixin, unittest.TestCase):
    """Runs the shared dual-encoder checks with a ViT vision model and a BERT text model."""

    def get_pretrained_model_and_inputs(self):
        """Load the tiny ViT/BERT checkpoints and create random inputs with a batch of 13 and 4 tokens."""
        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
            "hf-internal-testing/tiny-random-vit", "hf-internal-testing/tiny-random-bert"
        )
        batch_size = 13
        seq_length = 4
        vision_cfg = model.vision_model.config
        image_shape = [batch_size, vision_cfg.num_channels, vision_cfg.image_size, vision_cfg.image_size]

        pixel_values = floats_tensor(image_shape)
        input_ids = ids_tensor([batch_size, seq_length], model.text_model.config.vocab_size)
        attention_mask = random_attention_mask([batch_size, seq_length])

        return model, {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}

    def get_vision_text_model(self, vision_config, text_config):
        """Instantiate the two sub-models under the names `vision_model` and `text_model`."""
        return TFViTModel(vision_config, name="vision_model"), TFBertModel(text_config, name="text_model")

    def prepare_config_and_inputs(self):
        """Combine the configs and inputs of the ViT and BERT model testers into one keyword dictionary."""
        vision_tester = TFViTModelTester(self)
        text_tester = TFBertModelTester(self)
        vision_config, pixel_values, _ = vision_tester.prepare_config_and_inputs()
        (
            text_config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = text_tester.prepare_config_and_inputs()

        inputs_dict = {
            "text_config": text_config,
            "vision_config": vision_config,
            "pixel_values": pixel_values,
            "attention_mask": input_mask,
            "input_ids": input_ids,
        }
        # Text-only extras; the shared checks receive them through `**kwargs`.
        inputs_dict["text_token_type_ids"] = token_type_ids
        inputs_dict["text_sequence_labels"] = sequence_labels
        inputs_dict["text_token_labels"] = token_labels
        inputs_dict["text_choice_labels"] = choice_labels
        return inputs_dict
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
class TFDeiTRobertaModelTest(TFVisionTextDualEncoderMixin, unittest.TestCase):
    """Runs the shared dual-encoder checks with a DeiT vision model and a RoBERTa text model."""

    def get_pretrained_model_and_inputs(self):
        """Load the tiny DeiT/RoBERTa checkpoints and create random inputs with a batch of 13 and 4 tokens."""
        # DeiT repo doesn't have TF weights, but we don't actually use the weights at all so let's
        # just reinitialize it.
        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
            "Rocketknight1/tiny-random-deit-tf", "hf-internal-testing/tiny-random-roberta"
        )
        batch_size = 13
        seq_length = 4
        vision_cfg = model.vision_model.config
        image_shape = [batch_size, vision_cfg.num_channels, vision_cfg.image_size, vision_cfg.image_size]

        pixel_values = floats_tensor(image_shape)
        input_ids = ids_tensor([batch_size, seq_length], model.text_model.config.vocab_size)
        attention_mask = random_attention_mask([batch_size, seq_length])

        return model, {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}

    def check_vision_text_output_attention(
        self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs
    ):
        """Same attention check as the mixin's, but with the vision sequence length used for DeiT."""
        vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
        model = TFVisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model)

        output = model(
            input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True
        )

        vision_attentions = output.vision_model_output.attentions
        self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers)

        # in DEiT, the seq_len equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens)
        image_height, image_width = to_2tuple(vision_model.config.image_size)[:2]
        patch_height, patch_width = to_2tuple(vision_model.config.patch_size)[:2]
        seq_len = (image_width // patch_width) * (image_height // patch_height) + 2
        expected_vision_shape = (vision_config.num_attention_heads, seq_len, seq_len)
        self.assertEqual(vision_attentions[0].shape[-3:], expected_vision_shape)

        text_attentions = output.text_model_output.attentions
        self.assertEqual(len(text_attentions), text_config.num_hidden_layers)

        text_len = input_ids.shape[-1]
        self.assertEqual(
            text_attentions[0].shape[-3:],
            (text_config.num_attention_heads, text_len, text_len),
        )

    def get_vision_text_model(self, vision_config, text_config):
        """Instantiate the two sub-models under the names `vision_model` and `text_model`."""
        return TFDeiTModel(vision_config, name="vision_model"), TFRobertaModel(text_config, name="text_model")

    def prepare_config_and_inputs(self):
        """Combine the configs and inputs of the DeiT and RoBERTa model testers into one keyword dictionary."""
        vision_tester = TFDeiTModelTester(self)
        text_tester = TFRobertaModelTester(self)
        vision_config, pixel_values, _ = vision_tester.prepare_config_and_inputs()
        (
            text_config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = text_tester.prepare_config_and_inputs()

        inputs_dict = {
            "text_config": text_config,
            "vision_config": vision_config,
            "pixel_values": pixel_values,
            "attention_mask": input_mask,
            "input_ids": input_ids,
        }
        # Text-only extras; the shared checks receive them through `**kwargs`.
        inputs_dict["text_token_type_ids"] = token_type_ids
        inputs_dict["text_sequence_labels"] = sequence_labels
        inputs_dict["text_token_labels"] = token_labels
        inputs_dict["text_choice_labels"] = choice_labels
        return inputs_dict
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
class TFCLIPVisionBertModelTest(TFVisionTextDualEncoderMixin, unittest.TestCase):
    """Runs the shared dual-encoder checks with a CLIP vision model and a BERT text model."""

    def get_pretrained_model_and_inputs(self):
        """Load the tiny CLIP/BERT checkpoints and create random inputs with a batch of 13 and 4 tokens."""
        model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
            "Rocketknight1/tiny-random-clip-tf", "hf-internal-testing/tiny-random-bert"
        )
        batch_size = 13
        seq_length = 4
        vision_cfg = model.vision_model.config
        image_shape = [batch_size, vision_cfg.num_channels, vision_cfg.image_size, vision_cfg.image_size]

        pixel_values = floats_tensor(image_shape)
        input_ids = ids_tensor([batch_size, seq_length], model.text_model.config.vocab_size)
        attention_mask = random_attention_mask([batch_size, seq_length])

        return model, {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask}

    def get_vision_text_model(self, vision_config, text_config):
        """Instantiate the two sub-models under the names `vision_model` and `text_model`."""
        return TFCLIPVisionModel(vision_config, name="vision_model"), TFBertModel(text_config, name="text_model")

    def prepare_config_and_inputs(self):
        """Combine the configs and inputs of the CLIP vision and BERT model testers into one keyword dictionary."""
        vision_tester = TFCLIPVisionModelTester(self)
        text_tester = TFBertModelTester(self)
        # The CLIP vision tester yields a pair here (config, pixel values), unpacked as such.
        vision_config, pixel_values = vision_tester.prepare_config_and_inputs()
        (
            text_config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = text_tester.prepare_config_and_inputs()

        inputs_dict = {
            "text_config": text_config,
            "vision_config": vision_config,
            "pixel_values": pixel_values,
            "attention_mask": input_mask,
            "input_ids": input_ids,
        }
        # Text-only extras; the shared checks receive them through `**kwargs`.
        inputs_dict["text_token_type_ids"] = token_type_ids
        inputs_dict["text_sequence_labels"] = sequence_labels
        inputs_dict["text_token_labels"] = token_labels
        inputs_dict["text_choice_labels"] = choice_labels
        return inputs_dict
|
||||||
|
|
||||||
|
|
||||||
|
@require_vision
@require_tf
class TFVisionTextDualEncoderIntegrationTest(unittest.TestCase):
    """Integration test that compares the logits of the `clip-italian/clip-italian` checkpoint to fixed values."""

    @slow
    def test_inference(self):
        """Run one image against two Italian captions and check logit shapes and values."""
        model = TFVisionTextDualEncoderModel.from_pretrained(
            "clip-italian/clip-italian", logit_scale_init_value=1, from_pt=True
        )
        processor = VisionTextDualEncoderProcessor.from_pretrained("clip-italian/clip-italian")

        # The path is relative, so the test has to be started from the repository root.
        # The context manager closes the image file once the processor has consumed the image.
        with Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") as image:
            inputs = processor(
                text=["una foto di un gatto", "una foto di un cane"], images=image, padding=True, return_tensors="np"
            )

        outputs = model(**inputs)

        # verify the logits: one row per image and one column per text, and the transpose for the text view
        num_images = inputs.pixel_values.shape[0]
        num_texts = inputs.input_ids.shape[0]
        self.assertEqual(outputs.logits_per_image.shape, (num_images, num_texts))
        self.assertEqual(
            outputs.logits_per_text.shape,
            (num_texts, num_images),
        )

        expected_logits = np.array([[1.2284727, 0.3104122]])

        self.assertTrue(np.allclose(outputs.logits_per_image.numpy(), expected_logits, atol=1e-3))
|
||||||
@@ -173,6 +173,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [
|
|||||||
"models/xlm_prophetnet/test_modeling_xlm_prophetnet.py",
|
"models/xlm_prophetnet/test_modeling_xlm_prophetnet.py",
|
||||||
"models/xlm_roberta/test_modeling_xlm_roberta.py",
|
"models/xlm_roberta/test_modeling_xlm_roberta.py",
|
||||||
"models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py",
|
"models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py",
|
||||||
|
"models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py",
|
||||||
"models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py",
|
"models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py",
|
||||||
"models/decision_transformer/test_modeling_decision_transformer.py",
|
"models/decision_transformer/test_modeling_decision_transformer.py",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -198,6 +198,7 @@ src/transformers/models/vilt/modeling_vilt.py
|
|||||||
src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
|
src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
|
||||||
src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
|
src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
|
||||||
src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
|
src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
|
||||||
|
src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
|
||||||
src/transformers/models/vit/configuration_vit.py
|
src/transformers/models/vit/configuration_vit.py
|
||||||
src/transformers/models/vit/modeling_vit.py
|
src/transformers/models/vit/modeling_vit.py
|
||||||
src/transformers/models/vit/modeling_tf_vit.py
|
src/transformers/models/vit/modeling_tf_vit.py
|
||||||
|
|||||||
Reference in New Issue
Block a user