[CvT] Tensorflow implementation (#18597)
* implemented TFCvtModel and TFCvtForImageClassification and modified relevant files, added an exception in convert_tf_weight_name_to_pt_weight_name, added quick testing file to compare with pytorch model * added docstring + testing file in transformers testing suite * added test in testing file, modified docs to pass repo-consistency, passed formatting test * refactoring + passing all test * small refactoring, removing unwanted comments * improved testing config * corrected import error * modified access to pretrained model archive list, to pass tf_test * corrected import structure in init files * modified testing for keras_fit with cpu * correcting PR issues + Refactoring * Refactoring : improving readability and reducing the number of permutations * corrected momentum value + cls_token initialization * removed from_pt as weights were added to the hub * Update tests/models/cvt/test_modeling_tf_cvt.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -225,7 +225,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CvT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| CvT | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
|
||||
@@ -51,3 +51,14 @@ This model was contributed by [anugunj](https://huggingface.co/anugunj). The ori
|
||||
|
||||
[[autodoc]] CvtForImageClassification
|
||||
- forward
|
||||
|
||||
## TFCvtModel
|
||||
|
||||
[[autodoc]] TFCvtModel
|
||||
- call
|
||||
|
||||
## TFCvtForImageClassification
|
||||
|
||||
[[autodoc]] TFCvtForImageClassification
|
||||
- call
|
||||
|
||||
|
||||
@@ -2358,6 +2358,14 @@ else:
|
||||
"TFCTRLPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.cvt"].extend(
|
||||
[
|
||||
"TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"TFCvtForImageClassification",
|
||||
"TFCvtModel",
|
||||
"TFCvtPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.data2vec"].extend(
|
||||
[
|
||||
"TFData2VecVisionForImageClassification",
|
||||
@@ -5024,6 +5032,12 @@ if TYPE_CHECKING:
|
||||
TFCTRLModel,
|
||||
TFCTRLPreTrainedModel,
|
||||
)
|
||||
from .models.cvt import (
|
||||
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
TFCvtForImageClassification,
|
||||
TFCvtModel,
|
||||
TFCvtPreTrainedModel,
|
||||
)
|
||||
from .models.data2vec import (
|
||||
TFData2VecVisionForImageClassification,
|
||||
TFData2VecVisionForSemanticSegmentation,
|
||||
|
||||
@@ -39,6 +39,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("convbert", "TFConvBertModel"),
|
||||
("convnext", "TFConvNextModel"),
|
||||
("ctrl", "TFCTRLModel"),
|
||||
("cvt", "TFCvtModel"),
|
||||
("data2vec-vision", "TFData2VecVisionModel"),
|
||||
("deberta", "TFDebertaModel"),
|
||||
("deberta-v2", "TFDebertaV2Model"),
|
||||
@@ -184,6 +185,7 @@ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Image-classification
|
||||
("convnext", "TFConvNextForImageClassification"),
|
||||
("cvt", "TFCvtForImageClassification"),
|
||||
("data2vec-vision", "TFData2VecVisionForImageClassification"),
|
||||
("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
|
||||
("mobilevit", "TFMobileViTForImageClassification"),
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
|
||||
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
|
||||
|
||||
|
||||
_import_structure = {"configuration_cvt": ["CVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CvtConfig"]}
|
||||
@@ -36,6 +36,18 @@ else:
|
||||
"CvtPreTrainedModel",
|
||||
]
|
||||
|
||||
try:
|
||||
if not is_tf_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_tf_cvt"] = [
|
||||
"TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"TFCvtForImageClassification",
|
||||
"TFCvtModel",
|
||||
"TFCvtPreTrainedModel",
|
||||
]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_cvt import CVT_PRETRAINED_CONFIG_ARCHIVE_MAP, CvtConfig
|
||||
@@ -53,6 +65,20 @@ if TYPE_CHECKING:
|
||||
CvtPreTrainedModel,
|
||||
)
|
||||
|
||||
try:
|
||||
if not is_tf_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
from .modeling_tf_cvt import (
|
||||
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
TFCvtForImageClassification,
|
||||
TFCvtModel,
|
||||
TFCvtPreTrainedModel,
|
||||
)
|
||||
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
948
src/transformers/models/cvt/modeling_tf_cvt.py
Normal file
948
src/transformers/models/cvt/modeling_tf_cvt.py
Normal file
@@ -0,0 +1,948 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" TF 2.0 Cvt model."""
|
||||
|
||||
|
||||
import collections.abc
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional, Tuple, Union
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
|
||||
from ...modeling_tf_utils import (
|
||||
TFModelInputType,
|
||||
TFPreTrainedModel,
|
||||
TFSequenceClassificationLoss,
|
||||
get_initializer,
|
||||
keras_serializable,
|
||||
unpack_inputs,
|
||||
)
|
||||
from ...tf_utils import shape_list, stable_softmax
|
||||
from ...utils import (
|
||||
ModelOutput,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
logging,
|
||||
replace_return_docstrings,
|
||||
)
|
||||
from .configuration_cvt import CvtConfig
|
||||
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# General docstring: name of the configuration class referenced by the documentation helpers.
_CONFIG_FOR_DOC = "CvtConfig"

# Hub identifiers of the pretrained CvT checkpoints.
# See all Cvt models at https://huggingface.co/models?filter=cvt
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/cvt-13",
    "microsoft/cvt-13-384",
    "microsoft/cvt-13-384-22k",
    "microsoft/cvt-21",
    "microsoft/cvt-21-384",
    "microsoft/cvt-21-384-22k",
]
|
||||
|
||||
|
||||
@dataclass
class TFBaseModelOutputWithCLSToken(ModelOutput):
    """
    Base class for the outputs of the CvT encoder, which also carries the classification token.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Feature map at the output of the last stage of the model (channels-first layout).
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last stage of the model. `None` when the last stage does not
            use a classification token.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of each stage) of shape `(batch_size, num_channels, height,
            width)`. Feature maps of the model at the output of each stage.
    """

    last_hidden_state: tf.Tensor = None
    cls_token_value: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
|
||||
|
||||
|
||||
class TFCvtDropPath(tf.keras.layers.Layer):
    """Stochastic Depth (drop path): drops whole samples of a residual branch while training.

    References:
        (1) github.com:rwightman/pytorch-image-models
    """

    def __init__(self, drop_prob: float, **kwargs):
        super().__init__(**kwargs)
        self.drop_prob = drop_prob

    def call(self, x: tf.Tensor, training=None):
        """Return `x` unchanged outside of training (or when `drop_prob` is 0), otherwise a randomly masked `x`."""
        if self.drop_prob != 0.0 and training:
            survival_rate = 1 - self.drop_prob
            # One Bernoulli draw per sample: shape is (batch, 1, ..., 1) so it broadcasts over the other axes.
            trailing_ones = (1,) * (len(tf.shape(x)) - 1)
            mask_shape = (tf.shape(x)[0],) + trailing_ones
            # floor(survival_rate + U[0, 1)) is 1 with probability `survival_rate` and 0 otherwise.
            keep_mask = tf.floor(survival_rate + tf.random.uniform(mask_shape, 0, 1))
            # Kept samples are rescaled by 1 / survival_rate.
            return (x / survival_rate) * keep_mask
        return x
|
||||
|
||||
|
||||
class TFCvtEmbeddings(tf.keras.layers.Layer):
    """Convolutional Token Embeddings: a `TFCvtConvEmbeddings` layer followed by dropout."""

    def __init__(
        self,
        config: CvtConfig,
        patch_size: int,
        embed_dim: int,
        stride: int,
        padding: int,
        dropout_rate: float,
        **kwargs
    ):
        super().__init__(**kwargs)
        conv_embedding_kwargs = {
            "patch_size": patch_size,
            "embed_dim": embed_dim,
            "stride": stride,
            "padding": padding,
            "name": "convolution_embeddings",
        }
        self.convolution_embeddings = TFCvtConvEmbeddings(config, **conv_embedding_kwargs)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Embed `pixel_values` and apply dropout (active only when `training` is true)."""
        embedded = self.convolution_embeddings(pixel_values)
        return self.dropout(embedded, training=training)
|
||||
|
||||
|
||||
class TFCvtConvEmbeddings(tf.keras.layers.Layer):
    """Image to Convolution Embeddings: zero-padding, a strided convolution, then layer normalization."""

    def __init__(self, config: CvtConfig, patch_size: int, embed_dim: int, stride: int, padding: int, **kwargs):
        super().__init__(**kwargs)
        self.padding = tf.keras.layers.ZeroPadding2D(padding=padding)
        if isinstance(patch_size, collections.abc.Iterable):
            self.patch_size = patch_size
        else:
            self.patch_size = (patch_size, patch_size)
        # Padding is applied explicitly by `self.padding`, hence the "valid" convolution.
        self.projection = tf.keras.layers.Conv2D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=stride,
            padding="valid",
            data_format="channels_last",
            kernel_initializer=get_initializer(config.initializer_range),
            name="projection",
        )
        # Same default epsilon as PyTorch
        self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")

    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        """Project channels-last `pixel_values` (or a dict holding them) and normalize the result."""
        if isinstance(pixel_values, dict):
            pixel_values = pixel_values["pixel_values"]

        feature_map = self.projection(self.padding(pixel_values))
        batch_size, height, width, num_channels = shape_list(feature_map)

        # Flatten the spatial grid so normalization runs on (batch_size, height*width, num_channels) ...
        tokens = tf.reshape(feature_map, shape=(batch_size, height * width, num_channels))
        tokens = self.normalization(tokens)

        # ... then restore the (batch_size, height, width, num_channels) layout.
        return tf.reshape(tokens, shape=(batch_size, height, width, num_channels))
|
||||
|
||||
|
||||
class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
    """Convolutional projection: zero-padding, a bias-free grouped convolution and batch normalization."""

    def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
        super().__init__(**kwargs)
        self.padding = tf.keras.layers.ZeroPadding2D(padding=padding)
        # `groups=embed_dim` with `filters=embed_dim` gives one filter per input channel.
        conv_kwargs = {
            "filters": embed_dim,
            "kernel_size": kernel_size,
            "kernel_initializer": get_initializer(config.initializer_range),
            "padding": "valid",
            "strides": stride,
            "use_bias": False,
            "name": "convolution",
            "groups": embed_dim,
        }
        self.convolution = tf.keras.layers.Conv2D(**conv_kwargs)
        # Same default epsilon as PyTorch; the TF momentum is (1 - pytorch momentum)
        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply padding + convolution, then batch normalization in the requested `training` mode."""
        projected = self.convolution(self.padding(hidden_state))
        return self.normalization(projected, training=training)
|
||||
|
||||
|
||||
class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
    """Flattens the spatial grid of a feature map into a 1D sequence of tokens (no trainable weights)."""

    def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
        """Reshape `(batch_size, height, width, num_channels)` to `(batch_size, height*width, num_channels)`."""
        batch_size, height, width, num_channels = shape_list(hidden_state)
        return tf.reshape(hidden_state, shape=(batch_size, height * width, num_channels))
|
||||
|
||||
|
||||
class TFCvtSelfAttentionProjection(tf.keras.layers.Layer):
    """
    Convolutional Projection for Attention.

    With `projection_method="dw_bn"` the input goes through a `TFCvtSelfAttentionConvProjection` before being
    flattened into tokens. With any other method no convolution is created and the input is only flattened.

    Args:
        config ([`CvtConfig`]): Model configuration class.
        embed_dim (`int`): Number of channels of the projected feature map.
        kernel_size (`int`): Kernel size of the convolutional projection.
        stride (`int`): Stride of the convolutional projection.
        padding (`int`): Zero-padding applied before the convolutional projection.
        projection_method (`str`, *optional*, defaults to `"dw_bn"`): Projection method.
    """

    def __init__(
        self,
        config: CvtConfig,
        embed_dim: int,
        kernel_size: int,
        stride: int,
        padding: int,
        projection_method: str = "dw_bn",
        **kwargs
    ):
        super().__init__(**kwargs)
        # Always define the attribute so `call` never hits an `AttributeError` for non-"dw_bn" methods.
        self.convolution_projection = None
        if projection_method == "dw_bn":
            self.convolution_projection = TFCvtSelfAttentionConvProjection(
                config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
            )
        self.linear_projection = TFCvtSelfAttentionLinearProjection()

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project a `(batch_size, height, width, num_channels)` map into `(batch_size, tokens, num_channels)`."""
        if self.convolution_projection is not None:
            hidden_state = self.convolution_projection(hidden_state, training=training)
        hidden_state = self.linear_projection(hidden_state)
        return hidden_state
|
||||
|
||||
|
||||
class TFCvtSelfAttention(tf.keras.layers.Layer):
    """
    Multi-head self-attention whose query, key and value embeddings each go through their own Convolutional
    Projection (`TFCvtSelfAttentionProjection`) before the usual dense projections.
    """

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        with_cls_token: bool = True,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.scale = embed_dim**-0.5
        self.with_cls_token = with_cls_token
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # The query falls back to the "linear" method when "avg" is requested; key and value use it as given.
        query_projection_method = "linear" if qkv_projection_method == "avg" else qkv_projection_method
        self.convolution_projection_query = TFCvtSelfAttentionProjection(
            config,
            embed_dim,
            kernel_size,
            stride_q,
            padding_q,
            projection_method=query_projection_method,
            name="convolution_projection_query",
        )
        self.convolution_projection_key = TFCvtSelfAttentionProjection(
            config,
            embed_dim,
            kernel_size,
            stride_kv,
            padding_kv,
            projection_method=qkv_projection_method,
            name="convolution_projection_key",
        )
        self.convolution_projection_value = TFCvtSelfAttentionProjection(
            config,
            embed_dim,
            kernel_size,
            stride_kv,
            padding_kv,
            projection_method=qkv_projection_method,
            name="convolution_projection_value",
        )

        # Settings shared by the three dense projections; each one still gets its own initializer instance.
        dense_kwargs = {"units": embed_dim, "use_bias": qkv_bias, "bias_initializer": "zeros"}
        self.projection_query = tf.keras.layers.Dense(
            kernel_initializer=get_initializer(config.initializer_range), name="projection_query", **dense_kwargs
        )
        self.projection_key = tf.keras.layers.Dense(
            kernel_initializer=get_initializer(config.initializer_range), name="projection_key", **dense_kwargs
        )
        self.projection_value = tf.keras.layers.Dense(
            kernel_initializer=get_initializer(config.initializer_range), name="projection_value", **dense_kwargs
        )
        self.dropout = tf.keras.layers.Dropout(attention_drop_rate)

    def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
        """Split the channel axis into heads: `(batch, tokens, channels)` -> `(batch, heads, tokens, head_dim)`."""
        batch_size, num_tokens, _ = shape_list(hidden_state)
        per_head_dim = self.embed_dim // self.num_heads
        split_heads = tf.reshape(hidden_state, shape=(batch_size, num_tokens, self.num_heads, per_head_dim))
        return tf.transpose(split_heads, perm=(0, 2, 1, 3))

    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
        """Attend over the tokens of `hidden_state`, whose spatial part is a `height` x `width` grid."""
        if self.with_cls_token:
            # The classification token is the first token; it bypasses the convolutional projections.
            cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)

        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
        batch_size, _, num_channels = shape_list(hidden_state)
        feature_map = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))

        key = self.convolution_projection_key(feature_map, training=training)
        query = self.convolution_projection_query(feature_map, training=training)
        value = self.convolution_projection_value(feature_map, training=training)

        if self.with_cls_token:
            query = tf.concat((cls_token, query), axis=1)
            key = tf.concat((cls_token, key), axis=1)
            value = tf.concat((cls_token, value), axis=1)

        query = self.rearrange_for_multi_head_attention(self.projection_query(query))
        key = self.rearrange_for_multi_head_attention(self.projection_key(key))
        value = self.rearrange_for_multi_head_attention(self.projection_value(value))

        scores = tf.matmul(query, key, transpose_b=True) * self.scale
        probabilities = stable_softmax(logits=scores, axis=-1)
        probabilities = self.dropout(probabilities, training=training)

        context = tf.matmul(probabilities, value)
        # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)"
        _, _, num_query_tokens, _ = shape_list(context)
        context = tf.transpose(context, perm=(0, 2, 1, 3))
        head_dim = self.embed_dim // self.num_heads
        return tf.reshape(context, (batch_size, num_query_tokens, self.num_heads * head_dim))
|
||||
|
||||
|
||||
class TFCvtSelfOutput(tf.keras.layers.Layer):
    """Output of the Attention layer: a dense projection followed by dropout."""

    def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
        super().__init__(**kwargs)
        kernel_initializer = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(units=embed_dim, kernel_initializer=kernel_initializer, name="dense")
        self.dropout = tf.keras.layers.Dropout(drop_rate)

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `hidden_state` and apply dropout (active only when `training` is true)."""
        projected = self.dense(inputs=hidden_state)
        return self.dropout(inputs=projected, training=training)
|
||||
|
||||
|
||||
class TFCvtAttention(tf.keras.layers.Layer):
    """First chunk of the convolutional transformer block: self-attention followed by its output projection."""

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        drop_rate: float,
        with_cls_token: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.attention = TFCvtSelfAttention(
            config,
            num_heads=num_heads,
            embed_dim=embed_dim,
            kernel_size=kernel_size,
            stride_q=stride_q,
            stride_kv=stride_kv,
            padding_q=padding_q,
            padding_kv=padding_kv,
            qkv_projection_method=qkv_projection_method,
            qkv_bias=qkv_bias,
            attention_drop_rate=attention_drop_rate,
            with_cls_token=with_cls_token,
            name="attention",
        )
        self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
        """Run self-attention over `hidden_state`, then the output projection."""
        context = self.attention(hidden_state, height, width, training=training)
        return self.dense_output(context, training=training)
|
||||
|
||||
|
||||
class TFCvtIntermediate(tf.keras.layers.Layer):
    """Second chunk of the convolutional transformer block: a GELU-activated dense expansion."""

    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
        super().__init__(**kwargs)
        # The hidden width is `embed_dim` scaled by `mlp_ratio`, truncated to an integer.
        expanded_units = int(embed_dim * mlp_ratio)
        self.dense = tf.keras.layers.Dense(
            units=expanded_units,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="gelu",
            name="dense",
        )

    def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
        """Apply the dense expansion to `hidden_state`."""
        return self.dense(hidden_state)
|
||||
|
||||
|
||||
class TFCvtOutput(tf.keras.layers.Layer):
    """
    Last chunk of the Convolutional Transformer Block: a dense projection with dropout, added to a residual input.
    """

    def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs):
        super().__init__(**kwargs)
        kernel_initializer = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(units=embed_dim, kernel_initializer=kernel_initializer, name="dense")
        self.dropout = tf.keras.layers.Dropout(drop_rate)

    def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `hidden_state`, apply dropout, and add the residual `input_tensor`."""
        projected = self.dense(inputs=hidden_state)
        projected = self.dropout(inputs=projected, training=training)
        return projected + input_tensor
|
||||
|
||||
|
||||
class TFCvtLayer(tf.keras.layers.Layer):
    """
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
    consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
    `Block` class in the original implementation.

    The constructor arguments are forwarded to `TFCvtAttention`, `TFCvtIntermediate` (`mlp_ratio`) and `TFCvtOutput`
    (`drop_rate`); `drop_path_rate` is the stochastic depth probability used by this block.
    """

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        drop_rate: float,
        mlp_ratio: float,
        drop_path_rate: float,
        with_cls_token: bool = True,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.attention = TFCvtAttention(
            config,
            num_heads,
            embed_dim,
            kernel_size,
            stride_q,
            stride_kv,
            padding_q,
            padding_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            drop_rate,
            with_cls_token,
            name="attention",
        )
        self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
        self.dense_output = TFCvtOutput(config, embed_dim, drop_rate, name="output")
        # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
        self.drop_path = (
            TFCvtDropPath(drop_path_rate, name="drop_path")
            if drop_path_rate > 0.0
            else tf.keras.layers.Activation("linear", name="drop_path")
        )
        # Using the same default epsilon as PyTorch
        self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
        self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")

    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
        """
        Apply the block to a token sequence whose spatial part is a `height` x `width` grid.

        Args:
            hidden_state (`tf.Tensor`): Input tokens.
            height (`int`): Height of the token grid.
            width (`int`): Width of the token grid.
            training (`bool`, *optional*, defaults to `False`): Whether dropout / drop path are active.

        Returns:
            `tf.Tensor`: The transformed tokens.
        """
        # in Cvt, layernorm is applied before self-attention
        attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
        attention_output = self.drop_path(attention_output, training=training)

        # first residual connection
        hidden_state = attention_output + hidden_state

        # in Cvt, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_state)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here; `training` is passed explicitly, like for every other
        # sub-layer, so the dropout of the output layer does not rely on implicit propagation
        layer_output = self.dense_output(layer_output, hidden_state, training=training)
        layer_output = self.drop_path(layer_output, training=training)
        return layer_output
|
||||
|
||||
|
||||
class TFCvtStage(tf.keras.layers.Layer):
    """
    Cvt stage (encoder block). Each stage has 2 parts :
    - (1) A Convolutional Token Embedding layer
    - (2) A stack of `config.depth[stage]` Convolutional Transformer Blocks (layers).
    A classification token is prepended to the tokens only when `config.cls_token[stage]` is set.

    Args:
        config ([`CvtConfig`]): Model configuration class.
        stage (`int`): Stage number.
    """

    def __init__(self, config: CvtConfig, stage: int, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.stage = stage
        if self.config.cls_token[self.stage]:
            self.cls_token = self.add_weight(
                shape=(1, 1, self.config.embed_dim[-1]),
                initializer=get_initializer(self.config.initializer_range),
                trainable=True,
                name="cvt.encoder.stages.2.cls_token",
            )

        self.embedding = TFCvtEmbeddings(
            self.config,
            patch_size=config.patch_sizes[self.stage],
            stride=config.patch_stride[self.stage],
            embed_dim=config.embed_dim[self.stage],
            padding=config.patch_padding[self.stage],
            dropout_rate=config.drop_rate[self.stage],
            name="embedding",
        )

        # Stochastic depth decay rule: one rate per layer of this stage, growing linearly from 0 to the stage's
        # `drop_path_rate`. The schedule has `depth[stage]` entries, so it must be indexed by the layer index `j`
        # (indexing it by the stage number gave every layer the same rate and could run out of bounds).
        drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[self.stage])
        drop_path_rates = [x.numpy().item() for x in drop_path_rates]
        self.layers = [
            TFCvtLayer(
                config,
                num_heads=config.num_heads[self.stage],
                embed_dim=config.embed_dim[self.stage],
                kernel_size=config.kernel_qkv[self.stage],
                stride_q=config.stride_q[self.stage],
                stride_kv=config.stride_kv[self.stage],
                padding_q=config.padding_q[self.stage],
                padding_kv=config.padding_kv[self.stage],
                qkv_projection_method=config.qkv_projection_method[self.stage],
                qkv_bias=config.qkv_bias[self.stage],
                attention_drop_rate=config.attention_drop_rate[self.stage],
                drop_rate=config.drop_rate[self.stage],
                mlp_ratio=config.mlp_ratio[self.stage],
                drop_path_rate=drop_path_rates[j],
                with_cls_token=config.cls_token[self.stage],
                name=f"layers.{j}",
            )
            for j in range(config.depth[self.stage])
        ]

    def call(self, hidden_state: tf.Tensor, training: bool = False):
        """
        Embed the input feature map and run it through the layers of the stage.

        Args:
            hidden_state (`tf.Tensor`): Channels-last input, `(batch_size, height, width, num_channels)`.
            training (`bool`, *optional*, defaults to `False`): Whether dropout / drop path are active.

        Returns:
            `tuple`: `(hidden_state, cls_token)` where `hidden_state` is the channels-last output feature map and
            `cls_token` is the classification token, or `None` when this stage does not use one.
        """
        cls_token = None
        hidden_state = self.embedding(hidden_state, training)

        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
        batch_size, height, width, num_channels = shape_list(hidden_state)
        hidden_size = height * width
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))

        if self.config.cls_token[self.stage]:
            # One copy of the learned token per sample, placed in front of the spatial tokens.
            cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
            hidden_state = tf.concat((cls_token, hidden_state), axis=1)

        for layer in self.layers:
            hidden_state = layer(hidden_state, height, width, training=training)

        if self.config.cls_token[self.stage]:
            cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)

        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
        return hidden_state, cls_token
|
||||
|
||||
|
||||
class TFCvtEncoder(tf.keras.layers.Layer):
    """
    Convolutional Vision Transformer encoder: a sequence of `TFCvtStage` blocks, one per entry of `config.depth`.

    Args:
        config ([`CvtConfig`]): Model configuration class.
    """

    config_class = CvtConfig

    def __init__(self, config: CvtConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        num_stages = len(config.depth)
        self.stages = [TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(num_stages)]

    def call(
        self,
        pixel_values: TFModelInputType,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
        """Run every stage on channels-first `pixel_values` and return channels-first feature maps."""
        # When running on CPU, `tf.keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
        # as input format. So change the input format to (batch_size, height, width, num_channels).
        hidden_state = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        stage_outputs = () if output_hidden_states else None
        cls_token = None
        for stage_module in self.stages:
            hidden_state, cls_token = stage_module(hidden_state, training=training)
            if output_hidden_states:
                stage_outputs = stage_outputs + (hidden_state,)

        # Change back to (batch_size, num_channels, height, width) format to have uniformity in the modules
        hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
        if output_hidden_states:
            stage_outputs = tuple(tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in stage_outputs)

        if return_dict:
            return TFBaseModelOutputWithCLSToken(
                last_hidden_state=hidden_state,
                cls_token_value=cls_token,
                hidden_states=stage_outputs,
            )

        # Tuple output: entries that are `None` are left out.
        return tuple(v for v in [hidden_state, cls_token, stage_outputs] if v is not None)
|
||||
|
||||
|
||||
@keras_serializable
class TFCvtMainLayer(tf.keras.layers.Layer):
    """Core of the Cvt model: wraps the `TFCvtEncoder`."""

    config_class = CvtConfig

    def __init__(self, config: CvtConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.encoder = TFCvtEncoder(config, name="encoder")

    @unpack_inputs
    def call(
        self,
        pixel_values: Optional[TFModelInputType] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
        """
        Encode `pixel_values` and return the encoder outputs.

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        encoder_outputs = self.encoder(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = encoder_outputs[0]

        if return_dict:
            return TFBaseModelOutputWithCLSToken(
                last_hidden_state=sequence_output,
                cls_token_value=encoder_outputs.cls_token_value,
                hidden_states=encoder_outputs.hidden_states,
            )

        return (sequence_output,) + encoder_outputs[1:]
|
||||
|
||||
|
||||
class TFCvtPreTrainedModel(TFPreTrainedModel):
    """
    Common base class of the TF Cvt models. It declares the configuration class, the base-model prefix and the main
    input name, and provides the dummy inputs and the serving entry point.
    """

    config_class = CvtConfig
    base_model_prefix = "cvt"
    main_input_name = "pixel_values"

    @property
    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
        """
        Random image batch used to build the network.

        Returns:
            `Dict[str, tf.Tensor]`: A single `pixel_values` entry of shape `(3, config.num_channels, 224, 224)`.
        """
        image_batch_shape = (3, self.config.num_channels, 224, 224)
        random_pixels = tf.random.uniform(shape=image_batch_shape, dtype=tf.float32)
        return {"pixel_values": tf.constant(random_pixels)}

    @tf.function(
        input_signature=[
            {
                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
            }
        ]
    )
    def serving(self, inputs):
        """
        Entry point used when serving the model.

        Args:
            inputs (`Dict[str, tf.Tensor]`):
                The input of the saved model as a dictionary of tensors.
        """
        return self.serving_output(self.call(inputs))
|
||||
|
||||
|
||||
TFCVT_START_DOCSTRING = r"""
|
||||
|
||||
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
|
||||
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
|
||||
behavior.
|
||||
|
||||
<Tip>
|
||||
|
||||
TF 2.0 models accept two formats as inputs:
|
||||
|
||||
- having all inputs as keyword arguments (like PyTorch models), or
|
||||
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||
|
||||
This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
|
||||
tensors in the first argument of the model call function: `model(inputs)`.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
TFCVT_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
|
||||
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
|
||||
[`AutoFeatureExtractor.__call__`] for details.
|
||||
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
|
||||
used instead.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
|
||||
eager mode, in graph mode the value will always be set to True.
|
||||
training (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to use the model in training mode (some modules like dropout modules have different
|
||||
behaviors between training and evaluation).
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
    "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
    TFCVT_START_DOCSTRING,
)
class TFCvtModel(TFCvtPreTrainedModel):
    def __init__(self, config: CvtConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # The whole model is the main layer; this class only adapts its outputs.
        self.cvt = TFCvtMainLayer(config, name="cvt")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, TFCvtModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")

        >>> inputs = feature_extractor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        main_layer_outputs = self.cvt(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if return_dict:
            return TFBaseModelOutputWithCLSToken(
                last_hidden_state=main_layer_outputs.last_hidden_state,
                cls_token_value=main_layer_outputs.cls_token_value,
                hidden_states=main_layer_outputs.hidden_states,
            )

        # Tuple mode: re-emit the main layer's outputs in their original order.
        return (main_layer_outputs[0],) + main_layer_outputs[1:]

    def serving_output(self, output: TFBaseModelOutputWithCLSToken) -> TFBaseModelOutputWithCLSToken:
        """Copy the three fields of `output` into a fresh `TFBaseModelOutputWithCLSToken` for serving."""
        return TFBaseModelOutputWithCLSToken(
            last_hidden_state=output.last_hidden_state,
            cls_token_value=output.cls_token_value,
            hidden_states=output.hidden_states,
        )
|
||||
|
||||
|
||||
@add_start_docstrings(
    """
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    TFCVT_START_DOCSTRING,
)
class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: CvtConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        self.cvt = TFCvtMainLayer(config, name="cvt")
        # Using same default epsilon as in the original implementation.
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")

        # Classifier head
        self.classifier = tf.keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=True,
            bias_initializer="zeros",
            name="classifier",
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        labels: Optional[tf.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, TFCvtForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

        >>> inputs = feature_extractor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        ```"""

        outputs = self.cvt(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = outputs[0]
        if self.config.cls_token[-1]:
            # Read the cls token only on the path that uses it. The encoder leaves `None` values out of its
            # outputs, so position 1 is not guaranteed to exist (or to be the cls token) on the other path.
            sequence_output = self.layernorm(outputs[1])
        else:
            # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels"
            batch_size, num_channels, height, width = shape_list(sequence_output)
            sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
            sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
            sequence_output = self.layernorm(sequence_output)

        # Average over axis 1 before the linear classifier.
        sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
        logits = self.classifier(sequence_output_mean)
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # NOTE(review): this slice assumes the tuple holds a cls token at index 1; presumably the hidden
            # states move to index 1 when no cls token is produced - confirm against the encoder before relying
            # on tuple outputs for such configs.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)

    def serving_output(self, output: TFImageClassifierOutputWithNoAttention) -> TFImageClassifierOutputWithNoAttention:
        """Build the serving output from `output`, keeping only its logits and hidden states."""
        return TFImageClassifierOutputWithNoAttention(logits=output.logits, hidden_states=output.hidden_states)
|
||||
@@ -786,6 +786,30 @@ class TFCTRLPreTrainedModel(metaclass=DummyObject):
|
||||
requires_backends(self, ["tf"])
|
||||
|
||||
|
||||
# Stand-in for the TF CvT checkpoint list; it is None here, next to the placeholder classes that require "tf".
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class TFCvtForImageClassification(metaclass=DummyObject):
    # Placeholder class: constructing it only runs the "tf" backend requirement check.
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])
|
||||
|
||||
|
||||
class TFCvtModel(metaclass=DummyObject):
    # Placeholder class: constructing it only runs the "tf" backend requirement check.
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])
|
||||
|
||||
|
||||
class TFCvtPreTrainedModel(metaclass=DummyObject):
    # Placeholder class: constructing it only runs the "tf" backend requirement check.
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])
|
||||
|
||||
|
||||
class TFData2VecVisionForImageClassification(metaclass=DummyObject):
|
||||
_backends = ["tf"]
|
||||
|
||||
|
||||
271
tests/models/cvt/test_modeling_tf_cvt.py
Normal file
271
tests/models/cvt/test_modeling_tf_cvt.py
Normal file
@@ -0,0 +1,271 @@
|
||||
""" Testing suite for the Tensorflow CvT model. """
|
||||
|
||||
|
||||
import inspect
|
||||
import unittest
|
||||
from math import floor
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import CvtConfig
|
||||
from transformers.testing_utils import require_tf, require_vision, slow
|
||||
from transformers.utils import cached_property, is_tf_available, is_vision_available
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
from transformers import TFCvtForImageClassification, TFCvtModel
|
||||
from transformers.models.cvt.modeling_tf_cvt import TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoFeatureExtractor
|
||||
|
||||
|
||||
class TFCvtConfigTester(ConfigTester):
    """Config tester that additionally checks the CvT-specific attributes of the config."""

    def create_and_test_config_common_properties(self):
        """Build a config from `self.inputs_dict` and assert that it has `embed_dim` and `num_heads`."""
        config = self.config_class(**self.inputs_dict)
        for attribute_name in ("embed_dim", "num_heads"):
            self.parent.assertTrue(hasattr(config, attribute_name))
|
||||
|
||||
|
||||
class TFCvtModelTester:
    """Builds a small `CvtConfig` plus random inputs, and runs output-shape checks on the TF CvT models."""

    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=64,
        num_channels=3,
        embed_dim=[16, 48, 96],
        num_heads=[1, 3, 6],
        depth=[1, 2, 10],
        patch_sizes=[7, 3, 3],
        patch_stride=[4, 2, 2],
        patch_padding=[2, 1, 1],
        stride_kv=[2, 2, 2],
        cls_token=[False, False, True],
        attention_drop_rate=[0.0, 0.0, 0.0],
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_training=True,
        use_labels=True,
        num_labels=2,
    ):
        """
        Store the test hyper-parameters.

        Args:
            parent: The test case whose assertion methods are used by the `create_and_check_*` helpers.

        The remaining arguments are stored under attributes of the same name; the list-valued ones are copied.
        """
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.is_training = is_training
        self.use_labels = use_labels
        self.num_labels = num_labels
        self.num_channels = num_channels
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        # The per-stage defaults are list objects created once, at function definition time. Copy them so that
        # mutating one tester's (or its config's) list cannot leak into the defaults of later testers.
        self.patch_sizes = list(patch_sizes)
        self.patch_stride = list(patch_stride)
        self.patch_padding = list(patch_padding)
        self.embed_dim = list(embed_dim)
        self.num_heads = list(num_heads)
        self.stride_kv = list(stride_kv)
        self.depth = list(depth)
        self.cls_token = list(cls_token)
        self.attention_drop_rate = list(attention_drop_rate)

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, labels)`; `labels` is `None` when `use_labels` is false."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            # create a random int32 tensor of given shape
            labels = ids_tensor([self.batch_size], self.num_labels)

        config = self.get_config()
        return config, pixel_values, labels

    def get_config(self):
        """Build a `CvtConfig` from the stored hyper-parameters."""
        return CvtConfig(
            image_size=self.image_size,
            num_labels=self.num_labels,
            num_channels=self.num_channels,
            embed_dim=self.embed_dim,
            num_heads=self.num_heads,
            patch_sizes=self.patch_sizes,
            patch_padding=self.patch_padding,
            patch_stride=self.patch_stride,
            stride_kv=self.stride_kv,
            depth=self.depth,
            cls_token=self.cls_token,
            attention_drop_rate=self.attention_drop_rate,
            initializer_range=self.initializer_range,
        )

    def create_and_check_model(self, config, pixel_values, labels):
        """Run `TFCvtModel` and compare the last hidden state's shape with the expected per-stage downsampling."""
        model = TFCvtModel(config=config)
        result = model(pixel_values, training=False)

        # Apply each stage's (padding, patch size, stride) in turn to a square input of side `image_size`.
        height = width = self.image_size
        for i in range(len(self.depth)):
            height = floor(((height + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1)
            width = floor(((width + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dim[-1], height, width))

    def create_and_check_for_image_classification(self, config, pixel_values, labels):
        """Run `TFCvtForImageClassification` with labels and check the logits shape."""
        config.num_labels = self.num_labels
        model = TFCvtForImageClassification(config)
        result = model(pixel_values, labels=labels, training=False)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` where `inputs_dict` holds only `pixel_values`."""
        config, pixel_values, _ = self.prepare_config_and_inputs()
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_tf
class TFCvtModelTest(TFModelTesterMixin, unittest.TestCase):
    """
    Common model tests for the TF CvT classes. Some of the shared tests are overridden or skipped here, as Cvt does
    not use input_ids, inputs_embeds, attention_mask and seq_length.
    """

    all_model_classes = (TFCvtModel, TFCvtForImageClassification) if is_tf_available() else ()
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    has_attentions = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFCvtModelTester(self)
        self.config_tester = TFCvtConfigTester(self, config_class=CvtConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        tester = self.config_tester
        tester.create_and_test_config_common_properties()
        tester.create_and_test_config_to_json_string()
        tester.create_and_test_config_to_json_file()
        tester.create_and_test_config_from_and_save_pretrained()
        tester.create_and_test_config_with_num_labels()
        tester.check_config_can_be_init_without_params()
        tester.check_config_arguments_init()

    @unittest.skip(reason="Cvt does not output attentions")
    def test_attention_outputs(self):
        pass

    @unittest.skip(reason="Cvt does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="Cvt does not support input and output embeddings")
    def test_model_common_attributes(self):
        pass

    @unittest.skipIf(
        not is_tf_available() or not tf.config.list_physical_devices("GPU"),
        reason="TF does not support backprop for grouped convolutions on CPU.",
    )
    def test_dataset_conversion(self):
        super().test_dataset_conversion()

    @unittest.skipIf(
        not is_tf_available() or not tf.config.list_physical_devices("GPU"),
        reason="TF does not support backprop for grouped convolutions on CPU.",
    )
    def test_keras_fit(self):
        super().test_keras_fit()

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            # Parameters keep their declaration order, so the first name is deterministic.
            parameter_names = list(inspect.signature(model.call).parameters)

            self.assertListEqual(parameter_names[:1], ["pixel_values"])

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            hidden_states = model(**self._prepare_for_class(inputs_dict, model_class)).hidden_states

            # One hidden state is expected per entry of the tester's `depth` list.
            self.assertEqual(len(hidden_states), len(self.model_tester.depth))

            # verify the first hidden states (first block)
            first_block_side = self.model_tester.image_size // 4
            self.assertListEqual(
                list(hidden_states[0].shape[-3:]),
                [self.model_tester.embed_dim[0], first_block_side, first_block_side],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            # First request the hidden states through the call arguments ...
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # ... then check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_image_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFCvtModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
|
||||
|
||||
|
||||
# We will verify our results on an image of cute cats
def prepare_img():
    """Open the COCO sample image from the test fixtures and return it."""
    return Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
|
||||
|
||||
@require_tf
@require_vision
class TFCvtModelIntegrationTest(unittest.TestCase):
    """Slow integration test that runs the first listed pretrained checkpoint on the fixture image."""

    @cached_property
    def default_feature_extractor(self):
        checkpoint = TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0]
        return AutoFeatureExtractor.from_pretrained(checkpoint)

    @slow
    def test_inference_image_classification_head(self):
        model = TFCvtForImageClassification.from_pretrained(TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])

        inputs = self.default_feature_extractor(images=prepare_img(), return_tensors="tf")

        # forward pass
        logits = model(**inputs).logits

        # verify the logits: first their shape, then the first three values against the reference
        self.assertEqual(logits.shape, tf.TensorShape((1, 1000)))

        expected_slice = tf.constant([0.9285, 0.9015, -0.3150])
        self.assertTrue(np.allclose(logits[0, :3].numpy(), expected_slice, atol=1e-4))
|
||||
Reference in New Issue
Block a user