diff --git a/docs/source/ja/troubleshooting.md b/docs/source/ja/troubleshooting.md index 433905354a..ece688d46a 100644 --- a/docs/source/ja/troubleshooting.md +++ b/docs/source/ja/troubleshooting.md @@ -69,7 +69,6 @@ TensorFlowの[model.save](https://www.tensorflow.org/tutorials/keras/save_and_lo ```py >>> from transformers import TFPreTrainedModel ->>> from tensorflow import keras >>> model.save_weights("some_folder/tf_model.h5") >>> model = TFPreTrainedModel.from_pretrained("some_folder") diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 11f35ceacc..9f4405968e 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -47,6 +47,7 @@ from transformers import ( set_seed, ) from transformers.keras_callbacks import KerasMetricCallback +from transformers.modeling_tf_utils import keras from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version @@ -363,7 +364,7 @@ def main(): def _train_transforms(image): img_size = image_size - image = tf.keras.utils.img_to_array(image) + image = keras.utils.img_to_array(image) image = random_resized_crop(image, size=img_size) image = tf.image.random_flip_left_right(image) image /= 255.0 @@ -372,7 +373,7 @@ def main(): return image def _val_transforms(image): - image = tf.keras.utils.img_to_array(image) + image = keras.utils.img_to_array(image) image = tf.image.resize(image, size=image_size) # image = np.array(image) # FIXME - use tf.image function image = center_crop(image, size=image_size) diff --git a/examples/tensorflow/language-modeling-tpu/run_mlm.py b/examples/tensorflow/language-modeling-tpu/run_mlm.py index e9e9862a6d..544bca716a 100644 --- a/examples/tensorflow/language-modeling-tpu/run_mlm.py +++ b/examples/tensorflow/language-modeling-tpu/run_mlm.py @@ -22,6 +22,7 @@ import os import re import tensorflow as tf +from packaging.version import parse from transformers import ( AutoConfig, @@ -33,6 +34,19 @@ from transformers import ( ) +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + logger = logging.getLogger(__name__) AUTO = tf.data.AUTOTUNE @@ -209,7 +223,7 @@ def main(args): strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") if args.bfloat16: - tf.keras.mixed_precision.set_global_policy("mixed_bfloat16") + keras.mixed_precision.set_global_policy("mixed_bfloat16") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) config = AutoConfig.from_pretrained(args.pretrained_model_config) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 19e00c3dc4..42a30c8002 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -30,6 +30,7 @@ from typing import Optional import evaluate import tensorflow as tf from datasets import load_dataset +from packaging.version import parse from utils_qa import postprocess_qa_predictions import transformers @@ -48,6 +49,19 @@ from transformers import ( from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.38.0.dev0") @@ -233,7 +247,7 @@ class DataTrainingArguments: # region Helper classes -class SavePretrainedCallback(tf.keras.callbacks.Callback): +class SavePretrainedCallback(keras.callbacks.Callback): # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback # that saves the model with this method after each epoch. diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py index 956209baad..b07d5f7df8 100644 --- a/examples/tensorflow/test_tensorflow_examples.py +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -23,6 +23,20 @@ from unittest import skip from unittest.mock import patch import tensorflow as tf +from packaging.version import parse + + +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) from transformers.testing_utils import TestCasePlus, get_gpu_count, slow @@ -115,7 +129,7 @@ class ExamplesTests(TestCasePlus): with patch.object(sys, "argv", testargs): run_text_classification.main() # Reset the mixed precision policy so we don't break other tests - tf.keras.mixed_precision.set_global_policy("float32") + keras.mixed_precision.set_global_policy("float32") result = get_results(tmp_dir) self.assertGreaterEqual(result["eval_accuracy"], 0.75) diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index 0c0d989c4c..2090de1b8d 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -27,6 +27,7 @@ from typing import Optional import numpy as np from datasets import load_dataset +from packaging.version import parse from transformers import ( AutoConfig, @@ -46,11 +47,24 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" # Reduce the amount of console output import tensorflow as tf # noqa: E402 +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + logger = logging.getLogger(__name__) # region Helper classes -class SavePretrainedCallback(tf.keras.callbacks.Callback): +class SavePretrainedCallback(keras.callbacks.Callback): # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback # that saves the model with this method after each epoch. diff --git a/src/transformers/activations_tf.py b/src/transformers/activations_tf.py index 4fcb1493e4..d12b73ea45 100644 --- a/src/transformers/activations_tf.py +++ b/src/transformers/activations_tf.py @@ -15,7 +15,20 @@ import math import tensorflow as tf -from packaging import version +from packaging.version import parse + + +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) def _gelu(x): @@ -99,12 +112,12 @@ def glu(x, axis=-1): return a * tf.math.sigmoid(b) -if version.parse(tf.version.VERSION) >= version.parse("2.4"): +if parse(tf.version.VERSION) >= parse("2.4"): def approximate_gelu_wrap(x): - return tf.keras.activations.gelu(x, approximate=True) + return keras.activations.gelu(x, approximate=True) - gelu = tf.keras.activations.gelu + gelu = keras.activations.gelu gelu_new = approximate_gelu_wrap else: gelu = _gelu @@ -119,11 +132,11 @@ ACT2FN = { "glu": glu, "mish": mish, "quick_gelu": quick_gelu, - "relu": tf.keras.activations.relu, - "sigmoid": tf.keras.activations.sigmoid, - "silu": tf.keras.activations.swish, - "swish": tf.keras.activations.swish, - "tanh": tf.keras.activations.tanh, + "relu": keras.activations.relu, + "sigmoid": keras.activations.sigmoid, + "silu": keras.activations.swish, + "swish": keras.activations.swish, + "tanh": keras.activations.tanh, } diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py index 3bb4e859b1..b6e832729a 100644 --- a/src/transformers/keras_callbacks.py +++ b/src/transformers/keras_callbacks.py @@ -8,16 +8,16 @@ import numpy as np import tensorflow as tf from huggingface_hub import Repository, create_repo from packaging.version import parse -from tensorflow.keras.callbacks import Callback from . import IntervalStrategy, PreTrainedTokenizerBase from .modelcard import TrainingSummary +from .modeling_tf_utils import keras logger = logging.getLogger(__name__) -class KerasMetricCallback(Callback): +class KerasMetricCallback(keras.callbacks.Callback): """ Callback to compute metrics at the end of every epoch. Unlike normal Keras metrics, these do not need to be compilable by TF. It is particularly useful for common NLP metrics like BLEU and ROUGE that require string @@ -265,7 +265,7 @@ class KerasMetricCallback(Callback): logs.update(metric_output) -class PushToHubCallback(Callback): +class PushToHubCallback(keras.callbacks.Callback): """ Callback that will save and push the model to the Hub regularly. By default, it pushes once per epoch, but this can be changed with the `save_strategy` argument. Pushed models can be accessed like any other model on the hub, such diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index f1b2f70bc2..9e8f2becae 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -704,7 +704,7 @@ class TrainingSummary: def parse_keras_history(logs): """ - Parse the `logs` of either a `tf.keras.History` object returned by `model.fit()` or an accumulated logs `dict` + Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict` passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`. """ if hasattr(logs, "history"): @@ -800,14 +800,14 @@ def parse_log_history(log_history): def extract_hyperparameters_from_keras(model): - import tensorflow as tf + from .modeling_tf_utils import keras hyperparameters = {} if hasattr(model, "optimizer") and model.optimizer is not None: hyperparameters["optimizer"] = model.optimizer.get_config() else: hyperparameters["optimizer"] = None - hyperparameters["training_precision"] = tf.keras.mixed_precision.global_policy().name + hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name return hyperparameters diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index a96481e062..99fa106674 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -260,7 +260,6 @@ def load_pytorch_state_dict_in_tf2_model( """Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading safetensors archive created with the safe_open() function.""" import tensorflow as tf - from keras import backend as K if tf_inputs is None: tf_inputs = tf_model.dummy_inputs @@ -360,7 +359,7 @@ def load_pytorch_state_dict_in_tf2_model( tf_loaded_numel += tensor_size(array) - K.set_value(symbolic_weight, array) + symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype)) del array # Immediately free memory to keep peak usage as low as possible all_pytorch_weights.discard(name) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 7ff9d0731d..d1d65aa88e 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -33,7 +33,6 @@ import h5py import numpy as np import tensorflow as tf from huggingface_hub import Repository, list_repo_files -from keras import backend as K from packaging.version import parse from . import DataCollatorWithPadding, DefaultDataCollator @@ -79,6 +78,20 @@ if is_safetensors_available(): if TYPE_CHECKING: from . import PreTrainedTokenizerBase +try: + import tf_keras as keras + from tf_keras import backend as K +except (ModuleNotFoundError, ImportError): + import keras + from keras import backend as K + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + logger = logging.get_logger(__name__) tf_logger = tf.get_logger() @@ -103,7 +116,7 @@ def dummy_loss(y_true, y_pred): class TFModelUtilsMixin: """ - A few utilities for `tf.keras.Model`, to be used as a mixin. + A few utilities for `keras.Model`, to be used as a mixin. """ def num_parameters(self, only_trainable: bool = False) -> int: @@ -134,10 +147,10 @@ def keras_serializable(cls): 2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and convert it to a config object for the actual layer initializer. 3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not - need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`. + need to be supplied in `custom_objects` in the call to `keras.models.load_model`. Args: - cls (a `tf.keras.layers.Layers subclass`): + cls (a `keras.layers.Layers subclass`): Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its initializer. @@ -171,7 +184,7 @@ def keras_serializable(cls): cls.__init__ = wrapped_init if not hasattr(cls, "get_config"): - raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses") + raise TypeError("Only use @keras_serializable on keras.layers.Layer subclasses") if hasattr(cls.get_config, "_is_default"): def get_config(self): @@ -183,8 +196,8 @@ def keras_serializable(cls): cls.get_config = get_config cls._keras_serializable = True - if hasattr(tf.keras.utils, "register_keras_serializable"): - cls = tf.keras.utils.register_keras_serializable()(cls) + if hasattr(keras.utils, "register_keras_serializable"): + cls = keras.utils.register_keras_serializable()(cls) return cls @@ -200,9 +213,7 @@ class TFCausalLanguageModelingLoss: """ def hf_compute_loss(self, labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) if self.config.tf_legacy_loss: # make sure only labels that are not equal to -100 affect the loss active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100) @@ -225,9 +236,7 @@ class TFQuestionAnsweringLoss: """ def hf_compute_loss(self, labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) start_loss = loss_fn(labels["start_position"], logits[0]) end_loss = loss_fn(labels["end_position"], logits[1]) @@ -246,9 +255,7 @@ class TFTokenClassificationLoss: """ def hf_compute_loss(self, labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) if tf.executing_eagerly(): # Data-dependent conditionals are forbidden in XLA if tf.math.reduce_any(labels == -1): tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.") @@ -285,13 +292,13 @@ class TFSequenceClassificationLoss: def hf_compute_loss(self, labels, logits): if logits.shape.rank == 1 or logits.shape[1] == 1: - loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE) + loss_fn = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.NONE) if labels.shape.rank == 1: # MeanSquaredError returns a scalar loss if the labels are 1D, so avoid that labels = tf.expand_dims(labels, axis=-1) else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE + loss_fn = keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=keras.losses.Reduction.NONE ) return loss_fn(labels, logits) @@ -301,9 +308,7 @@ class TFMultipleChoiceLoss: """Loss function suitable for multiple choice tasks.""" def hf_compute_loss(self, labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) return loss_fn(labels, logits) @@ -331,9 +336,7 @@ class TFNextSentencePredictionLoss: """ def hf_compute_loss(self, labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) if self.config.tf_legacy_loss: # make sure only labels that are not equal to -100 # are taken into account as loss @@ -435,7 +438,7 @@ def unpack_inputs(func): def input_processing(func, config, **kwargs): """ Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input - has to be named accordingly to the parameters name, i.e. `input_ids = tf.keras.Input(shape=(128,), dtype='int32', + has to be named accordingly to the parameters name, i.e. `input_ids = keras.Input(shape=(128,), dtype='int32', name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training. Args: @@ -710,7 +713,7 @@ def load_tf_sharded_weights(model, shard_files, ignore_mismatched_sizes=False, s loaded in the model. Args: - model (`tf.keras.models.Model`): The model in which to load the checkpoint. + model (`keras.models.Model`): The model in which to load the checkpoint. shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names. ignore_mismatched_sizes`bool`, *optional`, defaults to `True`): Whether or not to ignore the mismatch between the sizes @@ -773,13 +776,13 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch Loads a shard from a sharded checkpoint file. Handles the missing keys and unexpected keys. Args: - model (`tf.keras.models.Model`): Model in which the weights are loaded + model (`keras.models.Model`): Model in which the weights are loaded model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model. resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys Returns: - `tf.keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the + `keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the shard file), one for the mismatched layers, and another one for the unexpected layers. """ saved_weight_names_set = set() @@ -862,7 +865,7 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, shapes. Args: - model (`tf.keras.models.Model`): + model (`keras.models.Model`): The model to load the weights into. resolved_archive_file (`str`): The location of the H5 file. @@ -1055,7 +1058,7 @@ def init_copy_embeddings(old_embeddings, new_num_tokens): return mask, current_weights -class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin): +class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin): r""" Base class for all TF models. @@ -1295,7 +1298,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu return False return True - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: """ Returns the model's input embeddings layer. @@ -1505,7 +1508,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu self._using_dummy_loss = True else: self._using_dummy_loss = False - parent_args = list(inspect.signature(tf.keras.Model.compile).parameters.keys()) + parent_args = list(inspect.signature(keras.Model.compile).parameters.keys()) # This argument got renamed, we need to support both versions if "steps_per_execution" in parent_args: super().compile( @@ -1531,7 +1534,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ) def compute_loss(self, *args, **kwargs): - if hasattr(tf.keras.Model, "compute_loss"): + if hasattr(keras.Model, "compute_loss"): # This will be true in TF 2.8 or greater return super().compute_loss(*args, **kwargs) else: @@ -1575,7 +1578,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"): # Newer TF train steps leave this out data = expand_1d(data) - x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data) + x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) # If the inputs are mutable dictionaries, make a shallow copy of them because we will modify # them during input/label pre-processing. This avoids surprising the user by wrecking their data. # In addition, modifying mutable Python inputs makes XLA compilation impossible. @@ -1682,7 +1685,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"): # Newer versions leave this out data = expand_1d(data) - x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data) + x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) # If the inputs are mutable dictionaries, make a shallow copy of them because we will modify # them during input/label pre-processing. This avoids surprising the user by wrecking their data. # In addition, modifying mutable Python inputs makes XLA compilation impossible. @@ -1851,7 +1854,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu self.build_in_name_scope() main_layer.set_input_embeddings(value) - def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]: + def get_output_embeddings(self) -> Union[None, keras.layers.Layer]: """ Returns the model's output embeddings @@ -1888,13 +1891,13 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu self.build_in_name_scope() lm_head.set_output_embeddings(value) - def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]: + def get_output_layer_with_bias(self) -> Union[None, keras.layers.Layer]: """ Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the embeddings Return: - `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model. + `keras.layers.Layer`: The layer that handles the bias, None if not an LM model. """ warnings.warn( "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning @@ -1944,18 +1947,18 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu self.build_in_name_scope() lm_head.set_bias(value) - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: """ The LM Head layer. This method must be overwritten by all the models that have a lm head. Return: - `tf.keras.layers.Layer`: The LM head layer if the model has one, None if not. + `keras.layers.Layer`: The LM head layer if the model has one, None if not. """ return None def resize_token_embeddings( self, new_num_tokens: Optional[int] = None - ) -> Union[tf.keras.layers.Embedding, tf.Variable]: + ) -> Union[keras.layers.Embedding, tf.Variable]: """ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. @@ -1968,12 +1971,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu returns a pointer to the input tokens without doing anything. Return: - `tf.Variable` or `tf.keras.layers.Embedding`: Pointer to the input tokens of the model. + `tf.Variable` or `keras.layers.Embedding`: Pointer to the input tokens of the model. """ # TODO (joao): flagged for replacement (by `_v2_resized_token_embeddings`) due to embeddings refactor # Run the new code path if the model has a keras embeddings layer - if isinstance(self.get_input_embeddings(), tf.keras.layers.Embedding): + if isinstance(self.get_input_embeddings(), keras.layers.Embedding): return self._v2_resized_token_embeddings(new_num_tokens) if new_num_tokens is None or new_num_tokens == self.config.vocab_size: @@ -1986,7 +1989,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu return model_embeds - def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> tf.keras.layers.Embedding: + def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> keras.layers.Embedding: """ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. @@ -1997,7 +2000,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu returns a pointer to the input tokens without doing anything. Return: - `tf.keras.layers.Embedding`: Pointer to the input tokens of the model. + `keras.layers.Embedding`: Pointer to the input tokens of the model. """ if new_num_tokens is None or new_num_tokens == self.config.vocab_size: return self.get_input_embeddings() @@ -2245,20 +2248,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu return new_embeddings def _v2_get_resized_embeddings( - self, old_embeddings: tf.keras.layers.Embedding, new_num_tokens: int - ) -> tf.keras.layers.Embedding: + self, old_embeddings: keras.layers.Embedding, new_num_tokens: int + ) -> keras.layers.Embedding: """ Build a resized Embedding layer from a provided Embedding layer. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. Args: - old_embeddings (`tf.keras.layers.Embedding`): + old_embeddings (`keras.layers.Embedding`): Old embeddings to be resized. new_num_tokens (`int`, *optional*): New number of tokens in the embedding matrix. Return: - `tf.keras.layers.Embedding`: Resized Embedding layer. + `keras.layers.Embedding`: Resized Embedding layer. """ # Get the initialization range for the embeddings @@ -2273,10 +2276,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu init_range = getattr(self.config, var_name) # Get a new (initialized) embeddings layer - new_embeddings = tf.keras.layers.Embedding( + new_embeddings = keras.layers.Embedding( input_dim=new_num_tokens, output_dim=old_embeddings.output_dim, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=init_range), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=init_range), name=old_embeddings.embeddings.name[:-13], # exact same scoped name except "/embeddings:0" ) new_embeddings(tf.constant([[0]])) @@ -3184,7 +3187,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu cls._auto_class = auto_class -class TFConv1D(tf.keras.layers.Layer): +class TFConv1D(keras.layers.Layer): """ 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). @@ -3198,7 +3201,7 @@ class TFConv1D(tf.keras.layers.Layer): initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights. kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`. + Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`. """ def __init__(self, nf, nx, initializer_range=0.02, **kwargs): @@ -3227,7 +3230,7 @@ class TFConv1D(tf.keras.layers.Layer): return x -class TFSharedEmbeddings(tf.keras.layers.Layer): +class TFSharedEmbeddings(keras.layers.Layer): r""" Construct shared token embeddings. @@ -3243,7 +3246,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): The standard deviation to use when initializing the weights. If no value is provided, it will default to \\(1/\sqrt{hidden\_size}\\). kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`. + Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`. """ # TODO (joao): flagged for delection due to embeddings refactor @@ -3254,7 +3257,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range warnings.warn( - "`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `tf.keras.layers.Embedding` instead.", + "`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `keras.layers.Embedding` instead.", DeprecationWarning, ) @@ -3331,7 +3334,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): return tf.reshape(logits, first_dims + [self.vocab_size]) -class TFSequenceSummary(tf.keras.layers.Layer): +class TFSequenceSummary(keras.layers.Layer): """ Compute a single vector summary of a sequence hidden states. @@ -3358,7 +3361,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights. kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`. + Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`. """ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs): @@ -3377,7 +3380,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): num_classes = config.num_labels else: num_classes = config.hidden_size - self.summary = tf.keras.layers.Dense( + self.summary = keras.layers.Dense( num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" ) @@ -3389,11 +3392,11 @@ class TFSequenceSummary(tf.keras.layers.Layer): self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 if self.has_first_dropout: - self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) + self.first_dropout = keras.layers.Dropout(config.summary_first_dropout) self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 if self.has_last_dropout: - self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) + self.last_dropout = keras.layers.Dropout(config.summary_last_dropout) self.hidden_size = config.hidden_size def call(self, inputs, cls_index=None, training=False): @@ -3456,14 +3459,14 @@ class TFSequenceSummary(tf.keras.layers.Layer): self.summary.build(self.hidden_size) -def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal: +def get_initializer(initializer_range: float = 0.02) -> keras.initializers.TruncatedNormal: """ - Creates a `tf.keras.initializers.TruncatedNormal` with the given range. + Creates a `keras.initializers.TruncatedNormal` with the given range. Args: initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range. Returns: - `tf.keras.initializers.TruncatedNormal`: The truncated normal initializer. + `keras.initializers.TruncatedNormal`: The truncated normal initializer. """ - return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) + return keras.initializers.TruncatedNormal(stddev=initializer_range) diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 9ce6456f8a..acdc8c886c 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -44,6 +44,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -84,9 +85,7 @@ class TFAlbertPreTrainingLoss: """ def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) if self.config.tf_legacy_loss: # make sure only labels that are not equal to -100 # are taken into account as loss @@ -133,7 +132,7 @@ class TFAlbertPreTrainingLoss: return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,)) -class TFAlbertEmbeddings(tf.keras.layers.Layer): +class TFAlbertEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: AlbertConfig, **kwargs): @@ -143,8 +142,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): self.embedding_size = config.embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -217,7 +216,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFAlbertAttention(tf.keras.layers.Layer): +class TFAlbertAttention(keras.layers.Layer): """Contains the complete attention sublayer, including both dropouts and layer norm.""" def __init__(self, config: AlbertConfig, **kwargs): @@ -235,22 +234,22 @@ class TFAlbertAttention(tf.keras.layers.Layer): self.sqrt_att_head_size = math.sqrt(self.attention_head_size) self.output_attentions = config.output_attentions - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993 - self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: @@ -334,12 +333,12 @@ class TFAlbertAttention(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFAlbertLayer(tf.keras.layers.Layer): +class TFAlbertLayer(keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): super().__init__(**kwargs) self.attention = TFAlbertAttention(config, name="attention") - self.ffn = tf.keras.layers.Dense( + self.ffn = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" ) @@ -348,13 +347,13 @@ class TFAlbertLayer(tf.keras.layers.Layer): else: self.activation = config.hidden_act - self.ffn_output = tf.keras.layers.Dense( + self.ffn_output = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output" ) - self.full_layer_layer_norm = tf.keras.layers.LayerNormalization( + self.full_layer_layer_norm = keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="full_layer_layer_norm" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call( @@ -401,7 +400,7 @@ class TFAlbertLayer(tf.keras.layers.Layer): self.full_layer_layer_norm.build([None, None, self.config.hidden_size]) -class TFAlbertLayerGroup(tf.keras.layers.Layer): +class TFAlbertLayerGroup(keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): super().__init__(**kwargs) @@ -453,7 +452,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): layer.build(None) -class TFAlbertTransformer(tf.keras.layers.Layer): +class TFAlbertTransformer(keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): super().__init__(**kwargs) @@ -461,7 +460,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer): self.num_hidden_groups = config.num_hidden_groups # Number of layers in a hidden group self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups) - self.embedding_hidden_mapping_in = tf.keras.layers.Dense( + self.embedding_hidden_mapping_in = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="embedding_hidden_mapping_in", @@ -534,13 +533,13 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel): base_model_prefix = "albert" -class TFAlbertMLMHead(tf.keras.layers.Layer): - def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFAlbertMLMHead(keras.layers.Layer): + def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config self.embedding_size = config.embedding_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -548,7 +547,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): else: self.activation = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. @@ -570,7 +569,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): with tf.name_scope(self.LayerNorm.name): self.LayerNorm.build([None, None, self.config.embedding_size]) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.decoder def set_output_embeddings(self, value: tf.Variable): @@ -599,7 +598,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): @keras_serializable -class TFAlbertMainLayer(tf.keras.layers.Layer): +class TFAlbertMainLayer(keras.layers.Layer): config_class = AlbertConfig def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs): @@ -610,7 +609,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.encoder = TFAlbertTransformer(config, name="encoder") self.pooler = ( - tf.keras.layers.Dense( + keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -620,7 +619,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): else None ) - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -776,7 +775,7 @@ ALBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -942,7 +941,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss): self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions") self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.predictions @unpack_inputs @@ -1032,12 +1031,12 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss): self.sop_classifier.build(None) -class TFAlbertSOPHead(tf.keras.layers.Layer): +class TFAlbertSOPHead(keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): super().__init__(**kwargs) - self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1070,7 +1069,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.predictions @unpack_inputs @@ -1184,8 +1183,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") - self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1283,8 +1282,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat if config.classifier_dropout_prob is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1372,7 +1371,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config @@ -1478,8 +1477,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 46b896053d..1e38908b4a 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -38,6 +38,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, TFSequenceClassificationLoss, + keras, keras_serializable, unpack_inputs, ) @@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TFBartLearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -143,7 +144,7 @@ class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding): return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype)) -class TFBartAttention(tf.keras.layers.Layer): +class TFBartAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -159,7 +160,7 @@ class TFBartAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -169,10 +170,10 @@ class TFBartAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -313,20 +314,20 @@ class TFBartAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFBartEncoderLayer(tf.keras.layers.Layer): +class TFBartEncoderLayer(keras.layers.Layer): def __init__(self, config: BartConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFBartAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -390,7 +391,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.embed_dim]) -class TFBartDecoderLayer(tf.keras.layers.Layer): +class TFBartDecoderLayer(keras.layers.Layer): def __init__(self, config: BartConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -401,11 +402,11 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFBartAttention( self.embed_dim, config.decoder_attention_heads, @@ -413,10 +414,10 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -526,21 +527,21 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.embed_dim]) -class TFBartClassificationHead(tf.keras.layers.Layer): +class TFBartClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs): super().__init__(name=name, **kwargs) - self.dense = tf.keras.layers.Dense(inner_dim, name="dense") - self.dropout = tf.keras.layers.Dropout(pooler_dropout) - self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj") + self.dense = keras.layers.Dense(inner_dim, name="dense") + self.dropout = keras.layers.Dropout(pooler_dropout) + self.out_proj = keras.layers.Dense(num_classes, name="out_proj") self.input_dim = inner_dim self.inner_dim = inner_dim def call(self, inputs): hidden_states = self.dropout(inputs) hidden_states = self.dense(hidden_states) - hidden_states = tf.keras.activations.tanh(hidden_states) + hidden_states = keras.activations.tanh(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.out_proj(hidden_states) return hidden_states @@ -583,7 +584,7 @@ BART_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -740,7 +741,7 @@ BART_INPUTS_DOCSTRING = r""" @keras_serializable -class TFBartEncoder(tf.keras.layers.Layer): +class TFBartEncoder(keras.layers.Layer): config_class = BartConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -750,10 +751,10 @@ class TFBartEncoder(tf.keras.layers.Layer): config: BartConfig """ - def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -766,7 +767,7 @@ class TFBartEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.embed_dim = config.d_model @unpack_inputs @@ -900,7 +901,7 @@ class TFBartEncoder(tf.keras.layers.Layer): @keras_serializable -class TFBartDecoder(tf.keras.layers.Layer): +class TFBartDecoder(keras.layers.Layer): config_class = BartConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`] @@ -910,7 +911,7 @@ class TFBartDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -923,9 +924,9 @@ class TFBartDecoder(tf.keras.layers.Layer): ) self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) @unpack_inputs def call( @@ -1130,16 +1131,16 @@ class TFBartDecoder(tf.keras.layers.Layer): @keras_serializable -class TFBartMainLayer(tf.keras.layers.Layer): +class TFBartMainLayer(keras.layers.Layer): config_class = BartConfig def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1358,9 +1359,9 @@ class TFBartModel(TFBartPretrainedModel): self.model.build(None) -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py index 5e3ef4df9f..418e1f8905 100644 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name if any(x in var_name for x in tensors_to_transpose): torch_tensor = torch_tensor.T tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) - tf.keras.backend.set_value(tf_var, torch_tensor) + tf_var.assign(tf.cast(torch_tensor, tf_var.dtype)) tf_weight = session.run(tf_var) print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 84e5d60d12..853ec6e6df 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -49,6 +49,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -121,9 +122,7 @@ class TFBertPreTrainingLoss: """ def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0]) @@ -143,7 +142,7 @@ class TFBertPreTrainingLoss: return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,)) -class TFBertEmbeddings(tf.keras.layers.Layer): +class TFBertEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: BertConfig, **kwargs): @@ -153,8 +152,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -226,7 +225,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFBertSelfAttention(tf.keras.layers.Layer): +class TFBertSelfAttention(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) @@ -241,16 +240,16 @@ class TFBertSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -358,15 +357,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer): self.value.build([None, None, self.config.hidden_size]) -class TFBertSelfOutput(tf.keras.layers.Layer): +class TFBertSelfOutput(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -388,7 +387,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFBertAttention(tf.keras.layers.Layer): +class TFBertAttention(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) @@ -439,11 +438,11 @@ class TFBertAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFBertIntermediate(tf.keras.layers.Layer): +class TFBertIntermediate(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -468,15 +467,15 @@ class TFBertIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFBertOutput(tf.keras.layers.Layer): +class TFBertOutput(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -498,7 +497,7 @@ class TFBertOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFBertLayer(tf.keras.layers.Layer): +class TFBertLayer(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) @@ -601,7 +600,7 @@ class TFBertLayer(tf.keras.layers.Layer): self.crossattention.build(None) -class TFBertEncoder(tf.keras.layers.Layer): +class TFBertEncoder(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -679,11 +678,11 @@ class TFBertEncoder(tf.keras.layers.Layer): layer.build(None) -class TFBertPooler(tf.keras.layers.Layer): +class TFBertPooler(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -708,11 +707,11 @@ class TFBertPooler(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFBertPredictionHeadTransform(tf.keras.layers.Layer): +class TFBertPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -723,7 +722,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -745,8 +744,8 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFBertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFBertLMPredictionHead(keras.layers.Layer): + def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -768,7 +767,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -793,8 +792,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): return hidden_states -class TFBertMLMHead(tf.keras.layers.Layer): - def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFBertMLMHead(keras.layers.Layer): + def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") @@ -813,11 +812,11 @@ class TFBertMLMHead(tf.keras.layers.Layer): self.predictions.build(None) -class TFBertNSPHead(tf.keras.layers.Layer): +class TFBertNSPHead(keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.seq_relationship = tf.keras.layers.Dense( + self.seq_relationship = keras.layers.Dense( units=2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship", @@ -839,7 +838,7 @@ class TFBertNSPHead(tf.keras.layers.Layer): @keras_serializable -class TFBertMainLayer(tf.keras.layers.Layer): +class TFBertMainLayer(keras.layers.Layer): config_class = BertConfig def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs): @@ -852,7 +851,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -1086,7 +1085,7 @@ BERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1281,7 +1280,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): self.nsp = TFBertNSPHead(config, name="nsp___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions def get_prefix_bias_name(self) -> str: @@ -1407,7 +1406,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions def get_prefix_bias_name(self) -> str: @@ -1500,7 +1499,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions def get_prefix_bias_name(self) -> str: @@ -1732,8 +1731,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=classifier_dropout) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1825,8 +1824,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1947,8 +1946,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=classifier_dropout) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -2045,7 +2044,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py index 53adb390fa..4c2b392521 100644 --- a/src/transformers/models/bert/tokenization_bert_tf.py +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -5,10 +5,11 @@ import tensorflow as tf from tensorflow_text import BertTokenizer as BertTokenizerLayer from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs +from ...modeling_tf_utils import keras from .tokenization_bert import BertTokenizer -class TFBertTokenizer(tf.keras.layers.Layer): +class TFBertTokenizer(keras.layers.Layer): """ This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 4e8da00fc0..ccb07d20ec 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -36,6 +36,7 @@ from ...modeling_tf_outputs import ( from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFPreTrainedModel, + keras, keras_serializable, unpack_inputs, ) @@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -138,7 +139,7 @@ class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot -class TFBlenderbotAttention(tf.keras.layers.Layer): +class TFBlenderbotAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -154,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -164,10 +165,10 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -309,20 +310,20 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot -class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): +class TFBlenderbotEncoderLayer(keras.layers.Layer): def __init__(self, config: BlenderbotConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFBlenderbotAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -387,7 +388,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot -class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): +class TFBlenderbotDecoderLayer(keras.layers.Layer): def __init__(self, config: BlenderbotConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -398,11 +399,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFBlenderbotAttention( self.embed_dim, config.decoder_attention_heads, @@ -410,10 +411,10 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -533,7 +534,7 @@ BLENDERBOT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -677,7 +678,7 @@ BLENDERBOT_INPUTS_DOCSTRING = r""" @keras_serializable -class TFBlenderbotEncoder(tf.keras.layers.Layer): +class TFBlenderbotEncoder(keras.layers.Layer): config_class = BlenderbotConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -687,10 +688,10 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): config: BlenderbotConfig """ - def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -703,7 +704,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") def get_embed_tokens(self): return self.embed_tokens @@ -849,7 +850,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): @keras_serializable -class TFBlenderbotDecoder(tf.keras.layers.Layer): +class TFBlenderbotDecoder(keras.layers.Layer): config_class = BlenderbotConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`] @@ -859,7 +860,7 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -872,9 +873,9 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ) self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -1090,17 +1091,17 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): @keras_serializable -class TFBlenderbotMainLayer(tf.keras.layers.Layer): +class TFBlenderbotMainLayer(keras.layers.Layer): config_class = BlenderbotConfig def __init__(self, config: BlenderbotConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1325,9 +1326,9 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel): # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 93a480b1ea..01206831ac 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -35,6 +35,7 @@ from ...modeling_tf_outputs import ( from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFPreTrainedModel, + keras, keras_serializable, unpack_inputs, ) @@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall -class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -138,7 +139,7 @@ class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall -class TFBlenderbotSmallAttention(tf.keras.layers.Layer): +class TFBlenderbotSmallAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -154,7 +155,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -164,10 +165,10 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -309,20 +310,20 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall -class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): +class TFBlenderbotSmallEncoderLayer(keras.layers.Layer): def __init__(self, config: BlenderbotSmallConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFBlenderbotSmallAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -387,7 +388,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall -class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): +class TFBlenderbotSmallDecoderLayer(keras.layers.Layer): def __init__(self, config: BlenderbotSmallConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -398,11 +399,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFBlenderbotSmallAttention( self.embed_dim, config.decoder_attention_heads, @@ -410,10 +411,10 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -533,7 +534,7 @@ BLENDERBOT_SMALL_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -681,7 +682,7 @@ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" @keras_serializable -class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): +class TFBlenderbotSmallEncoder(keras.layers.Layer): config_class = BlenderbotSmallConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -691,12 +692,10 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): config: BlenderbotSmallConfig """ - def __init__( - self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs - ): + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -709,7 +708,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.embed_dim = config.d_model def get_embed_tokens(self): @@ -855,7 +854,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): @keras_serializable -class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): +class TFBlenderbotSmallDecoder(keras.layers.Layer): config_class = BlenderbotSmallConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`] @@ -865,9 +864,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__( - self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs - ): + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -880,9 +877,9 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ) self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -1095,17 +1092,17 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): @keras_serializable -class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer): +class TFBlenderbotSmallMainLayer(keras.layers.Layer): config_class = BlenderbotSmallConfig def __init__(self, config: BlenderbotSmallConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1314,9 +1311,9 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel): # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index a7bf639a7b..5952aa145c 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -27,6 +27,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, get_initializer, get_tf_activation, + keras, keras_serializable, shape_list, unpack_inputs, @@ -63,7 +64,7 @@ TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: return tf.math.reduce_mean( - tf.keras.metrics.sparse_categorical_crossentropy( + keras.metrics.sparse_categorical_crossentropy( y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True ) ) @@ -234,7 +235,7 @@ class TFBlipOutput(ModelOutput): ) -class TFBlipVisionEmbeddings(tf.keras.layers.Layer): +class TFBlipVisionEmbeddings(keras.layers.Layer): def __init__(self, config: BlipVisionConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -242,7 +243,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = tf.keras.layers.Conv2D( + self.patch_embedding = keras.layers.Conv2D( filters=self.embed_dim, kernel_size=self.patch_size, strides=self.patch_size, @@ -291,7 +292,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip -class TFBlipTextEmbeddings(tf.keras.layers.Layer): +class TFBlipTextEmbeddings(keras.layers.Layer): def __init__(self, config: BlipTextConfig, **kwargs): super().__init__(**kwargs) @@ -349,7 +350,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFBlipAttention(tf.keras.layers.Layer): +class TFBlipAttention(keras.layers.Layer): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config, **kwargs): @@ -364,13 +365,13 @@ class TFBlipAttention(tf.keras.layers.Layer): f" {self.num_heads})." ) self.scale = self.head_dim**-0.5 - self.dropout = tf.keras.layers.Dropout(config.attention_dropout, name="dropout") + self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout") - self.qkv = tf.keras.layers.Dense( + self.qkv = keras.layers.Dense( 3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv" ) - self.projection = tf.keras.layers.Dense( + self.projection = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" ) @@ -433,7 +434,7 @@ class TFBlipAttention(tf.keras.layers.Layer): self.projection.build([None, None, self.embed_dim]) -class TFBlipMLP(tf.keras.layers.Layer): +class TFBlipMLP(keras.layers.Layer): def __init__(self, config: BlipConfig, **kwargs): super().__init__(**kwargs) @@ -442,10 +443,10 @@ class TFBlipMLP(tf.keras.layers.Layer): in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) fc_std = (2 * config.hidden_size) ** -0.5 - self.fc1 = tf.keras.layers.Dense( + self.fc1 = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" ) - self.fc2 = tf.keras.layers.Dense( + self.fc2 = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" ) self.config = config @@ -468,14 +469,14 @@ class TFBlipMLP(tf.keras.layers.Layer): self.fc2.build([None, None, self.config.intermediate_size]) -class TFBlipEncoderLayer(tf.keras.layers.Layer): +class TFBlipEncoderLayer(keras.layers.Layer): def __init__(self, config: BlipConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.hidden_size self.self_attn = TFBlipAttention(config, name="self_attn") - self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") self.mlp = TFBlipMLP(config, name="mlp") - self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") def call( self, @@ -551,7 +552,7 @@ BLIP_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -614,7 +615,7 @@ BLIP_INPUTS_DOCSTRING = r""" @keras_serializable -class TFBlipEncoder(tf.keras.layers.Layer): +class TFBlipEncoder(keras.layers.Layer): config_class = BlipConfig """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a @@ -714,7 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel): self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings") self.encoder = TFBlipEncoder(config, name="encoder") - self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") self.embed_dim = config.hidden_size def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: @@ -798,7 +799,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel): self.post_layernorm.build([None, None, self.embed_dim]) -class TFBlipMainLayer(tf.keras.layers.Layer): +class TFBlipMainLayer(keras.layers.Layer): config_class = BlipConfig def __init__(self, config: BlipConfig, *args, **kwargs): @@ -826,13 +827,13 @@ class TFBlipMainLayer(tf.keras.layers.Layer): self.text_model = TFBlipTextModel(text_config, name="text_model") self.vision_model = TFBlipVisionModel(vision_config, name="vision_model") - self.visual_projection = tf.keras.layers.Dense( + self.visual_projection = keras.layers.Dense( self.projection_dim, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), name="visual_projection", ) - self.text_projection = tf.keras.layers.Dense( + self.text_projection = keras.layers.Dense( self.projection_dim, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), @@ -845,7 +846,7 @@ class TFBlipMainLayer(tf.keras.layers.Layer): self.logit_scale = self.add_weight( name="logit_scale", shape=[], - initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), trainable=True, ) @@ -1116,7 +1117,7 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel): self.decoder_input_ids = config.text_config.bos_token_id self.decoder_pad_token_id = config.text_config.pad_token_id - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.vision_model.embeddings.patch_embedding @unpack_inputs @@ -1307,7 +1308,7 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel): self.decoder_pad_token_id = config.text_config.pad_token_id self.decoder_start_token_id = config.text_config.bos_token_id - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.vision_model.embeddings.patch_embedding # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right @@ -1557,21 +1558,21 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) # vision projection layer - self.vision_proj = tf.keras.layers.Dense( + self.vision_proj = keras.layers.Dense( config.image_text_hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="vision_proj", ) # text projection layer - self.text_proj = tf.keras.layers.Dense( + self.text_proj = keras.layers.Dense( config.image_text_hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="text_proj", ) # image text matching head - self.itm_head = tf.keras.layers.Dense( + self.itm_head = keras.layers.Dense( 2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head" ) @@ -1587,7 +1588,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): ) self.config = config - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.vision_model.embeddings.patch_embedding @unpack_inputs diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py index 1245536f63..19d8bc9b6e 100644 --- a/src/transformers/models/blip/modeling_tf_blip_text.py +++ b/src/transformers/models/blip/modeling_tf_blip_text.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, get_initializer, get_tf_activation, + keras, keras_serializable, shape_list, unpack_inputs, @@ -75,18 +76,18 @@ BLIP_TEXT_INPUTS_DOCSTRING = r""" # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 -class TFBlipTextEmbeddings(tf.keras.layers.Layer): +class TFBlipTextEmbeddings(keras.layers.Layer): """Construct the embeddings from word and position embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = tf.keras.layers.Embedding( + self.word_embeddings = keras.layers.Embedding( config.vocab_size, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), name="word_embeddings", ) - self.position_embeddings = tf.keras.layers.Embedding( + self.position_embeddings = keras.layers.Embedding( config.max_position_embeddings, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), @@ -95,8 +96,8 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") @@ -146,7 +147,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 -class TFBlipTextSelfAttention(tf.keras.layers.Layer): +class TFBlipTextSelfAttention(keras.layers.Layer): def __init__(self, config, is_cross_attention, **kwargs): super().__init__(**kwargs) self.config = config @@ -160,21 +161,21 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = tf.keras.layers.Embedding( + self.distance_embedding = keras.layers.Embedding( 2 * config.max_position_embeddings - 1, self.attention_head_size ) self.is_cross_attention = is_cross_attention @@ -291,15 +292,15 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer): self.value.build([None, None, self.config.hidden_size]) -class TFBlipTextSelfOutput(tf.keras.layers.Layer): +class TFBlipTextSelfOutput(keras.layers.Layer): def __init__(self, config: BlipTextConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor: @@ -322,7 +323,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer): # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 -class TFBlipTextAttention(tf.keras.layers.Layer): +class TFBlipTextAttention(keras.layers.Layer): def __init__(self, config, is_cross_attention=False, **kwargs): super().__init__(**kwargs) self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self") @@ -367,11 +368,11 @@ class TFBlipTextAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText -class TFBlipTextIntermediate(tf.keras.layers.Layer): +class TFBlipTextIntermediate(keras.layers.Layer): def __init__(self, config: BlipTextConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -396,15 +397,15 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFBlipTextOutput(tf.keras.layers.Layer): +class TFBlipTextOutput(keras.layers.Layer): def __init__(self, config: BlipTextConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -426,7 +427,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFBlipTextLayer(tf.keras.layers.Layer): +class TFBlipTextLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.config = config @@ -504,7 +505,7 @@ class TFBlipTextLayer(tf.keras.layers.Layer): # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 @keras_serializable -class TFBlipTextEncoder(tf.keras.layers.Layer): +class TFBlipTextEncoder(keras.layers.Layer): config_class = BlipTextConfig def __init__(self, config, name=None, **kwargs): @@ -593,11 +594,11 @@ class TFBlipTextEncoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText -class TFBlipTextPooler(tf.keras.layers.Layer): +class TFBlipTextPooler(keras.layers.Layer): def __init__(self, config: BlipTextConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -623,11 +624,11 @@ class TFBlipTextPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText -class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): +class TFBlipTextPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: BlipTextConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -638,7 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -660,14 +661,14 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): +class TFBlipTextLMPredictionHead(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.transform = TFBlipTextPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. - self.decoder = tf.keras.layers.Dense( + self.decoder = keras.layers.Dense( config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="decoder", @@ -694,7 +695,7 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): return hidden_states -class TFBlipTextOnlyMLMHead(tf.keras.layers.Layer): +class TFBlipTextOnlyMLMHead(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.predictions = TFBlipTextLMPredictionHead(config, name="predictions") @@ -1062,7 +1063,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel): # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway) one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32) - loss_fct = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none") + loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none") masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32) lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) lm_loss *= masked_positions diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py index 850d8bccef..c4bb10891d 100644 --- a/src/transformers/models/camembert/modeling_tf_camembert.py +++ b/src/transformers/models/camembert/modeling_tf_camembert.py @@ -46,6 +46,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -75,7 +76,7 @@ CAMEMBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -168,7 +169,7 @@ CAMEMBERT_INPUTS_DOCSTRING = r""" # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings -class TFCamembertEmbeddings(tf.keras.layers.Layer): +class TFCamembertEmbeddings(keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -181,8 +182,8 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -274,11 +275,11 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert -class TFCamembertPooler(tf.keras.layers.Layer): +class TFCamembertPooler(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -304,7 +305,7 @@ class TFCamembertPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert -class TFCamembertSelfAttention(tf.keras.layers.Layer): +class TFCamembertSelfAttention(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) @@ -319,16 +320,16 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -437,15 +438,15 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert -class TFCamembertSelfOutput(tf.keras.layers.Layer): +class TFCamembertSelfOutput(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -468,7 +469,7 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert -class TFCamembertAttention(tf.keras.layers.Layer): +class TFCamembertAttention(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) @@ -520,11 +521,11 @@ class TFCamembertAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert -class TFCamembertIntermediate(tf.keras.layers.Layer): +class TFCamembertIntermediate(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -550,15 +551,15 @@ class TFCamembertIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert -class TFCamembertOutput(tf.keras.layers.Layer): +class TFCamembertOutput(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -581,7 +582,7 @@ class TFCamembertOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert -class TFCamembertLayer(tf.keras.layers.Layer): +class TFCamembertLayer(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) @@ -685,7 +686,7 @@ class TFCamembertLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert -class TFCamembertEncoder(tf.keras.layers.Layer): +class TFCamembertEncoder(keras.layers.Layer): def __init__(self, config: CamembertConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -765,7 +766,7 @@ class TFCamembertEncoder(tf.keras.layers.Layer): @keras_serializable # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert -class TFCamembertMainLayer(tf.keras.layers.Layer): +class TFCamembertMainLayer(keras.layers.Layer): config_class = CamembertConfig def __init__(self, config, add_pooling_layer=True, **kwargs): @@ -785,7 +786,7 @@ class TFCamembertMainLayer(tf.keras.layers.Layer): self.embeddings = TFCamembertEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings @@ -1068,7 +1069,7 @@ class TFCamembertModel(TFCamembertPreTrainedModel): # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert -class TFCamembertLMHead(tf.keras.layers.Layer): +class TFCamembertLMHead(keras.layers.Layer): """Camembert Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): @@ -1076,10 +1077,10 @@ class TFCamembertLMHead(tf.keras.layers.Layer): self.config = config self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = get_tf_activation("gelu") # The output weights are the same as the input embeddings, but there is @@ -1222,12 +1223,12 @@ class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelin # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead -class TFCamembertClassificationHead(tf.keras.layers.Layer): +class TFCamembertClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -1236,8 +1237,8 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer): classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -1371,8 +1372,8 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClass classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1463,8 +1464,8 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceL super().__init__(config, *inputs, **kwargs) self.roberta = TFCamembertMainLayer(config, name="roberta") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1568,7 +1569,7 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsw self.num_labels = config.num_labels self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index d510f59276..d8dd7f0bd8 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -32,6 +32,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -77,7 +78,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: return tf.math.reduce_mean( - tf.keras.metrics.sparse_categorical_crossentropy( + keras.metrics.sparse_categorical_crossentropy( y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True ) ) @@ -127,7 +128,7 @@ class TFCLIPOutput(ModelOutput): ) -class TFCLIPVisionEmbeddings(tf.keras.layers.Layer): +class TFCLIPVisionEmbeddings(keras.layers.Layer): def __init__(self, config: CLIPVisionConfig, **kwargs): super().__init__(**kwargs) @@ -140,7 +141,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer): self.config = config - self.patch_embedding = tf.keras.layers.Conv2D( + self.patch_embedding = keras.layers.Conv2D( filters=self.embed_dim, kernel_size=self.patch_size, strides=self.patch_size, @@ -201,7 +202,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer): return embeddings -class TFCLIPTextEmbeddings(tf.keras.layers.Layer): +class TFCLIPTextEmbeddings(keras.layers.Layer): def __init__(self, config: CLIPTextConfig, **kwargs): super().__init__(**kwargs) @@ -259,7 +260,7 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFCLIPAttention(tf.keras.layers.Layer): +class TFCLIPAttention(keras.layers.Layer): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: CLIPConfig, **kwargs): @@ -280,19 +281,19 @@ class TFCLIPAttention(tf.keras.layers.Layer): self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.q_proj = tf.keras.layers.Dense( + self.q_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj" ) - self.k_proj = tf.keras.layers.Dense( + self.k_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj" ) - self.v_proj = tf.keras.layers.Dense( + self.v_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout) + self.dropout = keras.layers.Dropout(rate=config.attention_dropout) - self.out_proj = tf.keras.layers.Dense( + self.out_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj" ) @@ -375,7 +376,7 @@ class TFCLIPAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFCLIPMLP(tf.keras.layers.Layer): +class TFCLIPMLP(keras.layers.Layer): def __init__(self, config: CLIPConfig, **kwargs): super().__init__(**kwargs) @@ -385,10 +386,10 @@ class TFCLIPMLP(tf.keras.layers.Layer): in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * config.hidden_size) ** -0.5 * factor - self.fc1 = tf.keras.layers.Dense( + self.fc1 = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" ) - self.fc2 = tf.keras.layers.Dense( + self.fc2 = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" ) self.config = config @@ -411,15 +412,15 @@ class TFCLIPMLP(tf.keras.layers.Layer): self.fc2.build([None, None, self.config.intermediate_size]) -class TFCLIPEncoderLayer(tf.keras.layers.Layer): +class TFCLIPEncoderLayer(keras.layers.Layer): def __init__(self, config: CLIPConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.hidden_size self.self_attn = TFCLIPAttention(config, name="self_attn") - self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") self.mlp = TFCLIPMLP(config, name="mlp") - self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") def call( self, @@ -480,7 +481,7 @@ class TFCLIPEncoderLayer(tf.keras.layers.Layer): self.layer_norm2.build([None, None, self.embed_dim]) -class TFCLIPEncoder(tf.keras.layers.Layer): +class TFCLIPEncoder(keras.layers.Layer): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a [`TFCLIPEncoderLayer`]. @@ -544,15 +545,13 @@ class TFCLIPEncoder(tf.keras.layers.Layer): layer.build(None) -class TFCLIPTextTransformer(tf.keras.layers.Layer): +class TFCLIPTextTransformer(keras.layers.Layer): def __init__(self, config: CLIPTextConfig, **kwargs): super().__init__(**kwargs) self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings") self.encoder = TFCLIPEncoder(config, name="encoder") - self.final_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="final_layer_norm" - ) + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") # For `pooled_output` computation self.eos_token_id = config.eos_token_id @@ -663,7 +662,7 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer): @keras_serializable -class TFCLIPTextMainLayer(tf.keras.layers.Layer): +class TFCLIPTextMainLayer(keras.layers.Layer): config_class = CLIPTextConfig def __init__(self, config: CLIPTextConfig, **kwargs): @@ -671,7 +670,7 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer): self.config = config self.text_model = TFCLIPTextTransformer(config, name="text_model") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.text_model.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -718,14 +717,14 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer): self.text_model.build(None) -class TFCLIPVisionTransformer(tf.keras.layers.Layer): +class TFCLIPVisionTransformer(keras.layers.Layer): def __init__(self, config: CLIPVisionConfig, **kwargs): super().__init__(**kwargs) self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings") - self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm") + self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm") self.encoder = TFCLIPEncoder(config, name="encoder") - self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") self.embed_dim = config.hidden_size def call( @@ -782,7 +781,7 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer): @keras_serializable -class TFCLIPVisionMainLayer(tf.keras.layers.Layer): +class TFCLIPVisionMainLayer(keras.layers.Layer): config_class = CLIPVisionConfig def __init__(self, config: CLIPVisionConfig, **kwargs): @@ -790,7 +789,7 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer): self.config = config self.vision_model = TFCLIPVisionTransformer(config, name="vision_model") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.vision_model.embeddings @unpack_inputs @@ -825,7 +824,7 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer): @keras_serializable -class TFCLIPMainLayer(tf.keras.layers.Layer): +class TFCLIPMainLayer(keras.layers.Layer): config_class = CLIPConfig def __init__(self, config: CLIPConfig, **kwargs): @@ -853,14 +852,14 @@ class TFCLIPMainLayer(tf.keras.layers.Layer): self.text_model = TFCLIPTextTransformer(text_config, name="text_model") self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model") - self.visual_projection = tf.keras.layers.Dense( + self.visual_projection = keras.layers.Dense( units=self.projection_dim, kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor), use_bias=False, name="visual_projection", ) - self.text_projection = tf.keras.layers.Dense( + self.text_projection = keras.layers.Dense( units=self.projection_dim, kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor), use_bias=False, @@ -872,7 +871,7 @@ class TFCLIPMainLayer(tf.keras.layers.Layer): def build(self, input_shape: tf.TensorShape = None): self.logit_scale = self.add_weight( shape=(1,), - initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), trainable=True, name="logit_scale", ) @@ -1046,7 +1045,7 @@ CLIP_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index d329c1af59..e6855c68e2 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -41,6 +41,7 @@ from ...modeling_tf_utils import ( TFSequenceSummary, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -68,7 +69,7 @@ TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->ConvBert -class TFConvBertEmbeddings(tf.keras.layers.Layer): +class TFConvBertEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: ConvBertConfig, **kwargs): @@ -78,8 +79,8 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): self.embedding_size = config.embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -152,7 +153,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFConvBertSelfAttention(tf.keras.layers.Layer): +class TFConvBertSelfAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -178,17 +179,17 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): self.attention_head_size = config.hidden_size // config.num_attention_heads self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.key_conv_attn_layer = tf.keras.layers.SeparableConv1D( + self.key_conv_attn_layer = keras.layers.SeparableConv1D( self.all_head_size, self.conv_kernel_size, padding="same", @@ -198,21 +199,21 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): name="key_conv_attn_layer", ) - self.conv_kernel_layer = tf.keras.layers.Dense( + self.conv_kernel_layer = keras.layers.Dense( self.num_attention_heads * self.conv_kernel_size, activation=None, name="conv_kernel_layer", kernel_initializer=get_initializer(config.initializer_range), ) - self.conv_out_layer = tf.keras.layers.Dense( + self.conv_out_layer = keras.layers.Dense( self.all_head_size, activation=None, name="conv_out_layer", kernel_initializer=get_initializer(config.initializer_range), ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.config = config def transpose_for_scores(self, x, batch_size): @@ -327,15 +328,15 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): self.conv_out_layer.build([None, None, self.config.hidden_size]) -class TFConvBertSelfOutput(tf.keras.layers.Layer): +class TFConvBertSelfOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, input_tensor, training=False): @@ -357,7 +358,7 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFConvBertAttention(tf.keras.layers.Layer): +class TFConvBertAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -388,7 +389,7 @@ class TFConvBertAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class GroupedLinearLayer(tf.keras.layers.Layer): +class GroupedLinearLayer(keras.layers.Layer): def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs): super().__init__(**kwargs) self.input_size = input_size @@ -421,11 +422,11 @@ class GroupedLinearLayer(tf.keras.layers.Layer): return x -class TFConvBertIntermediate(tf.keras.layers.Layer): +class TFConvBertIntermediate(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.num_groups == 1: - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) else: @@ -458,12 +459,12 @@ class TFConvBertIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFConvBertOutput(tf.keras.layers.Layer): +class TFConvBertOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.num_groups == 1: - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) else: @@ -474,8 +475,8 @@ class TFConvBertOutput(tf.keras.layers.Layer): kernel_initializer=get_initializer(config.initializer_range), name="dense", ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, input_tensor, training=False): @@ -497,7 +498,7 @@ class TFConvBertOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.intermediate_size]) -class TFConvBertLayer(tf.keras.layers.Layer): +class TFConvBertLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -531,7 +532,7 @@ class TFConvBertLayer(tf.keras.layers.Layer): self.bert_output.build(None) -class TFConvBertEncoder(tf.keras.layers.Layer): +class TFConvBertEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -583,11 +584,11 @@ class TFConvBertEncoder(tf.keras.layers.Layer): layer.build(None) -class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer): +class TFConvBertPredictionHeadTransform(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -596,7 +597,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states): @@ -619,7 +620,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer): @keras_serializable -class TFConvBertMainLayer(tf.keras.layers.Layer): +class TFConvBertMainLayer(keras.layers.Layer): config_class = ConvBertConfig def __init__(self, config, **kwargs): @@ -628,7 +629,7 @@ class TFConvBertMainLayer(tf.keras.layers.Layer): self.embeddings = TFConvBertEmbeddings(config, name="embeddings") if config.embedding_size != config.hidden_size: - self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") + self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project") self.encoder = TFConvBertEncoder(config, name="encoder") self.config = config @@ -755,7 +756,7 @@ CONVBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -901,7 +902,7 @@ class TFConvBertModel(TFConvBertPreTrainedModel): self.convbert.build(None) -class TFConvBertMaskedLMHead(tf.keras.layers.Layer): +class TFConvBertMaskedLMHead(keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) @@ -938,12 +939,12 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer): return hidden_states -class TFConvBertGeneratorPredictions(tf.keras.layers.Layer): +class TFConvBertGeneratorPredictions(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.embedding_size, name="dense") self.config = config def call(self, generator_hidden_states, training=False): @@ -1058,20 +1059,20 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL self.generator_lm_head.build(None) -class TFConvBertClassificationHead(tf.keras.layers.Layer): +class TFConvBertClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) @@ -1193,7 +1194,7 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos self.sequence_summary = TFSequenceSummary( config, initializer_range=config.initializer_range, name="sequence_summary" ) - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1302,8 +1303,8 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1386,7 +1387,7 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer self.num_labels = config.num_labels self.convbert = TFConvBertMainLayer(config, name="convbert") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 78f635456b..b92ac446d9 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -29,6 +29,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -44,7 +45,7 @@ _CONFIG_FOR_DOC = "ConvNextConfig" _CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224" -class TFConvNextDropPath(tf.keras.layers.Layer): +class TFConvNextDropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). References: (1) github.com:rwightman/pytorch-image-models @@ -64,22 +65,22 @@ class TFConvNextDropPath(tf.keras.layers.Layer): return x -class TFConvNextEmbeddings(tf.keras.layers.Layer): +class TFConvNextEmbeddings(keras.layers.Layer): """This class is comparable to (and inspired by) the SwinEmbeddings class found in src/transformers/models/swin/modeling_swin.py. """ def __init__(self, config: ConvNextConfig, **kwargs): super().__init__(**kwargs) - self.patch_embeddings = tf.keras.layers.Conv2D( + self.patch_embeddings = keras.layers.Conv2D( filters=config.hidden_sizes[0], kernel_size=config.patch_size, strides=config.patch_size, name="patch_embeddings", kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), ) - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.num_channels = config.num_channels self.config = config @@ -93,7 +94,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -114,7 +115,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): self.layernorm.build([None, None, None, self.config.hidden_sizes[0]]) -class TFConvNextLayer(tf.keras.layers.Layer): +class TFConvNextLayer(keras.layers.Layer): """This corresponds to the `Block` class in the original implementation. There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C, @@ -133,7 +134,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): super().__init__(**kwargs) self.dim = dim self.config = config - self.dwconv = tf.keras.layers.Conv2D( + self.dwconv = keras.layers.Conv2D( filters=dim, kernel_size=7, padding="same", @@ -142,18 +143,18 @@ class TFConvNextLayer(tf.keras.layers.Layer): bias_initializer="zeros", name="dwconv", ) # depthwise conv - self.layernorm = tf.keras.layers.LayerNormalization( + self.layernorm = keras.layers.LayerNormalization( epsilon=1e-6, name="layernorm", ) - self.pwconv1 = tf.keras.layers.Dense( + self.pwconv1 = keras.layers.Dense( units=4 * dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="pwconv1", ) # pointwise/1x1 convs, implemented with linear layers self.act = get_tf_activation(config.hidden_act) - self.pwconv2 = tf.keras.layers.Dense( + self.pwconv2 = keras.layers.Dense( units=dim, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", @@ -164,7 +165,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): self.drop_path = ( TFConvNextDropPath(drop_path, name="drop_path") if drop_path > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") + else keras.layers.Activation("linear", name="drop_path") ) def build(self, input_shape: tf.TensorShape = None): @@ -172,7 +173,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): self.layer_scale_parameter = ( self.add_weight( shape=(self.dim,), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), trainable=True, name="layer_scale_parameter", ) @@ -214,7 +215,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): return x -class TFConvNextStage(tf.keras.layers.Layer): +class TFConvNextStage(keras.layers.Layer): """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks. Args: @@ -244,7 +245,7 @@ class TFConvNextStage(tf.keras.layers.Layer): super().__init__(**kwargs) if in_channels != out_channels or stride > 1: self.downsampling_layer = [ - tf.keras.layers.LayerNormalization( + keras.layers.LayerNormalization( epsilon=1e-6, name="downsampling_layer.0", ), @@ -253,12 +254,12 @@ class TFConvNextStage(tf.keras.layers.Layer): # layer. All the outputs throughout the model will be in NHWC # from this point on until the output where we again change to # NCHW. - tf.keras.layers.Conv2D( + keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), name="downsampling_layer.1", ), ] @@ -301,7 +302,7 @@ class TFConvNextStage(tf.keras.layers.Layer): self.downsampling_layer[1].build([None, None, None, self.in_channels]) -class TFConvNextEncoder(tf.keras.layers.Layer): +class TFConvNextEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.stages = [] @@ -347,7 +348,7 @@ class TFConvNextEncoder(tf.keras.layers.Layer): @keras_serializable -class TFConvNextMainLayer(tf.keras.layers.Layer): +class TFConvNextMainLayer(keras.layers.Layer): config_class = ConvNextConfig def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs): @@ -356,10 +357,10 @@ class TFConvNextMainLayer(tf.keras.layers.Layer): self.config = config self.embeddings = TFConvNextEmbeddings(config, name="embeddings") self.encoder = TFConvNextEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") # We are setting the `data_format` like so because from here on we will revert to the # NCHW output format - self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None + self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None @unpack_inputs def call( @@ -436,7 +437,7 @@ CONVNEXT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -575,7 +576,7 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas self.convnext = TFConvNextMainLayer(config, name="convnext") # Classifier head - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", diff --git a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py index 048cf78b76..d4bef6f161 100644 --- a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py @@ -34,6 +34,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -67,7 +68,7 @@ CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2 -class TFConvNextV2DropPath(tf.keras.layers.Layer): +class TFConvNextV2DropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). References: (1) github.com:rwightman/pytorch-image-models @@ -87,7 +88,7 @@ class TFConvNextV2DropPath(tf.keras.layers.Layer): return x -class TFConvNextV2GRN(tf.keras.layers.Layer): +class TFConvNextV2GRN(keras.layers.Layer): """GRN (Global Response Normalization) layer""" def __init__(self, config: ConvNextV2Config, dim: int, **kwargs): @@ -99,12 +100,12 @@ class TFConvNextV2GRN(tf.keras.layers.Layer): self.weight = self.add_weight( name="weight", shape=(1, 1, 1, self.dim), - initializer=tf.keras.initializers.Zeros(), + initializer=keras.initializers.Zeros(), ) self.bias = self.add_weight( name="bias", shape=(1, 1, 1, self.dim), - initializer=tf.keras.initializers.Zeros(), + initializer=keras.initializers.Zeros(), ) return super().build(input_shape) @@ -116,22 +117,22 @@ class TFConvNextV2GRN(tf.keras.layers.Layer): # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2 -class TFConvNextV2Embeddings(tf.keras.layers.Layer): +class TFConvNextV2Embeddings(keras.layers.Layer): """This class is comparable to (and inspired by) the SwinEmbeddings class found in src/transformers/models/swin/modeling_swin.py. """ def __init__(self, config: ConvNextV2Config, **kwargs): super().__init__(**kwargs) - self.patch_embeddings = tf.keras.layers.Conv2D( + self.patch_embeddings = keras.layers.Conv2D( filters=config.hidden_sizes[0], kernel_size=config.patch_size, strides=config.patch_size, name="patch_embeddings", kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), ) - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.num_channels = config.num_channels self.config = config @@ -145,7 +146,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer): message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -166,7 +167,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer): self.layernorm.build([None, None, None, self.config.hidden_sizes[0]]) -class TFConvNextV2Layer(tf.keras.layers.Layer): +class TFConvNextV2Layer(keras.layers.Layer): """This corresponds to the `Block` class in the original implementation. There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C, @@ -188,31 +189,31 @@ class TFConvNextV2Layer(tf.keras.layers.Layer): super().__init__(**kwargs) self.dim = dim self.config = config - self.dwconv = tf.keras.layers.Conv2D( + self.dwconv = keras.layers.Conv2D( filters=dim, kernel_size=7, padding="same", groups=dim, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), name="dwconv", ) # depthwise conv - self.layernorm = tf.keras.layers.LayerNormalization( + self.layernorm = keras.layers.LayerNormalization( epsilon=1e-6, name="layernorm", ) - self.pwconv1 = tf.keras.layers.Dense( + self.pwconv1 = keras.layers.Dense( units=4 * dim, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), name="pwconv1", ) # pointwise/1x1 convs, implemented with linear layers self.act = get_tf_activation(config.hidden_act) self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn") - self.pwconv2 = tf.keras.layers.Dense( + self.pwconv2 = keras.layers.Dense( units=dim, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), name="pwconv2", ) # Using `layers.Activation` instead of `tf.identity` to better control `training` @@ -220,7 +221,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer): self.drop_path = ( TFConvNextV2DropPath(drop_path, name="drop_path") if drop_path > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") + else keras.layers.Activation("linear", name="drop_path") ) def call(self, hidden_states, training=False): @@ -260,7 +261,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer): # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2 -class TFConvNextV2Stage(tf.keras.layers.Layer): +class TFConvNextV2Stage(keras.layers.Layer): """ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks. Args: @@ -290,7 +291,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer): super().__init__(**kwargs) if in_channels != out_channels or stride > 1: self.downsampling_layer = [ - tf.keras.layers.LayerNormalization( + keras.layers.LayerNormalization( epsilon=1e-6, name="downsampling_layer.0", ), @@ -299,12 +300,12 @@ class TFConvNextV2Stage(tf.keras.layers.Layer): # layer. All the outputs throughout the model will be in NHWC # from this point on until the output where we again change to # NCHW. - tf.keras.layers.Conv2D( + keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), name="downsampling_layer.1", ), ] @@ -347,7 +348,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer): self.downsampling_layer[1].build([None, None, None, self.in_channels]) -class TFConvNextV2Encoder(tf.keras.layers.Layer): +class TFConvNextV2Encoder(keras.layers.Layer): def __init__(self, config: ConvNextV2Config, **kwargs): super().__init__(**kwargs) self.stages = [] @@ -398,7 +399,7 @@ class TFConvNextV2Encoder(tf.keras.layers.Layer): @keras_serializable -class TFConvNextV2MainLayer(tf.keras.layers.Layer): +class TFConvNextV2MainLayer(keras.layers.Layer): config_class = ConvNextV2Config def __init__(self, config: ConvNextV2Config, **kwargs): @@ -407,10 +408,10 @@ class TFConvNextV2MainLayer(tf.keras.layers.Layer): self.config = config self.embeddings = TFConvNextV2Embeddings(config, name="embeddings") self.encoder = TFConvNextV2Encoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") # We are setting the `data_format` like so because from here on we will revert to the # NCHW output format - self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_last") + self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_last") @unpack_inputs def call( @@ -489,7 +490,7 @@ CONVNEXTV2_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -614,10 +615,10 @@ class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequence self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2") # Classifier head - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), - bias_initializer=tf.keras.initializers.Zeros(), + bias_initializer=keras.initializers.Zeros(), name="classifier", ) diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 7619bbfd89..b0dc90424b 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -29,6 +29,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -90,7 +91,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N return output, attention_weights -class TFMultiHeadAttention(tf.keras.layers.Layer): +class TFMultiHeadAttention(keras.layers.Layer): def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): super().__init__(**kwargs) self.num_heads = num_heads @@ -99,11 +100,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.depth = int(d_model_size / self.num_heads) - self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq") - self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk") - self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv") + self.Wq = keras.layers.Dense(d_model_size, name="Wq") + self.Wk = keras.layers.Dense(d_model_size, name="Wk") + self.Wv = keras.layers.Dense(d_model_size, name="Wv") - self.dense = tf.keras.layers.Dense(d_model_size, name="dense") + self.dense = keras.layers.Dense(d_model_size, name="dense") def split_into_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -160,12 +161,12 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.dense.build([None, None, self.d_model_size]) -class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer): +class TFPointWiseFeedForwardLayer(keras.layers.Layer): def __init__(self, d_model_size, dff, **kwargs): super().__init__(**kwargs) - self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="0") - self.dense_2 = tf.keras.layers.Dense(d_model_size, name="2") + self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0") + self.dense_2 = keras.layers.Dense(d_model_size, name="2") self.d_model_size = d_model_size self.dff = dff @@ -187,7 +188,7 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer): self.dense_2.build([None, None, self.dff]) -class TFEncoderLayer(tf.keras.layers.Layer): +class TFEncoderLayer(keras.layers.Layer): def __init__( self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs ): @@ -200,11 +201,11 @@ class TFEncoderLayer(tf.keras.layers.Layer): ) self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn") - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") + self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") + self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") - self.dropout1 = tf.keras.layers.Dropout(rate) - self.dropout2 = tf.keras.layers.Dropout(rate) + self.dropout1 = keras.layers.Dropout(rate) + self.dropout2 = keras.layers.Dropout(rate) self.d_model_size = d_model_size def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): @@ -252,7 +253,7 @@ class TFEncoderLayer(tf.keras.layers.Layer): @keras_serializable -class TFCTRLMainLayer(tf.keras.layers.Layer): +class TFCTRLMainLayer(keras.layers.Layer): config_class = CTRLConfig def __init__(self, config, **kwargs): @@ -269,14 +270,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) - self.w = tf.keras.layers.Embedding( + self.w = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.n_embd, embeddings_initializer=get_initializer(config.initializer_range), name="w", ) - self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) + self.dropout = keras.layers.Dropout(config.embd_pdrop) self.h = [ TFEncoderLayer( config.n_embd, @@ -289,7 +290,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ) for i in range(config.n_layer) ] - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") def get_input_embeddings(self): return self.w @@ -476,7 +477,7 @@ CTRL_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -635,9 +636,9 @@ class TFCTRLModel(TFCTRLPreTrainedModel): self.transformer.build(None) -class TFCTRLBiasLayer(tf.keras.layers.Layer): +class TFCTRLBiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ @@ -812,7 +813,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py index 061a80eb45..c69973bdc8 100644 --- a/src/transformers/models/cvt/modeling_tf_cvt.py +++ b/src/transformers/models/cvt/modeling_tf_cvt.py @@ -29,6 +29,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -80,7 +81,7 @@ class TFBaseModelOutputWithCLSToken(ModelOutput): hidden_states: Tuple[tf.Tensor, ...] | None = None -class TFCvtDropPath(tf.keras.layers.Layer): +class TFCvtDropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). References: (1) github.com:rwightman/pytorch-image-models @@ -100,7 +101,7 @@ class TFCvtDropPath(tf.keras.layers.Layer): return (x / keep_prob) * random_tensor -class TFCvtEmbeddings(tf.keras.layers.Layer): +class TFCvtEmbeddings(keras.layers.Layer): """Construct the Convolutional Token Embeddings.""" def __init__( @@ -124,7 +125,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer): padding=padding, name="convolution_embeddings", ) - self.dropout = tf.keras.layers.Dropout(dropout_rate) + self.dropout = keras.layers.Dropout(dropout_rate) def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.convolution_embeddings(pixel_values) @@ -140,7 +141,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer): self.convolution_embeddings.build(None) -class TFCvtConvEmbeddings(tf.keras.layers.Layer): +class TFCvtConvEmbeddings(keras.layers.Layer): """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.""" def __init__( @@ -154,9 +155,9 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer): **kwargs, ): super().__init__(**kwargs) - self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) + self.padding = keras.layers.ZeroPadding2D(padding=padding) self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( filters=embed_dim, kernel_size=patch_size, strides=stride, @@ -166,7 +167,7 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer): name="projection", ) # Using the same default epsilon as PyTorch - self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") + self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") self.num_channels = num_channels self.embed_dim = embed_dim @@ -198,13 +199,13 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer): self.normalization.build([None, None, self.embed_dim]) -class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): +class TFCvtSelfAttentionConvProjection(keras.layers.Layer): """Convolutional projection layer.""" def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs): super().__init__(**kwargs) - self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) - self.convolution = tf.keras.layers.Conv2D( + self.padding = keras.layers.ZeroPadding2D(padding=padding) + self.convolution = keras.layers.Conv2D( filters=embed_dim, kernel_size=kernel_size, kernel_initializer=get_initializer(config.initializer_range), @@ -215,7 +216,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): groups=embed_dim, ) # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum) - self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -235,7 +236,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): self.normalization.build([None, None, None, self.embed_dim]) -class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): +class TFCvtSelfAttentionLinearProjection(keras.layers.Layer): """Linear projection layer used to flatten tokens into 1D.""" def call(self, hidden_state: tf.Tensor) -> tf.Tensor: @@ -246,7 +247,7 @@ class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): return hidden_state -class TFCvtSelfAttentionProjection(tf.keras.layers.Layer): +class TFCvtSelfAttentionProjection(keras.layers.Layer): """Convolutional Projection for Attention.""" def __init__( @@ -280,7 +281,7 @@ class TFCvtSelfAttentionProjection(tf.keras.layers.Layer): self.convolution_projection.build(None) -class TFCvtSelfAttention(tf.keras.layers.Layer): +class TFCvtSelfAttention(keras.layers.Layer): """ Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for query, key, and value embeddings. @@ -336,28 +337,28 @@ class TFCvtSelfAttention(tf.keras.layers.Layer): name="convolution_projection_value", ) - self.projection_query = tf.keras.layers.Dense( + self.projection_query = keras.layers.Dense( units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), use_bias=qkv_bias, bias_initializer="zeros", name="projection_query", ) - self.projection_key = tf.keras.layers.Dense( + self.projection_key = keras.layers.Dense( units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), use_bias=qkv_bias, bias_initializer="zeros", name="projection_key", ) - self.projection_value = tf.keras.layers.Dense( + self.projection_value = keras.layers.Dense( units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), use_bias=qkv_bias, bias_initializer="zeros", name="projection_value", ) - self.dropout = tf.keras.layers.Dropout(attention_drop_rate) + self.dropout = keras.layers.Dropout(attention_drop_rate) def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor: batch_size, hidden_size, _ = shape_list(hidden_state) @@ -424,15 +425,15 @@ class TFCvtSelfAttention(tf.keras.layers.Layer): self.projection_value.build([None, None, self.embed_dim]) -class TFCvtSelfOutput(tf.keras.layers.Layer): +class TFCvtSelfOutput(keras.layers.Layer): """Output of the Attention layer .""" def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(drop_rate) + self.dropout = keras.layers.Dropout(drop_rate) self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -449,7 +450,7 @@ class TFCvtSelfOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.embed_dim]) -class TFCvtAttention(tf.keras.layers.Layer): +class TFCvtAttention(keras.layers.Layer): """Attention layer. First chunk of the convolutional transformer block.""" def __init__( @@ -507,12 +508,12 @@ class TFCvtAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFCvtIntermediate(tf.keras.layers.Layer): +class TFCvtIntermediate(keras.layers.Layer): """Intermediate dense layer. Second chunk of the convolutional transformer block.""" def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=int(embed_dim * mlp_ratio), kernel_initializer=get_initializer(config.initializer_range), activation="gelu", @@ -533,17 +534,17 @@ class TFCvtIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.embed_dim]) -class TFCvtOutput(tf.keras.layers.Layer): +class TFCvtOutput(keras.layers.Layer): """ Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection. """ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(drop_rate) + self.dropout = keras.layers.Dropout(drop_rate) self.embed_dim = embed_dim self.mlp_ratio = mlp_ratio @@ -562,7 +563,7 @@ class TFCvtOutput(tf.keras.layers.Layer): self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)]) -class TFCvtLayer(tf.keras.layers.Layer): +class TFCvtLayer(keras.layers.Layer): """ Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the @@ -611,11 +612,11 @@ class TFCvtLayer(tf.keras.layers.Layer): self.drop_path = ( TFCvtDropPath(drop_path_rate, name="drop_path") if drop_path_rate > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") + else keras.layers.Activation("linear", name="drop_path") ) # Using the same default epsilon as PyTorch - self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") - self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") + self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: @@ -659,7 +660,7 @@ class TFCvtLayer(tf.keras.layers.Layer): self.layernorm_after.build([None, None, self.embed_dim]) -class TFCvtStage(tf.keras.layers.Layer): +class TFCvtStage(keras.layers.Layer): """ Cvt stage (encoder block). Each stage has 2 parts : - (1) A Convolutional Token Embedding layer @@ -755,7 +756,7 @@ class TFCvtStage(tf.keras.layers.Layer): layer.build(None) -class TFCvtEncoder(tf.keras.layers.Layer): +class TFCvtEncoder(keras.layers.Layer): """ Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers (depth) being 1, 2 and 10. @@ -782,7 +783,7 @@ class TFCvtEncoder(tf.keras.layers.Layer): ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None hidden_state = pixel_values - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width) + # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width) # as input format. So change the input format to (batch_size, height, width, num_channels). hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1)) @@ -817,7 +818,7 @@ class TFCvtEncoder(tf.keras.layers.Layer): @keras_serializable -class TFCvtMainLayer(tf.keras.layers.Layer): +class TFCvtMainLayer(keras.layers.Layer): """Construct the Cvt model.""" config_class = CvtConfig @@ -882,7 +883,7 @@ TFCVT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -893,7 +894,7 @@ TFCVT_START_DOCSTRING = r""" - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. - This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the + This second option is useful when using [`keras.Model.fit`] method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. @@ -1006,10 +1007,10 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification self.num_labels = config.num_labels self.cvt = TFCvtMainLayer(config, name="cvt") # Using same default epsilon as in the original implementation. - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm") # Classifier head - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), use_bias=True, diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index a8fc372db6..bc8ff9cfc9 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -37,6 +37,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -101,7 +102,7 @@ class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling): attentions: Tuple[tf.Tensor] | None = None -class TFData2VecVisionDropPath(tf.keras.layers.Layer): +class TFData2VecVisionDropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). References: (1) github.com:rwightman/pytorch-image-models @@ -121,7 +122,7 @@ class TFData2VecVisionDropPath(tf.keras.layers.Layer): return x -class TFData2VecVisionEmbeddings(tf.keras.layers.Layer): +class TFData2VecVisionEmbeddings(keras.layers.Layer): """ Construct the CLS token, position and patch embeddings. Optionally, also the mask token. @@ -135,7 +136,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer): self.num_patches = self.patch_embeddings.num_patches self.config = config - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape=None): self.cls_token = self.add_weight( @@ -193,7 +194,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer): return embeddings -class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): +class TFData2VecVisionPatchEmbeddings(keras.layers.Layer): """ Image to Patch Embedding. """ @@ -215,7 +216,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): self.patch_shape = patch_shape self.num_channels = num_channels - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( filters=hidden_size, kernel_size=patch_size, strides=patch_size, @@ -240,7 +241,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): f" ({self.image_size[0]}*{self.image_size[1]})." ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -262,7 +263,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): self.projection.build([None, None, None, self.num_channels]) -class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): +class TFData2VecVisionSelfAttention(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): super().__init__(**kwargs) @@ -277,19 +278,19 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key", use_bias=False, ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) if window_size: self.relative_position_bias = TFData2VecVisionRelativePositionBias( @@ -376,7 +377,7 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): self.relative_position_bias.build(None) -class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): +class TFData2VecVisionSelfOutput(keras.layers.Layer): """ The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due to the layernorm applied before each block. @@ -385,10 +386,10 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor: @@ -406,7 +407,7 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFData2VecVisionAttention(tf.keras.layers.Layer): +class TFData2VecVisionAttention(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): super().__init__(**kwargs) @@ -451,11 +452,11 @@ class TFData2VecVisionAttention(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision -class TFData2VecVisionIntermediate(tf.keras.layers.Layer): +class TFData2VecVisionIntermediate(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -480,14 +481,14 @@ class TFData2VecVisionIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFData2VecVisionOutput(tf.keras.layers.Layer): +class TFData2VecVisionOutput(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -505,7 +506,7 @@ class TFData2VecVisionOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.intermediate_size]) -class TFData2VecVisionLayer(tf.keras.layers.Layer): +class TFData2VecVisionLayer(keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" def __init__( @@ -518,18 +519,14 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer): self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate") self.data2vec_output = TFData2VecVisionOutput(config, name="output") - self.layernorm_before = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_before" - ) - self.layernorm_after = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_after" - ) + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") # Using `layers.Activation` instead of `tf.identity` to better control `training` # behaviour. self.drop_path = ( TFData2VecVisionDropPath(drop_path_rate, name="drop_path") if drop_path_rate > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") + else keras.layers.Activation("linear", name="drop_path") ) self.init_values = config.layer_scale_init_value @@ -619,7 +616,7 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer): # Taken and modified from here: # https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28 -class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer): +class TFData2VecVisionRelativePositionBias(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None: super().__init__(**kwargs) self.config = config @@ -675,7 +672,7 @@ class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer): return tf.transpose(relative_position_bias, [2, 0, 1]) -class TFData2VecVisionEncoder(tf.keras.layers.Layer): +class TFData2VecVisionEncoder(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): super().__init__(**kwargs) self.config = config @@ -753,7 +750,7 @@ class TFData2VecVisionEncoder(tf.keras.layers.Layer): @keras_serializable -class TFData2VecVisionMainLayer(tf.keras.layers.Layer): +class TFData2VecVisionMainLayer(keras.layers.Layer): config_class = Data2VecVisionConfig def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs): @@ -769,14 +766,14 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer): self.layernorm = ( tf.identity if config.use_mean_pooling - else tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + else keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") ) # We are setting the `data_format` like so because from here on we will revert to the # NCHW output format self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): @@ -861,11 +858,11 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer): self.pooler.build(None) -class TFData2VecVisionPooler(tf.keras.layers.Layer): +class TFData2VecVisionPooler(keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, **kwargs): super().__init__(**kwargs) self.layernorm = ( - tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") if config.use_mean_pooling else None ) @@ -909,7 +906,7 @@ DATA2VEC_VISION_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.). - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1049,7 +1046,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision") # Classifier head - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1118,7 +1115,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF self.classifier.build([None, None, self.config.hidden_size]) -class TFData2VecVisionConvModule(tf.keras.layers.Layer): +class TFData2VecVisionConvModule(keras.layers.Layer): """ A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). @@ -1137,7 +1134,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer): **kwargs, ) -> None: super().__init__(**kwargs) - self.conv = tf.keras.layers.Conv2D( + self.conv = keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, padding=padding, @@ -1145,7 +1142,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer): dilation_rate=dilation, name="conv", ) - self.bn = tf.keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5) + self.bn = keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5) self.activation = tf.nn.relu self.in_channels = in_channels self.out_channels = out_channels @@ -1168,7 +1165,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer): self.bn.build((None, None, None, self.out_channels)) -class TFAdaptiveAvgPool2D(tf.keras.layers.Layer): +class TFAdaptiveAvgPool2D(keras.layers.Layer): def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs): super().__init__(**kwargs) self.output_dims = output_dims @@ -1292,7 +1289,7 @@ class TFAdaptiveAvgPool2D(tf.keras.layers.Layer): return self.pseudo_1d_pool(h_pooled, h_pooling=False) -class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer): +class TFData2VecVisionPyramidPoolingModule(keras.layers.Layer): """ Pyramid Pooling Module (PPM) used in PSPNet. @@ -1342,7 +1339,7 @@ class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer): layer_module.build(None) -class TFData2VecVisionUperHead(tf.keras.layers.Layer): +class TFData2VecVisionUperHead(keras.layers.Layer): """ Unified Perceptual Parsing for Scene Understanding. This head is the implementation of [UPerNet](https://arxiv.org/abs/1807.10221). @@ -1356,7 +1353,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer): self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6) self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768] self.channels = config.hidden_size - self.classifier = tf.keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier") + self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier") # PSP Module self.psp_modules = TFData2VecVisionPyramidPoolingModule( @@ -1452,7 +1449,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer): layer.build(None) -class TFData2VecVisionFCNHead(tf.keras.layers.Layer): +class TFData2VecVisionFCNHead(keras.layers.Layer): """ Fully Convolution Networks for Semantic Segmentation. This head is implemented from [FCNNet](https://arxiv.org/abs/1411.4038). @@ -1516,7 +1513,7 @@ class TFData2VecVisionFCNHead(tf.keras.layers.Layer): name="conv_cat", ) - self.classifier = tf.keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier") + self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier") def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor: # just take the relevant feature maps @@ -1555,15 +1552,15 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel): # FPNs self.fpn1 = [ - tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"), - tf.keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5), - tf.keras.layers.Activation("gelu"), - tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"), + keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"), + keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5), + keras.layers.Activation("gelu"), + keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"), ] - self.fpn2 = [tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")] + self.fpn2 = [keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")] self.fpn3 = tf.identity - self.fpn4 = tf.keras.layers.MaxPool2D(pool_size=2, strides=2) + self.fpn4 = keras.layers.MaxPool2D(pool_size=2, strides=2) # Semantic segmentation head(s) self.decode_head = TFData2VecVisionUperHead(config, name="decode_head") @@ -1582,7 +1579,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel): if auxiliary_logits is not None: upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear") # compute weighted loss - loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") + loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") # Copied from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics. # Utility to mask the index to ignore during computing the loss. diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index 0509403bb0..2a2a586c35 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -39,6 +39,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, unpack_inputs, ) from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax @@ -58,10 +59,10 @@ TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFDebertaContextPooler(tf.keras.layers.Layer): +class TFDebertaContextPooler(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense") + self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense") self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout") self.config = config @@ -90,7 +91,7 @@ class TFDebertaContextPooler(tf.keras.layers.Layer): self.dropout.build(None) -class TFDebertaXSoftmax(tf.keras.layers.Layer): +class TFDebertaXSoftmax(keras.layers.Layer): """ Masked Softmax which is optimized for saving memory @@ -112,7 +113,7 @@ class TFDebertaXSoftmax(tf.keras.layers.Layer): return output -class TFDebertaStableDropout(tf.keras.layers.Layer): +class TFDebertaStableDropout(keras.layers.Layer): """ Optimized dropout module for stabilizing the training @@ -152,7 +153,7 @@ class TFDebertaStableDropout(tf.keras.layers.Layer): return inputs -class TFDebertaLayerNorm(tf.keras.layers.Layer): +class TFDebertaLayerNorm(keras.layers.Layer): """LayerNorm module in the TF style (epsilon inside the square root).""" def __init__(self, size, eps=1e-12, **kwargs): @@ -172,11 +173,11 @@ class TFDebertaLayerNorm(tf.keras.layers.Layer): return self.gamma * (x - mean) / std + self.beta -class TFDebertaSelfOutput(tf.keras.layers.Layer): +class TFDebertaSelfOutput(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.hidden_size, name="dense") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") self.config = config @@ -201,7 +202,7 @@ class TFDebertaSelfOutput(tf.keras.layers.Layer): self.dropout.build(None) -class TFDebertaAttention(tf.keras.layers.Layer): +class TFDebertaAttention(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) self.self = TFDebertaDisentangledSelfAttention(config, name="self") @@ -249,11 +250,11 @@ class TFDebertaAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFDebertaIntermediate(tf.keras.layers.Layer): +class TFDebertaIntermediate(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -278,14 +279,14 @@ class TFDebertaIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFDebertaOutput(tf.keras.layers.Layer): +class TFDebertaOutput(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") self.config = config @@ -311,7 +312,7 @@ class TFDebertaOutput(tf.keras.layers.Layer): self.dropout.build(None) -class TFDebertaLayer(tf.keras.layers.Layer): +class TFDebertaLayer(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) @@ -362,7 +363,7 @@ class TFDebertaLayer(tf.keras.layers.Layer): self.bert_output.build(None) -class TFDebertaEncoder(tf.keras.layers.Layer): +class TFDebertaEncoder(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) @@ -543,7 +544,7 @@ def torch_gather(x, indices, gather_axis): return gathered -class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): +class TFDebertaDisentangledSelfAttention(keras.layers.Layer): """ Disentangled self-attention module @@ -564,7 +565,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.in_proj = tf.keras.layers.Dense( + self.in_proj = keras.layers.Dense( self.all_head_size * 3, kernel_initializer=get_initializer(config.initializer_range), name="in_proj", @@ -576,13 +577,13 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): self.talking_head = getattr(config, "talking_head", False) if self.talking_head: - self.head_logits_proj = tf.keras.layers.Dense( + self.head_logits_proj = keras.layers.Dense( self.num_attention_heads, kernel_initializer=get_initializer(config.initializer_range), name="head_logits_proj", use_bias=False, ) - self.head_weights_proj = tf.keras.layers.Dense( + self.head_weights_proj = keras.layers.Dense( self.num_attention_heads, kernel_initializer=get_initializer(config.initializer_range), name="head_weights_proj", @@ -597,14 +598,14 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): self.max_relative_positions = config.max_position_embeddings self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout") if "c2p" in self.pos_att_type: - self.pos_proj = tf.keras.layers.Dense( + self.pos_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_proj", use_bias=False, ) if "p2c" in self.pos_att_type: - self.pos_q_proj = tf.keras.layers.Dense( + self.pos_q_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj" ) @@ -616,10 +617,10 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): return self.built = True self.q_bias = self.add_weight( - name="q_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros() + name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros() ) self.v_bias = self.add_weight( - name="v_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros() + name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros() ) if getattr(self, "in_proj", None) is not None: with tf.name_scope(self.in_proj.name): @@ -818,7 +819,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): return score -class TFDebertaEmbeddings(tf.keras.layers.Layer): +class TFDebertaEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): @@ -831,13 +832,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): self.position_biased_input = getattr(config, "position_biased_input", True) self.initializer_range = config.initializer_range if self.embedding_size != config.hidden_size: - self.embed_proj = tf.keras.layers.Dense( + self.embed_proj = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="embed_proj", use_bias=False, ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") def build(self, input_shape=None): @@ -937,13 +938,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): +class TFDebertaPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): super().__init__(**kwargs) self.embedding_size = getattr(config, "embedding_size", config.hidden_size) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=self.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -953,7 +954,7 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): self.transform_act_fn = get_tf_activation(config.hidden_act) else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -975,8 +976,8 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.embedding_size]) -class TFDebertaLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFDebertaLMPredictionHead(keras.layers.Layer): + def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -998,7 +999,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -1023,8 +1024,8 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): return hidden_states -class TFDebertaOnlyMLMHead(tf.keras.layers.Layer): - def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFDebertaOnlyMLMHead(keras.layers.Layer): + def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions") @@ -1043,7 +1044,7 @@ class TFDebertaOnlyMLMHead(tf.keras.layers.Layer): # @keras_serializable -class TFDebertaMainLayer(tf.keras.layers.Layer): +class TFDebertaMainLayer(keras.layers.Layer): config_class = DebertaConfig def __init__(self, config: DebertaConfig, **kwargs): @@ -1054,7 +1055,7 @@ class TFDebertaMainLayer(tf.keras.layers.Layer): self.embeddings = TFDebertaEmbeddings(config, name="embeddings") self.encoder = TFDebertaEncoder(config, name="encoder") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -1153,7 +1154,7 @@ DEBERTA_START_DOCSTRING = r""" on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1299,7 +1300,7 @@ class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLos self.deberta = TFDebertaMainLayer(config, name="deberta") self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions @unpack_inputs @@ -1385,7 +1386,7 @@ class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceCla drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout") - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1479,8 +1480,8 @@ class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassific self.num_labels = config.num_labels self.deberta = TFDebertaMainLayer(config, name="deberta") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1562,7 +1563,7 @@ class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnswerin self.num_labels = config.num_labels self.deberta = TFDebertaMainLayer(config, name="deberta") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 60ef671e1e..05b222ec8a 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -39,6 +39,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, unpack_inputs, ) from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax @@ -58,10 +59,10 @@ TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaContextPooler with Deberta->DebertaV2 -class TFDebertaV2ContextPooler(tf.keras.layers.Layer): +class TFDebertaV2ContextPooler(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense") + self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense") self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout") self.config = config @@ -91,7 +92,7 @@ class TFDebertaV2ContextPooler(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXSoftmax with Deberta->DebertaV2 -class TFDebertaV2XSoftmax(tf.keras.layers.Layer): +class TFDebertaV2XSoftmax(keras.layers.Layer): """ Masked Softmax which is optimized for saving memory @@ -114,7 +115,7 @@ class TFDebertaV2XSoftmax(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2 -class TFDebertaV2StableDropout(tf.keras.layers.Layer): +class TFDebertaV2StableDropout(keras.layers.Layer): """ Optimized dropout module for stabilizing the training @@ -155,11 +156,11 @@ class TFDebertaV2StableDropout(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaSelfOutput with Deberta->DebertaV2 -class TFDebertaV2SelfOutput(tf.keras.layers.Layer): +class TFDebertaV2SelfOutput(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.hidden_size, name="dense") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.config = config @@ -185,7 +186,7 @@ class TFDebertaV2SelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2 -class TFDebertaV2Attention(tf.keras.layers.Layer): +class TFDebertaV2Attention(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) self.self = TFDebertaV2DisentangledSelfAttention(config, name="self") @@ -234,11 +235,11 @@ class TFDebertaV2Attention(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2 -class TFDebertaV2Intermediate(tf.keras.layers.Layer): +class TFDebertaV2Intermediate(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -264,14 +265,14 @@ class TFDebertaV2Intermediate(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2 -class TFDebertaV2Output(tf.keras.layers.Layer): +class TFDebertaV2Output(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.config = config @@ -298,7 +299,7 @@ class TFDebertaV2Output(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2 -class TFDebertaV2Layer(tf.keras.layers.Layer): +class TFDebertaV2Layer(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) @@ -349,7 +350,7 @@ class TFDebertaV2Layer(tf.keras.layers.Layer): self.bert_output.build(None) -class TFDebertaV2ConvLayer(tf.keras.layers.Layer): +class TFDebertaV2ConvLayer(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) @@ -357,7 +358,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer): # groups = getattr(config, "conv_groups", 1) self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh")) self.padding = (self.kernel_size - 1) // 2 - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.config = config @@ -412,7 +413,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer): return output_states -class TFDebertaV2Encoder(tf.keras.layers.Layer): +class TFDebertaV2Encoder(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) @@ -433,7 +434,7 @@ class TFDebertaV2Encoder(tf.keras.layers.Layer): self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] if "layer_norm" in self.norm_rel_ebd: - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None @@ -634,7 +635,7 @@ def take_along_axis(x, indices): return gathered -class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): +class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer): """ Disentangled self-attention module @@ -656,19 +657,19 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): _attention_head_size = config.hidden_size // config.num_attention_heads self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query_proj = tf.keras.layers.Dense( + self.query_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query_proj", use_bias=True, ) - self.key_proj = tf.keras.layers.Dense( + self.key_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key_proj", use_bias=True, ) - self.value_proj = tf.keras.layers.Dense( + self.value_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value_proj", @@ -692,14 +693,14 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): if not self.share_att_key: if "c2p" in self.pos_att_type: - self.pos_key_proj = tf.keras.layers.Dense( + self.pos_key_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_proj", use_bias=True, ) if "p2c" in self.pos_att_type: - self.pos_query_proj = tf.keras.layers.Dense( + self.pos_query_proj = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj", @@ -925,7 +926,7 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaEmbeddings Deberta->DebertaV2 -class TFDebertaV2Embeddings(tf.keras.layers.Layer): +class TFDebertaV2Embeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): @@ -938,13 +939,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): self.position_biased_input = getattr(config, "position_biased_input", True) self.initializer_range = config.initializer_range if self.embedding_size != config.hidden_size: - self.embed_proj = tf.keras.layers.Dense( + self.embed_proj = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="embed_proj", use_bias=False, ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") def build(self, input_shape=None): @@ -1045,13 +1046,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPredictionHeadTransform with Deberta->DebertaV2 -class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer): +class TFDebertaV2PredictionHeadTransform(keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): super().__init__(**kwargs) self.embedding_size = getattr(config, "embedding_size", config.hidden_size) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=self.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -1061,7 +1062,7 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer): self.transform_act_fn = get_tf_activation(config.hidden_act) else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -1084,8 +1085,8 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2 -class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFDebertaV2LMPredictionHead(keras.layers.Layer): + def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -1107,7 +1108,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -1133,8 +1134,8 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOnlyMLMHead with Deberta->DebertaV2 -class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer): - def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFDebertaV2OnlyMLMHead(keras.layers.Layer): + def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions") @@ -1153,7 +1154,7 @@ class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer): # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2 -class TFDebertaV2MainLayer(tf.keras.layers.Layer): +class TFDebertaV2MainLayer(keras.layers.Layer): config_class = DebertaV2Config def __init__(self, config: DebertaV2Config, **kwargs): @@ -1164,7 +1165,7 @@ class TFDebertaV2MainLayer(tf.keras.layers.Layer): self.embeddings = TFDebertaV2Embeddings(config, name="embeddings") self.encoder = TFDebertaV2Encoder(config, name="encoder") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -1264,7 +1265,7 @@ DEBERTA_START_DOCSTRING = r""" on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1412,7 +1413,7 @@ class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelin self.deberta = TFDebertaV2MainLayer(config, name="deberta") self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions @unpack_inputs @@ -1499,7 +1500,7 @@ class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenc drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout") - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1594,8 +1595,8 @@ class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClass self.num_labels = config.num_labels self.deberta = TFDebertaV2MainLayer(config, name="deberta") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1678,7 +1679,7 @@ class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsw self.num_labels = config.num_labels self.deberta = TFDebertaV2MainLayer(config, name="deberta") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config @@ -1777,9 +1778,9 @@ class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceL super().__init__(config, *inputs, **kwargs) self.deberta = TFDebertaV2MainLayer(config, name="deberta") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.pooler = TFDebertaV2ContextPooler(config, name="pooler") - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.output_dim = self.pooler.output_dim diff --git a/src/transformers/models/deit/modeling_tf_deit.py b/src/transformers/models/deit/modeling_tf_deit.py index 24d4a60aa3..c6215c63b8 100644 --- a/src/transformers/models/deit/modeling_tf_deit.py +++ b/src/transformers/models/deit/modeling_tf_deit.py @@ -35,6 +35,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -101,7 +102,7 @@ class TFDeiTForImageClassificationWithTeacherOutput(ModelOutput): attentions: Tuple[tf.Tensor] | None = None -class TFDeiTEmbeddings(tf.keras.layers.Layer): +class TFDeiTEmbeddings(keras.layers.Layer): """ Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token. """ @@ -111,18 +112,18 @@ class TFDeiTEmbeddings(tf.keras.layers.Layer): self.config = config self.use_mask_token = use_mask_token self.patch_embeddings = TFDeiTPatchEmbeddings(config=config, name="patch_embeddings") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") def build(self, input_shape=None): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), - initializer=tf.keras.initializers.zeros(), + initializer=keras.initializers.zeros(), trainable=True, name="cls_token", ) self.distillation_token = self.add_weight( shape=(1, 1, self.config.hidden_size), - initializer=tf.keras.initializers.zeros(), + initializer=keras.initializers.zeros(), trainable=True, name="distillation_token", ) @@ -130,14 +131,14 @@ class TFDeiTEmbeddings(tf.keras.layers.Layer): if self.use_mask_token: self.mask_token = self.add_weight( shape=(1, 1, self.config.hidden_size), - initializer=tf.keras.initializers.zeros(), + initializer=keras.initializers.zeros(), trainable=True, name="mask_token", ) num_patches = self.patch_embeddings.num_patches self.position_embeddings = self.add_weight( shape=(1, num_patches + 2, self.config.hidden_size), - initializer=tf.keras.initializers.zeros(), + initializer=keras.initializers.zeros(), trainable=True, name="position_embeddings", ) @@ -173,7 +174,7 @@ class TFDeiTEmbeddings(tf.keras.layers.Layer): return embeddings -class TFDeiTPatchEmbeddings(tf.keras.layers.Layer): +class TFDeiTPatchEmbeddings(keras.layers.Layer): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -193,7 +194,7 @@ class TFDeiTPatchEmbeddings(tf.keras.layers.Layer): self.num_channels = num_channels self.num_patches = num_patches - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( hidden_size, kernel_size=patch_size, strides=patch_size, name="projection" ) @@ -222,7 +223,7 @@ class TFDeiTPatchEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->DeiT -class TFDeiTSelfAttention(tf.keras.layers.Layer): +class TFDeiTSelfAttention(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) @@ -237,16 +238,16 @@ class TFDeiTSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: @@ -313,7 +314,7 @@ class TFDeiTSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->DeiT -class TFDeiTSelfOutput(tf.keras.layers.Layer): +class TFDeiTSelfOutput(keras.layers.Layer): """ The residual connection is defined in TFDeiTLayer instead of here (as is the case with other models), due to the layernorm applied before each block. @@ -322,10 +323,10 @@ class TFDeiTSelfOutput(tf.keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -344,7 +345,7 @@ class TFDeiTSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->DeiT -class TFDeiTAttention(tf.keras.layers.Layer): +class TFDeiTAttention(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) @@ -384,11 +385,11 @@ class TFDeiTAttention(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->DeiT -class TFDeiTIntermediate(tf.keras.layers.Layer): +class TFDeiTIntermediate(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -414,14 +415,14 @@ class TFDeiTIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->DeiT -class TFDeiTOutput(tf.keras.layers.Layer): +class TFDeiTOutput(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -440,7 +441,7 @@ class TFDeiTOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.intermediate_size]) -class TFDeiTLayer(tf.keras.layers.Layer): +class TFDeiTLayer(keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" def __init__(self, config: DeiTConfig, **kwargs): @@ -450,12 +451,8 @@ class TFDeiTLayer(tf.keras.layers.Layer): self.intermediate = TFDeiTIntermediate(config, name="intermediate") self.deit_output = TFDeiTOutput(config, name="output") - self.layernorm_before = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_before" - ) - self.layernorm_after = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_after" - ) + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") self.config = config def call( @@ -512,7 +509,7 @@ class TFDeiTLayer(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->DeiT -class TFDeiTEncoder(tf.keras.layers.Layer): +class TFDeiTEncoder(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) @@ -567,7 +564,7 @@ class TFDeiTEncoder(tf.keras.layers.Layer): @keras_serializable -class TFDeiTMainLayer(tf.keras.layers.Layer): +class TFDeiTMainLayer(keras.layers.Layer): config_class = DeiTConfig def __init__( @@ -579,7 +576,7 @@ class TFDeiTMainLayer(tf.keras.layers.Layer): self.embeddings = TFDeiTEmbeddings(config, use_mask_token=use_mask_token, name="embeddings") self.encoder = TFDeiTEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.pooler = TFDeiTPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self) -> TFDeiTPatchEmbeddings: @@ -688,7 +685,7 @@ class TFDeiTPreTrainedModel(TFPreTrainedModel): DEIT_START_DOCSTRING = r""" This model is a TensorFlow - [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior. Parameters: @@ -774,11 +771,11 @@ class TFDeiTModel(TFDeiTPreTrainedModel): # Copied from transformers.models.vit.modeling_tf_vit.TFViTPooler with ViT->DeiT -class TFDeiTPooler(tf.keras.layers.Layer): +class TFDeiTPooler(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -803,7 +800,7 @@ class TFDeiTPooler(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFDeitPixelShuffle(tf.keras.layers.Layer): +class TFDeitPixelShuffle(keras.layers.Layer): """TF layer implementation of torch.nn.PixelShuffle""" def __init__(self, upscale_factor: int, **kwargs) -> None: @@ -829,10 +826,10 @@ class TFDeitPixelShuffle(tf.keras.layers.Layer): return hidden_states -class TFDeitDecoder(tf.keras.layers.Layer): +class TFDeitDecoder(keras.layers.Layer): def __init__(self, config: DeiTConfig, **kwargs) -> None: super().__init__(**kwargs) - self.conv2d = tf.keras.layers.Conv2D( + self.conv2d = keras.layers.Conv2D( filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, name="0" ) self.pixel_shuffle = TFDeitPixelShuffle(config.encoder_stride, name="1") @@ -946,7 +943,7 @@ class TFDeiTForMaskedImageModeling(TFDeiTPreTrainedModel): mask = tf.expand_dims(mask, 1) mask = tf.cast(mask, tf.float32) - reconstruction_loss = tf.keras.losses.mean_absolute_error( + reconstruction_loss = keras.losses.mean_absolute_error( # Swap axes as metric calculation reduces over the final dimension tf.transpose(pixel_values, (1, 2, 3, 0)), tf.transpose(reconstructed_pixel_values, (1, 2, 3, 0)), @@ -996,9 +993,9 @@ class TFDeiTForImageClassification(TFDeiTPreTrainedModel, TFSequenceClassificati # Classifier head self.classifier = ( - tf.keras.layers.Dense(config.num_labels, name="classifier") + keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="classifier") + else keras.layers.Activation("linear", name="classifier") ) self.config = config @@ -1031,7 +1028,7 @@ class TFDeiTForImageClassification(TFDeiTPreTrainedModel, TFSequenceClassificati >>> from PIL import Image >>> import requests - >>> tf.keras.utils.set_random_seed(3) # doctest: +IGNORE_RESULT + >>> keras.utils.set_random_seed(3) # doctest: +IGNORE_RESULT >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1110,14 +1107,14 @@ class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel): # Classifier heads self.cls_classifier = ( - tf.keras.layers.Dense(config.num_labels, name="cls_classifier") + keras.layers.Dense(config.num_labels, name="cls_classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="cls_classifier") + else keras.layers.Activation("linear", name="cls_classifier") ) self.distillation_classifier = ( - tf.keras.layers.Dense(config.num_labels, name="distillation_classifier") + keras.layers.Dense(config.num_labels, name="distillation_classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="distillation_classifier") + else keras.layers.Activation("linear", name="distillation_classifier") ) self.config = config diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py index 9ae32f8ceb..c99d834670 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py @@ -30,6 +30,7 @@ from ....modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -56,7 +57,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFPositionalEmbedding(tf.keras.layers.Layer): +class TFPositionalEmbedding(keras.layers.Layer): def __init__(self, demb, **kwargs): super().__init__(**kwargs) @@ -73,7 +74,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer): return pos_emb[:, None, :] -class TFPositionwiseFF(tf.keras.layers.Layer): +class TFPositionwiseFF(keras.layers.Layer): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): super().__init__(**kwargs) @@ -81,14 +82,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer): self.d_inner = d_inner self.dropout = dropout - self.layer_1 = tf.keras.layers.Dense( + self.layer_1 = keras.layers.Dense( d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" ) - self.drop_1 = tf.keras.layers.Dropout(dropout) - self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") - self.drop_2 = tf.keras.layers.Dropout(dropout) + self.drop_1 = keras.layers.Dropout(dropout) + self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") + self.drop_2 = keras.layers.Dropout(dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.pre_lnorm = pre_lnorm @@ -116,7 +117,7 @@ class TFPositionwiseFF(tf.keras.layers.Layer): return output -class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): +class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer): def __init__( self, n_head, @@ -140,17 +141,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.dropout = dropout self.output_attentions = output_attentions - self.qkv_net = tf.keras.layers.Dense( + self.qkv_net = keras.layers.Dense( 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" ) - self.drop = tf.keras.layers.Dropout(dropout) - self.dropatt = tf.keras.layers.Dropout(dropatt) - self.o_net = tf.keras.layers.Dense( + self.drop = keras.layers.Dropout(dropout) + self.dropatt = keras.layers.Dropout(dropatt) + self.o_net = keras.layers.Dense( d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.scale = 1 / (d_head**0.5) @@ -163,7 +164,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.r_r_bias = None self.r_w_bias = None - self.r_net = tf.keras.layers.Dense( + self.r_net = keras.layers.Dense( self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" ) @@ -268,7 +269,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): return outputs -class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): +class TFRelPartialLearnableDecoderLayer(keras.layers.Layer): def __init__( self, n_head, @@ -320,7 +321,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): return outputs -class TFTransfoEmbeddings(tf.keras.layers.Layer): +class TFTransfoEmbeddings(keras.layers.Layer): def __init__(self, vocab_size, emb_size, init_std, **kwargs): super().__init__(**kwargs) @@ -341,7 +342,7 @@ class TFTransfoEmbeddings(tf.keras.layers.Layer): return tf.gather(self.weight, inputs) -class TFAdaptiveEmbedding(tf.keras.layers.Layer): +class TFAdaptiveEmbedding(keras.layers.Layer): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): super().__init__(**kwargs) @@ -418,7 +419,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): @keras_serializable -class TFTransfoXLMainLayer(tf.keras.layers.Layer): +class TFTransfoXLMainLayer(keras.layers.Layer): config_class = TransfoXLConfig def __init__(self, config, **kwargs): @@ -447,7 +448,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): name="word_emb", ) - self.drop = tf.keras.layers.Dropout(config.dropout) + self.drop = keras.layers.Dropout(config.dropout) self.n_layer = config.n_layer self.mem_len = config.mem_len @@ -773,7 +774,7 @@ TRANSFO_XL_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1022,7 +1023,7 @@ class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenc def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.score = tf.keras.layers.Dense( + self.score = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_range), name="score", diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py index c6a380842e..ed1488d559 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py @@ -20,10 +20,11 @@ import tensorflow as tf +from ....modeling_tf_utils import keras from ....tf_utils import shape_list -class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): +class TFAdaptiveSoftmaxMask(keras.layers.Layer): def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 192e256981..39fd470597 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -43,6 +43,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -72,7 +73,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFEmbeddings(tf.keras.layers.Layer): +class TFEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): @@ -81,8 +82,8 @@ class TFEmbeddings(tf.keras.layers.Layer): self.dim = config.dim self.initializer_range = config.initializer_range self.max_position_embeddings = config.max_position_embeddings - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.dropout) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.dropout) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -132,27 +133,27 @@ class TFEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFMultiHeadSelfAttention(tf.keras.layers.Layer): +class TFMultiHeadSelfAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim - self.dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.dropout = keras.layers.Dropout(config.attention_dropout) self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}" - self.q_lin = tf.keras.layers.Dense( + self.q_lin = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" ) - self.k_lin = tf.keras.layers.Dense( + self.k_lin = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" ) - self.v_lin = tf.keras.layers.Dense( + self.v_lin = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" ) - self.out_lin = tf.keras.layers.Dense( + self.out_lin = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" ) @@ -236,14 +237,14 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): self.out_lin.build([None, None, self.config.dim]) -class TFFFN(tf.keras.layers.Layer): +class TFFFN(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.lin1 = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.dropout) + self.lin1 = keras.layers.Dense( config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" ) - self.lin2 = tf.keras.layers.Dense( + self.lin2 = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" ) self.activation = get_tf_activation(config.activation) @@ -268,14 +269,14 @@ class TFFFN(tf.keras.layers.Layer): self.lin2.build([None, None, self.config.hidden_dim]) -class TFTransformerBlock(tf.keras.layers.Layer): +class TFTransformerBlock(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim self.hidden_dim = config.hidden_dim - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation = config.activation self.output_attentions = config.output_attentions @@ -284,10 +285,10 @@ class TFTransformerBlock(tf.keras.layers.Layer): ), f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}" self.attention = TFMultiHeadSelfAttention(config, name="attention") - self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") + self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") self.ffn = TFFFN(config, name="ffn") - self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") + self.output_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") self.config = config def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None @@ -335,7 +336,7 @@ class TFTransformerBlock(tf.keras.layers.Layer): self.output_layer_norm.build([None, None, self.config.dim]) -class TFTransformer(tf.keras.layers.Layer): +class TFTransformer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_layers = config.n_layers @@ -400,7 +401,7 @@ class TFTransformer(tf.keras.layers.Layer): @keras_serializable -class TFDistilBertMainLayer(tf.keras.layers.Layer): +class TFDistilBertMainLayer(keras.layers.Layer): config_class = DistilBertConfig def __init__(self, config, **kwargs): @@ -503,7 +504,7 @@ DISTILBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -630,7 +631,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): self.distilbert.build(None) -class TFDistilBertLMHead(tf.keras.layers.Layer): +class TFDistilBertLMHead(keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) @@ -680,11 +681,11 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel self.config = config self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.vocab_transform = tf.keras.layers.Dense( + self.vocab_transform = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" ) self.act = get_tf_activation(config.activation) - self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") + self.vocab_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") def get_lm_head(self): @@ -779,16 +780,16 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.pre_classifier = tf.keras.layers.Dense( + self.pre_classifier = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), activation="relu", name="pre_classifier", ) - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) + self.dropout = keras.layers.Dropout(config.seq_classif_dropout) self.config = config @unpack_inputs @@ -873,8 +874,8 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -952,14 +953,14 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) - self.pre_classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.seq_classif_dropout) + self.pre_classifier = keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), activation="relu", name="pre_classifier", ) - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1061,11 +1062,11 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2" - self.dropout = tf.keras.layers.Dropout(config.qa_dropout) + self.dropout = keras.layers.Dropout(config.qa_dropout) self.config = config @unpack_inputs diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index db9aa6d227..0a6aa47640 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -23,7 +23,7 @@ from typing import Tuple, Union import tensorflow as tf from ...modeling_tf_outputs import TFBaseModelOutputWithPooling -from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, get_initializer, shape_list, unpack_inputs +from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, get_initializer, keras, shape_list, unpack_inputs from ...utils import ( ModelOutput, add_start_docstrings, @@ -147,7 +147,7 @@ class TFDPRReaderOutput(ModelOutput): attentions: Tuple[tf.Tensor, ...] | None = None -class TFDPREncoderLayer(tf.keras.layers.Layer): +class TFDPREncoderLayer(keras.layers.Layer): base_model_prefix = "bert_model" def __init__(self, config: DPRConfig, **kwargs): @@ -161,7 +161,7 @@ class TFDPREncoderLayer(tf.keras.layers.Layer): raise ValueError("Encoder hidden_size can't be zero") self.projection_dim = config.projection_dim if self.projection_dim > 0: - self.encode_proj = tf.keras.layers.Dense( + self.encode_proj = keras.layers.Dense( config.projection_dim, kernel_initializer=get_initializer(config.initializer_range), name="encode_proj" ) @@ -221,7 +221,7 @@ class TFDPREncoderLayer(tf.keras.layers.Layer): self.encode_proj.build(None) -class TFDPRSpanPredictorLayer(tf.keras.layers.Layer): +class TFDPRSpanPredictorLayer(keras.layers.Layer): base_model_prefix = "encoder" def __init__(self, config: DPRConfig, **kwargs): @@ -229,10 +229,10 @@ class TFDPRSpanPredictorLayer(tf.keras.layers.Layer): self.config = config self.encoder = TFDPREncoderLayer(config, name="encoder") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( 2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.qa_classifier = tf.keras.layers.Dense( + self.qa_classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="qa_classifier" ) @@ -409,7 +409,7 @@ TF_DPR_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a Tensorflow [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) + This model is also a Tensorflow [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py index 5730cd98fa..113eafb88d 100644 --- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py @@ -30,6 +30,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -64,7 +65,7 @@ TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFEfficientFormerPatchEmbeddings(tf.keras.layers.Layer): +class TFEfficientFormerPatchEmbeddings(keras.layers.Layer): """ This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels, height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride] @@ -76,8 +77,8 @@ class TFEfficientFormerPatchEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.num_channels = num_channels - self.padding = tf.keras.layers.ZeroPadding2D(padding=config.downsample_pad) - self.projection = tf.keras.layers.Conv2D( + self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad) + self.projection = keras.layers.Conv2D( filters=embed_dim, kernel_size=config.downsample_patch_size, strides=config.downsample_stride, @@ -86,7 +87,7 @@ class TFEfficientFormerPatchEmbeddings(tf.keras.layers.Layer): ) # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization self.norm = ( - tf.keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm") + keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm") if apply_norm else tf.identity ) @@ -114,7 +115,7 @@ class TFEfficientFormerPatchEmbeddings(tf.keras.layers.Layer): self.norm.build([None, None, None, self.embed_dim]) -class TFEfficientFormerSelfAttention(tf.keras.layers.Layer): +class TFEfficientFormerSelfAttention(keras.layers.Layer): def __init__( self, dim: int, @@ -136,10 +137,10 @@ class TFEfficientFormerSelfAttention(tf.keras.layers.Layer): self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads) hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2 - self.qkv = tf.keras.layers.Dense( + self.qkv = keras.layers.Dense( units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv" ) - self.projection = tf.keras.layers.Dense( + self.projection = keras.layers.Dense( units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" ) self.resolution = resolution @@ -161,7 +162,7 @@ class TFEfficientFormerSelfAttention(tf.keras.layers.Layer): self.attention_biases = self.add_weight( shape=(self.num_heads, len(attention_offsets)), - initializer=tf.keras.initializers.zeros(), + initializer=keras.initializers.zeros(), trainable=True, name="attention_biases", ) @@ -221,20 +222,20 @@ class TFEfficientFormerSelfAttention(tf.keras.layers.Layer): return outputs -class TFEfficientFormerConvStem(tf.keras.layers.Layer): +class TFEfficientFormerConvStem(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs): super().__init__(**kwargs) - self.padding = tf.keras.layers.ZeroPadding2D(padding=1) - self.convolution1 = tf.keras.layers.Conv2D( + self.padding = keras.layers.ZeroPadding2D(padding=1) + self.convolution1 = keras.layers.Conv2D( filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1" ) # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization - self.batchnorm_before = tf.keras.layers.BatchNormalization( + self.batchnorm_before = keras.layers.BatchNormalization( axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before" ) - self.convolution2 = tf.keras.layers.Conv2D( + self.convolution2 = keras.layers.Conv2D( filters=out_channels, kernel_size=3, strides=2, @@ -242,11 +243,11 @@ class TFEfficientFormerConvStem(tf.keras.layers.Layer): name="convolution2", ) # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization - self.batchnorm_after = tf.keras.layers.BatchNormalization( + self.batchnorm_after = keras.layers.BatchNormalization( axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" ) - self.activation = tf.keras.layers.Activation(activation=tf.keras.activations.relu, name="activation") + self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation") self.out_channels = out_channels self.config = config @@ -278,10 +279,10 @@ class TFEfficientFormerConvStem(tf.keras.layers.Layer): self.activation.build(None) -class TFEfficientFormerPooling(tf.keras.layers.Layer): +class TFEfficientFormerPooling(keras.layers.Layer): def __init__(self, pool_size: int, **kwargs): super().__init__(**kwargs) - self.pool = tf.keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same") + self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same") def call(self, hidden_states: tf.Tensor) -> tf.Tensor: output = self.pool(hidden_states) @@ -289,7 +290,7 @@ class TFEfficientFormerPooling(tf.keras.layers.Layer): return output -class TFEfficientFormerDenseMlp(tf.keras.layers.Layer): +class TFEfficientFormerDenseMlp(keras.layers.Layer): def __init__( self, config: EfficientFormerConfig, @@ -302,13 +303,13 @@ class TFEfficientFormerDenseMlp(tf.keras.layers.Layer): out_features = out_features or in_features hidden_features = hidden_features or in_features - self.linear_in = tf.keras.layers.Dense( + self.linear_in = keras.layers.Dense( units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in" ) self.activation = ACT2FN[config.hidden_act] - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.linear_out = tf.keras.layers.Dense( + self.linear_out = keras.layers.Dense( units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out" ) self.hidden_features = hidden_features @@ -335,7 +336,7 @@ class TFEfficientFormerDenseMlp(tf.keras.layers.Layer): self.linear_out.build([None, None, self.hidden_features]) -class TFEfficientFormerConvMlp(tf.keras.layers.Layer): +class TFEfficientFormerConvMlp(keras.layers.Layer): def __init__( self, config: EfficientFormerConfig, @@ -349,7 +350,7 @@ class TFEfficientFormerConvMlp(tf.keras.layers.Layer): out_features = out_features or in_features hidden_features = hidden_features or in_features - self.convolution1 = tf.keras.layers.Conv2D( + self.convolution1 = keras.layers.Conv2D( filters=hidden_features, kernel_size=1, name="convolution1", @@ -358,21 +359,21 @@ class TFEfficientFormerConvMlp(tf.keras.layers.Layer): self.activation = ACT2FN[config.hidden_act] - self.convolution2 = tf.keras.layers.Conv2D( + self.convolution2 = keras.layers.Conv2D( filters=out_features, kernel_size=1, name="convolution2", padding="valid", ) - self.dropout = tf.keras.layers.Dropout(rate=drop) + self.dropout = keras.layers.Dropout(rate=drop) # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization - self.batchnorm_before = tf.keras.layers.BatchNormalization( + self.batchnorm_before = keras.layers.BatchNormalization( axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before" ) # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization - self.batchnorm_after = tf.keras.layers.BatchNormalization( + self.batchnorm_after = keras.layers.BatchNormalization( axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" ) self.hidden_features = hidden_features @@ -408,7 +409,7 @@ class TFEfficientFormerConvMlp(tf.keras.layers.Layer): # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer -class TFEfficientFormerDropPath(tf.keras.layers.Layer): +class TFEfficientFormerDropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). References: (1) github.com:rwightman/pytorch-image-models @@ -428,7 +429,7 @@ class TFEfficientFormerDropPath(tf.keras.layers.Layer): return x -class TFEfficientFormerFlat(tf.keras.layers.Layer): +class TFEfficientFormerFlat(keras.layers.Layer): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -438,7 +439,7 @@ class TFEfficientFormerFlat(tf.keras.layers.Layer): return hidden_states -class TFEfficientFormerMeta3D(tf.keras.layers.Layer): +class TFEfficientFormerMeta3D(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): super().__init__(**kwargs) @@ -454,8 +455,8 @@ class TFEfficientFormerMeta3D(tf.keras.layers.Layer): self.dim = dim self.config = config - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1") - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2") + self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1") + self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2") mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp") @@ -463,7 +464,7 @@ class TFEfficientFormerMeta3D(tf.keras.layers.Layer): self.drop_path = ( TFEfficientFormerDropPath(drop_path) if drop_path > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") + else keras.layers.Activation("linear", name="drop_path") ) self.config = config @@ -474,13 +475,13 @@ class TFEfficientFormerMeta3D(tf.keras.layers.Layer): if self.config.use_layer_scale: self.layer_scale_1 = self.add_weight( shape=(self.dim,), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), trainable=True, name="layer_scale_1", ) self.layer_scale_2 = self.add_weight( shape=(self.dim,), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), trainable=True, name="layer_scale_2", ) @@ -538,7 +539,7 @@ class TFEfficientFormerMeta3D(tf.keras.layers.Layer): return outputs -class TFEfficientFormerMeta3DLayers(tf.keras.layers.Layer): +class TFEfficientFormerMeta3DLayers(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, **kwargs): super().__init__(**kwargs) drop_paths = [ @@ -581,7 +582,7 @@ class TFEfficientFormerMeta3DLayers(tf.keras.layers.Layer): layer.build(None) -class TFEfficientFormerMeta4D(tf.keras.layers.Layer): +class TFEfficientFormerMeta4D(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): super().__init__(**kwargs) pool_size = config.pool_size if config.pool_size is not None else 3 @@ -595,7 +596,7 @@ class TFEfficientFormerMeta4D(tf.keras.layers.Layer): self.drop_path = ( TFEfficientFormerDropPath(drop_path, name="drop_path") if drop_path > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") + else keras.layers.Activation("linear", name="drop_path") ) self.config = config @@ -606,13 +607,13 @@ class TFEfficientFormerMeta4D(tf.keras.layers.Layer): if self.config.use_layer_scale: self.layer_scale_1 = self.add_weight( shape=(self.dim), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), trainable=True, name="layer_scale_1", ) self.layer_scale_2 = self.add_weight( shape=(self.dim), - initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), trainable=True, name="layer_scale_2", ) @@ -654,7 +655,7 @@ class TFEfficientFormerMeta4D(tf.keras.layers.Layer): return layer_output -class TFEfficientFormerMeta4DLayers(tf.keras.layers.Layer): +class TFEfficientFormerMeta4DLayers(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs): super().__init__(**kwargs) num_layers = ( @@ -686,7 +687,7 @@ class TFEfficientFormerMeta4DLayers(tf.keras.layers.Layer): layer.build(None) -class TFEfficientFormerIntermediateStage(tf.keras.layers.Layer): +class TFEfficientFormerIntermediateStage(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, index: int, **kwargs): super().__init__(**kwargs) self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers") @@ -704,7 +705,7 @@ class TFEfficientFormerIntermediateStage(tf.keras.layers.Layer): self.meta4D_layers.build(None) -class TFEfficientFormerLastStage(tf.keras.layers.Layer): +class TFEfficientFormerLastStage(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, **kwargs): super().__init__(**kwargs) self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers") @@ -737,7 +738,7 @@ class TFEfficientFormerLastStage(tf.keras.layers.Layer): self.meta3D_layers.build(None) -class TFEfficientFormerEncoder(tf.keras.layers.Layer): +class TFEfficientFormerEncoder(keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, **kwargs): super().__init__(**kwargs) @@ -818,7 +819,7 @@ class TFEfficientFormerEncoder(tf.keras.layers.Layer): @keras_serializable -class TFEfficientFormerMainLayer(tf.keras.layers.Layer): +class TFEfficientFormerMainLayer(keras.layers.Layer): config_class = EfficientFormerConfig def __init__(self, config: EfficientFormerConfig, **kwargs) -> None: @@ -827,7 +828,7 @@ class TFEfficientFormerMainLayer(tf.keras.layers.Layer): self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed") self.encoder = TFEfficientFormerEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") @unpack_inputs def call( @@ -848,7 +849,7 @@ class TFEfficientFormerMainLayer(tf.keras.layers.Layer): if pixel_values is None: raise ValueError("You have to specify pixel_values") - # When running on CPU, tf.keras.layers.Conv2D and tf.keras.layers.AveragePool2D do not + # When running on CPU, keras.layers.Conv2D and keras.layers.AveragePool2D do not # support channels first NCHW format. A number of blocks contain both. # So change the input format from (batch_size, num_channels, height, width) to # (batch_size, height, width, num_channels) here. @@ -914,7 +915,7 @@ class TFEfficientFormerPreTrainedModel(TFPreTrainedModel): EFFICIENTFORMER_START_DOCSTRING = r""" This model is a TensorFlow - [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior. @@ -1001,9 +1002,9 @@ class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, # Classifier head self.classifier = ( - tf.keras.layers.Dense(config.num_labels, name="classifier") + keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="classifier") + else keras.layers.Activation("linear", name="classifier") ) self.config = config @@ -1119,14 +1120,14 @@ class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTra # Classifier heads self.classifier = ( - tf.keras.layers.Dense(config.num_labels, name="classifier") + keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="classifier") + else keras.layers.Activation("linear", name="classifier") ) self.distillation_classifier = ( - tf.keras.layers.Dense(config.num_labels, name="distillation_classifier") + keras.layers.Dense(config.num_labels, name="distillation_classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="distillation_classifier") + else keras.layers.Activation("linear", name="distillation_classifier") ) @unpack_inputs diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index ecbbd5ad8f..b0c8b4fa28 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -44,6 +44,7 @@ from ...modeling_tf_utils import ( TFSequenceSummary, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -76,7 +77,7 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra -class TFElectraSelfAttention(tf.keras.layers.Layer): +class TFElectraSelfAttention(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) @@ -91,16 +92,16 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -209,15 +210,15 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra -class TFElectraSelfOutput(tf.keras.layers.Layer): +class TFElectraSelfOutput(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -240,7 +241,7 @@ class TFElectraSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra -class TFElectraAttention(tf.keras.layers.Layer): +class TFElectraAttention(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) @@ -292,11 +293,11 @@ class TFElectraAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra -class TFElectraIntermediate(tf.keras.layers.Layer): +class TFElectraIntermediate(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -322,15 +323,15 @@ class TFElectraIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra -class TFElectraOutput(tf.keras.layers.Layer): +class TFElectraOutput(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -353,7 +354,7 @@ class TFElectraOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra -class TFElectraLayer(tf.keras.layers.Layer): +class TFElectraLayer(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) @@ -457,7 +458,7 @@ class TFElectraLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra -class TFElectraEncoder(tf.keras.layers.Layer): +class TFElectraEncoder(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -536,11 +537,11 @@ class TFElectraEncoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra -class TFElectraPooler(tf.keras.layers.Layer): +class TFElectraPooler(keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -566,7 +567,7 @@ class TFElectraPooler(tf.keras.layers.Layer): # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra -class TFElectraEmbeddings(tf.keras.layers.Layer): +class TFElectraEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: ElectraConfig, **kwargs): @@ -576,8 +577,8 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): self.embedding_size = config.embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -650,12 +651,12 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): +class TFElectraDiscriminatorPredictions(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") - self.dense_prediction = tf.keras.layers.Dense(1, name="dense_prediction") + self.dense = keras.layers.Dense(config.hidden_size, name="dense") + self.dense_prediction = keras.layers.Dense(1, name="dense_prediction") self.config = config def call(self, discriminator_hidden_states, training=False): @@ -677,12 +678,12 @@ class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): self.dense_prediction.build([None, None, self.config.hidden_size]) -class TFElectraGeneratorPredictions(tf.keras.layers.Layer): +class TFElectraGeneratorPredictions(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.embedding_size, name="dense") self.config = config def call(self, generator_hidden_states, training=False): @@ -718,7 +719,7 @@ class TFElectraPreTrainedModel(TFPreTrainedModel): @keras_serializable -class TFElectraMainLayer(tf.keras.layers.Layer): +class TFElectraMainLayer(keras.layers.Layer): config_class = ElectraConfig def __init__(self, config, **kwargs): @@ -730,7 +731,7 @@ class TFElectraMainLayer(tf.keras.layers.Layer): self.embeddings = TFElectraEmbeddings(config, name="embeddings") if config.embedding_size != config.hidden_size: - self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") + self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project") self.encoder = TFElectraEncoder(config, name="encoder") @@ -952,7 +953,7 @@ ELECTRA_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1205,7 +1206,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): self.discriminator_predictions.build(None) -class TFElectraMaskedLMHead(tf.keras.layers.Layer): +class TFElectraMaskedLMHead(keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) @@ -1347,13 +1348,13 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos self.generator_lm_head.build(None) -class TFElectraClassificationHead(tf.keras.layers.Layer): +class TFElectraClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) classifier_dropout = ( @@ -1361,8 +1362,8 @@ class TFElectraClassificationHead(tf.keras.layers.Layer): if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -1486,7 +1487,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) self.sequence_summary = TFSequenceSummary( config, initializer_range=config.initializer_range, name="sequence_summary" ) - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1594,8 +1595,8 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1681,7 +1682,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin self.num_labels = config.num_labels self.electra = TFElectraMainLayer(config, name="electra") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index 86c9c28b03..b4b2503bd0 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -32,6 +32,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, unpack_inputs, ) from ...tf_utils import shape_list @@ -77,7 +78,7 @@ ENCODER_DECODER_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -258,7 +259,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): self.encoder.config.hidden_size != self.decoder.config.hidden_size and self.decoder.config.cross_attention_hidden_size is None ): - self.enc_to_dec_proj = tf.keras.layers.Dense( + self.enc_to_dec_proj = keras.layers.Dense( units=self.decoder.config.hidden_size, kernel_initializer=get_initializer(config.encoder.initializer_range), name="enc_to_dec_proj", @@ -445,7 +446,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) - # Make sure these 2 `tf.keras.Model` have fixed names so `from_pretrained` could load model weights correctly. + # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly. if encoder.name != "encoder": raise ValueError("encoder model must be created with the name `encoder`.") if decoder.name != "decoder": diff --git a/src/transformers/models/esm/modeling_tf_esm.py b/src/transformers/models/esm/modeling_tf_esm.py index 38229167b3..2c780b4bdd 100644 --- a/src/transformers/models/esm/modeling_tf_esm.py +++ b/src/transformers/models/esm/modeling_tf_esm.py @@ -22,8 +22,6 @@ from typing import Optional, Tuple, Union import numpy as np import tensorflow as tf -from tensorflow.keras.activations import gelu -from tensorflow.keras.layers import Dense, Dropout, Embedding, Layer, LayerNormalization from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from ...modeling_tf_outputs import ( @@ -40,6 +38,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, shape_list, unpack_inputs, ) @@ -90,7 +89,7 @@ def average_product_correct(x): return normalized -class TFRotaryEmbedding(Layer): +class TFRotaryEmbedding(keras.layers.Layer): """ Rotary position embeddings based on those in [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation @@ -134,7 +133,7 @@ class TFRotaryEmbedding(Layer): ) -class TFEsmContactPredictionHead(Layer): +class TFEsmContactPredictionHead(keras.layers.Layer): """Performs symmetrization, apc, and computes a logistic regression on the output features""" def __init__( @@ -147,7 +146,7 @@ class TFEsmContactPredictionHead(Layer): super().__init__(name=name) self.eos_idx = eos_idx self.in_features = in_features - self.regression = Dense(1, use_bias=bias, activation="sigmoid", name="regression") + self.regression = keras.layers.Dense(1, use_bias=bias, activation="sigmoid", name="regression") def build(self, input_shape=None): if self.built: @@ -174,20 +173,20 @@ class TFEsmContactPredictionHead(Layer): return tf.squeeze(self.regression(attentions), 3) -class TFEsmEmbeddings(Layer): +class TFEsmEmbeddings(keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ def __init__(self, config, name=None): super().__init__(name=name) - self.word_embeddings = Embedding( + self.word_embeddings = keras.layers.Embedding( config.vocab_size, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), name="word_embeddings", ) - self.position_embeddings = Embedding( + self.position_embeddings = keras.layers.Embedding( config.max_position_embeddings, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), @@ -195,7 +194,7 @@ class TFEsmEmbeddings(Layer): ) if config.emb_layer_norm_before: - self.layer_norm = LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") else: self.layer_norm = None # Matt: I think this line was copied incorrectly from BERT, disabling for now @@ -286,7 +285,7 @@ class TFEsmEmbeddings(Layer): self.layer_norm.build([None, None, self.config.hidden_size]) -class TFEsmSelfAttention(Layer): +class TFEsmSelfAttention(keras.layers.Layer): def __init__(self, config, position_embedding_type=None, name=None): super().__init__(name=name) if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -299,22 +298,24 @@ class TFEsmSelfAttention(Layer): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = Dense(self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key") - self.value = Dense( + self.key = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.position_embedding_type = position_embedding_type or getattr( config, "position_embedding_type", "absolute" ) self.rotary_embeddings = None if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = Embedding( + self.distance_embedding = keras.layers.Embedding( 2 * config.max_position_embeddings - 1, self.attention_head_size, embeddings_initializer=get_initializer(config.initializer_range), @@ -451,13 +452,13 @@ class TFEsmSelfAttention(Layer): self.rotary_embeddings.build(None) -class TFEsmSelfOutput(Layer): +class TFEsmSelfOutput(keras.layers.Layer): def __init__(self, config, name=None): super().__init__(name=name) - self.dense = Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, input_tensor, training=False): @@ -475,13 +476,13 @@ class TFEsmSelfOutput(Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFEsmAttention(Layer): +class TFEsmAttention(keras.layers.Layer): def __init__(self, config, name=None): super().__init__(name=name) self.self = TFEsmSelfAttention(config, name="self") self.output_layer = TFEsmSelfOutput(config, name="output") self.pruned_heads = set() - self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def prune_heads(self, heads): @@ -528,11 +529,11 @@ class TFEsmAttention(Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFEsmIntermediate(tf.keras.layers.Layer): +class TFEsmIntermediate(keras.layers.Layer): def __init__(self, config: EsmConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -553,13 +554,13 @@ class TFEsmIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFEsmOutput(Layer): +class TFEsmOutput(keras.layers.Layer): def __init__(self, config, name=None): super().__init__(name=name) - self.dense = Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, input_tensor, training=False): @@ -577,7 +578,7 @@ class TFEsmOutput(Layer): self.dense.build([None, None, self.config.intermediate_size]) -class TFEsmLayer(Layer): +class TFEsmLayer(keras.layers.Layer): def __init__(self, config, name=None): super().__init__(name=name) self.chunk_size_feed_forward = config.chunk_size_feed_forward @@ -591,7 +592,7 @@ class TFEsmLayer(Layer): self.crossattention = TFEsmAttention(config) self.intermediate = TFEsmIntermediate(config, name="intermediate") self.output_layer = TFEsmOutput(config, name="output") - self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call( @@ -682,12 +683,14 @@ class TFEsmLayer(Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFEsmEncoder(Layer): +class TFEsmEncoder(keras.layers.Layer): def __init__(self, config, name=None): super().__init__(name=name) self.config = config self.layer = [TFEsmLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] - self.emb_layer_norm_after = LayerNormalization(epsilon=config.layer_norm_eps, name="emb_layer_norm_after") + self.emb_layer_norm_after = keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="emb_layer_norm_after" + ) def call( self, @@ -774,11 +777,11 @@ class TFEsmEncoder(Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Esm -class TFEsmPooler(tf.keras.layers.Layer): +class TFEsmPooler(keras.layers.Layer): def __init__(self, config: EsmConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -874,7 +877,7 @@ ESM_INPUTS_DOCSTRING = r""" "The bare ESM Model transformer outputting raw hidden-states without any specific head on top.", ESM_START_DOCSTRING, ) -class TFEsmMainLayer(Layer): +class TFEsmMainLayer(keras.layers.Layer): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of @@ -1288,20 +1291,20 @@ class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss): self.lm_head.build(None) -class TFEsmLMHead(Layer): +class TFEsmLMHead(keras.layers.Layer): """ESM Head for masked language modeling.""" def __init__(self, config, name=None): super().__init__(name=name) - self.dense = Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") if config.tie_word_embeddings: self.decoder = None else: - self.decoder = Dense( + self.decoder = keras.layers.Dense( config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="decoder", @@ -1331,7 +1334,7 @@ class TFEsmLMHead(Layer): def call(self, features): x = self.dense(features) - x = gelu(x) + x = tf.nn.gelu(x) x = self.layer_norm(x) # project back to size of vocabulary with bias @@ -1443,8 +1446,8 @@ class TFEsmForTokenClassification(TFEsmPreTrainedModel, TFTokenClassificationLos self.num_labels = config.num_labels self.esm = TFEsmMainLayer(config, add_pooling_layer=False, name="esm") - self.dropout = Dropout(config.hidden_dropout_prob) - self.classifier = Dense(config.num_labels, name="classifier") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense(config.num_labels, name="classifier") self.config = config @unpack_inputs @@ -1515,19 +1518,19 @@ class TFEsmForTokenClassification(TFEsmPreTrainedModel, TFTokenClassificationLos self.classifier.build([None, None, self.config.hidden_size]) -class TFEsmClassificationHead(Layer): +class TFEsmClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, name=None): super().__init__(name=name) - self.dense = Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - self.dropout = Dropout(config.hidden_dropout_prob) - self.out_proj = Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), activation="linear", diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index 1a4d307701..23f66e56a9 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -46,6 +46,7 @@ from ...modeling_tf_utils import ( TFSharedEmbeddings, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -76,7 +77,7 @@ FLAUBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -300,7 +301,7 @@ class TFFlaubertModel(TFFlaubertPreTrainedModel): # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert -class TFFlaubertMultiHeadAttention(tf.keras.layers.Layer): +class TFFlaubertMultiHeadAttention(keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config, **kwargs): @@ -311,11 +312,11 @@ class TFFlaubertMultiHeadAttention(tf.keras.layers.Layer): self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") - self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") - self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") - self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") - self.dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") + self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") + self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") + self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") + self.dropout = keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() self.dim = dim @@ -411,14 +412,14 @@ class TFFlaubertMultiHeadAttention(tf.keras.layers.Layer): # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMTransformerFFN -class TFFlaubertTransformerFFN(tf.keras.layers.Layer): +class TFFlaubertTransformerFFN(keras.layers.Layer): def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super().__init__(**kwargs) - self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") - self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") + self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") + self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.in_dim = in_dim self.dim_hidden = dim_hidden @@ -443,7 +444,7 @@ class TFFlaubertTransformerFFN(tf.keras.layers.Layer): @keras_serializable -class TFFlaubertMainLayer(tf.keras.layers.Layer): +class TFFlaubertMainLayer(keras.layers.Layer): config_class = FlaubertConfig def __init__(self, config, **kwargs): @@ -466,11 +467,11 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer): self.return_dict = config.use_return_dict self.max_position_embeddings = config.max_position_embeddings self.embed_init_std = config.embed_init_std - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.embeddings = TFSharedEmbeddings( self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" ) - self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") + self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") self.attentions = [] self.layer_norm1 = [] self.ffns = [] @@ -481,7 +482,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer): TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}") ) self.layer_norm1.append( - tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}") + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}") ) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) @@ -490,7 +491,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer): TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}") ) self.layer_norm2.append( - tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") ) def build(self, input_shape=None): @@ -739,7 +740,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer): # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMPredLayer -class TFFlaubertPredLayer(tf.keras.layers.Layer): +class TFFlaubertPredLayer(keras.layers.Layer): """ Prediction layer (cross_entropy or adaptive_softmax). """ @@ -1014,7 +1015,7 @@ class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestion def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) self.config = config @@ -1120,8 +1121,8 @@ class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassif self.num_labels = config.num_labels self.transformer = TFFlaubertMainLayer(config, name="transformer") - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" ) self.config = config @@ -1213,7 +1214,7 @@ class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLos self.transformer = TFFlaubertMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") - self.logits_proj = tf.keras.layers.Dense( + self.logits_proj = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) self.config = config diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index 18f3043afb..4e4a544523 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -42,6 +42,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -77,7 +78,7 @@ TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [ INF = 1e6 -class TFFunnelEmbeddings(tf.keras.layers.Layer): +class TFFunnelEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): @@ -87,8 +88,8 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.initializer_std = 1.0 if config.initializer_std is None else config.initializer_std - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -141,8 +142,8 @@ class TFFunnelAttentionStructure: self.pool_q_only = config.pool_q_only self.pooling_type = config.pooling_type - self.sin_dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.cos_dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.sin_dropout = keras.layers.Dropout(config.hidden_dropout) + self.cos_dropout = keras.layers.Dropout(config.hidden_dropout) # Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was # divided. self.pooling_mult = None @@ -387,7 +388,7 @@ def _relative_shift_gather(positional_attn, context_len, shift): return positional_attn -class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer): +class TFFunnelRelMultiheadAttention(keras.layers.Layer): def __init__(self, config, block_index, **kwargs): super().__init__(**kwargs) self.attention_type = config.attention_type @@ -397,19 +398,19 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer): self.initializer_range = config.initializer_range self.block_index = block_index - self.hidden_dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.hidden_dropout = keras.layers.Dropout(config.hidden_dropout) + self.attention_dropout = keras.layers.Dropout(config.attention_dropout) initializer = get_initializer(config.initializer_range) - self.q_head = tf.keras.layers.Dense( + self.q_head = keras.layers.Dense( n_head * d_head, use_bias=False, kernel_initializer=initializer, name="q_head" ) - self.k_head = tf.keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="k_head") - self.v_head = tf.keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="v_head") + self.k_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="k_head") + self.v_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="v_head") - self.post_proj = tf.keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.post_proj = keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.scale = 1.0 / (d_head**0.5) def build(self, input_shape=None): @@ -570,16 +571,16 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer): return (output, attn_prob) if output_attentions else (output,) -class TFFunnelPositionwiseFFN(tf.keras.layers.Layer): +class TFFunnelPositionwiseFFN(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) initializer = get_initializer(config.initializer_range) - self.linear_1 = tf.keras.layers.Dense(config.d_inner, kernel_initializer=initializer, name="linear_1") + self.linear_1 = keras.layers.Dense(config.d_inner, kernel_initializer=initializer, name="linear_1") self.activation_function = get_tf_activation(config.hidden_act) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.linear_2 = tf.keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="linear_2") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.linear_2 = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="linear_2") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.config = config def call(self, hidden, training=False): @@ -605,7 +606,7 @@ class TFFunnelPositionwiseFFN(tf.keras.layers.Layer): self.layer_norm.build([None, None, self.config.d_model]) -class TFFunnelLayer(tf.keras.layers.Layer): +class TFFunnelLayer(keras.layers.Layer): def __init__(self, config, block_index, **kwargs): super().__init__(**kwargs) self.attention = TFFunnelRelMultiheadAttention(config, block_index, name="attention") @@ -630,7 +631,7 @@ class TFFunnelLayer(tf.keras.layers.Layer): self.ffn.build(None) -class TFFunnelEncoder(tf.keras.layers.Layer): +class TFFunnelEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.separate_cls = config.separate_cls @@ -729,7 +730,7 @@ def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False): return output -class TFFunnelDecoder(tf.keras.layers.Layer): +class TFFunnelDecoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.separate_cls = config.separate_cls @@ -794,7 +795,7 @@ class TFFunnelDecoder(tf.keras.layers.Layer): @keras_serializable -class TFFunnelBaseLayer(tf.keras.layers.Layer): +class TFFunnelBaseLayer(keras.layers.Layer): """Base model without decoder""" config_class = FunnelConfig @@ -875,7 +876,7 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer): @keras_serializable -class TFFunnelMainLayer(tf.keras.layers.Layer): +class TFFunnelMainLayer(keras.layers.Layer): """Base model with decoder""" config_class = FunnelConfig @@ -988,15 +989,15 @@ class TFFunnelMainLayer(tf.keras.layers.Layer): self.decoder.build(None) -class TFFunnelDiscriminatorPredictions(tf.keras.layers.Layer): +class TFFunnelDiscriminatorPredictions(keras.layers.Layer): """Prediction module for the discriminator, made up of two dense layers.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) initializer = get_initializer(config.initializer_range) - self.dense = tf.keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense") + self.dense = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense") self.activation_function = get_tf_activation(config.hidden_act) - self.dense_prediction = tf.keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction") + self.dense_prediction = keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction") self.config = config def call(self, discriminator_hidden_states): @@ -1017,7 +1018,7 @@ class TFFunnelDiscriminatorPredictions(tf.keras.layers.Layer): self.dense_prediction.build([None, None, self.config.d_model]) -class TFFunnelMaskedLMHead(tf.keras.layers.Layer): +class TFFunnelMaskedLMHead(keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.config = config @@ -1053,20 +1054,18 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer): return hidden_states -class TFFunnelClassificationHead(tf.keras.layers.Layer): +class TFFunnelClassificationHead(keras.layers.Layer): def __init__(self, config, n_labels, **kwargs): super().__init__(**kwargs) initializer = get_initializer(config.initializer_range) - self.linear_hidden = tf.keras.layers.Dense( - config.d_model, kernel_initializer=initializer, name="linear_hidden" - ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.linear_out = tf.keras.layers.Dense(n_labels, kernel_initializer=initializer, name="linear_out") + self.linear_hidden = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="linear_hidden") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.linear_out = keras.layers.Dense(n_labels, kernel_initializer=initializer, name="linear_out") self.config = config def call(self, hidden, training=False): hidden = self.linear_hidden(hidden) - hidden = tf.keras.activations.tanh(hidden) + hidden = keras.activations.tanh(hidden) hidden = self.dropout(hidden, training=training) return self.linear_out(hidden) @@ -1132,7 +1131,7 @@ FUNNEL_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1700,8 +1699,8 @@ class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificat self.num_labels = config.num_labels self.funnel = TFFunnelMainLayer(config, name="funnel") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1789,7 +1788,7 @@ class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringL self.num_labels = config.num_labels self.funnel = TFFunnelMainLayer(config, name="funnel") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index 50c2dd54f4..fd40df97dd 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -37,6 +37,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFSequenceSummary, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -67,7 +68,7 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFAttention(tf.keras.layers.Layer): +class TFAttention(keras.layers.Layer): def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs): super().__init__(**kwargs) @@ -88,8 +89,8 @@ class TFAttention(tf.keras.layers.Layer): self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") - self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) - self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.attn_dropout = keras.layers.Dropout(config.attn_pdrop) + self.resid_dropout = keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() self.embed_dim = n_state @@ -222,14 +223,14 @@ class TFAttention(tf.keras.layers.Layer): self.q_attn.build([None, None, self.embed_dim]) -class TFMLP(tf.keras.layers.Layer): +class TFMLP(keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = get_tf_activation(config.activation_function) - self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.dropout = keras.layers.Dropout(config.resid_pdrop) self.intermediate_size = n_state self.embed_dim = nx @@ -251,18 +252,18 @@ class TFMLP(tf.keras.layers.Layer): self.c_proj.build([None, None, self.embed_dim]) -class TFBlock(tf.keras.layers.Layer): +class TFBlock(keras.layers.Layer): def __init__(self, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd inner_dim = config.n_inner if config.n_inner is not None else 4 * nx - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFAttention(nx, config, scale, name="attn") - self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") + self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") if config.add_cross_attention: self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True) - self.ln_cross_attn = tf.keras.layers.LayerNormalization( + self.ln_cross_attn = keras.layers.LayerNormalization( epsilon=config.layer_norm_epsilon, name="ln_cross_attn" ) @@ -354,7 +355,7 @@ class TFBlock(tf.keras.layers.Layer): @keras_serializable -class TFGPT2MainLayer(tf.keras.layers.Layer): +class TFGPT2MainLayer(keras.layers.Layer): config_class = GPT2Config def __init__(self, config, *inputs, **kwargs): @@ -371,21 +372,21 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): self.n_positions = config.n_positions self.initializer_range = config.initializer_range - self.wte = tf.keras.layers.Embedding( + self.wte = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), name="wte", ) - self.wpe = tf.keras.layers.Embedding( + self.wpe = keras.layers.Embedding( input_dim=config.n_positions, output_dim=config.n_embd, embeddings_initializer=get_initializer(config.initializer_range), name="wpe", ) - self.drop = tf.keras.layers.Dropout(config.embd_pdrop) + self.drop = keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] - self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") + self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") self.embed_dim = config.hidden_size def get_input_embeddings(self): @@ -649,7 +650,7 @@ GPT2_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1134,7 +1135,7 @@ class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassific def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.score = tf.keras.layers.Dense( + self.score = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="score", diff --git a/src/transformers/models/gpt2/tokenization_gpt2_tf.py b/src/transformers/models/gpt2/tokenization_gpt2_tf.py index 4ab4af5b9d..41f0874919 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_tf.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_tf.py @@ -5,10 +5,11 @@ import tensorflow as tf from keras_nlp.tokenizers import BytePairTokenizer from tensorflow_text import pad_model_inputs +from ...modeling_tf_utils import keras from .tokenization_gpt2 import GPT2Tokenizer -class TFGPT2Tokenizer(tf.keras.layers.Layer): +class TFGPT2Tokenizer(keras.layers.Layer): """ This is an in-graph tokenizer for GPT2. It should be initialized similarly to other tokenizers, using the `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index af05f9119d..d948fc63c0 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -41,6 +41,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFSharedEmbeddings, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -82,7 +83,7 @@ def apply_rotary_pos_emb(tensor: tf.Tensor, sincos: tf.Tensor) -> tf.Tensor: return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos) -class TFGPTJAttention(tf.keras.layers.Layer): +class TFGPTJAttention(keras.layers.Layer): def __init__(self, config: GPTJConfig, **kwargs): super().__init__(**kwargs) @@ -97,28 +98,28 @@ class TFGPTJAttention(tf.keras.layers.Layer): self.scale_attn = self.head_dim**0.5 self.rotary_dim = config.rotary_dim - self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) - self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.attn_dropout = keras.layers.Dropout(config.attn_pdrop) + self.resid_dropout = keras.layers.Dropout(config.resid_pdrop) - self.q_proj = tf.keras.layers.Dense( + self.q_proj = keras.layers.Dense( self.embed_dim, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), name="q_proj", ) - self.k_proj = tf.keras.layers.Dense( + self.k_proj = keras.layers.Dense( self.embed_dim, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), name="k_proj", ) - self.v_proj = tf.keras.layers.Dense( + self.v_proj = keras.layers.Dense( self.embed_dim, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), name="v_proj", ) - self.out_proj = tf.keras.layers.Dense( + self.out_proj = keras.layers.Dense( self.embed_dim, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), @@ -285,20 +286,20 @@ class TFGPTJAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFGPTJMLP(tf.keras.layers.Layer): +class TFGPTJMLP(keras.layers.Layer): def __init__(self, intermediate_size: int, config: GPTJConfig, **kwargs): super().__init__(**kwargs) embed_dim = config.n_embd - self.fc_in = tf.keras.layers.Dense( + self.fc_in = keras.layers.Dense( intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="fc_in" ) - self.fc_out = tf.keras.layers.Dense( + self.fc_out = keras.layers.Dense( embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="fc_out" ) self.act = get_tf_activation(config.activation_function) - self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) + self.dropout = keras.layers.Dropout(config.embd_pdrop) self.embed_dim = config.n_embd self.intermediate_size = intermediate_size @@ -321,11 +322,11 @@ class TFGPTJMLP(tf.keras.layers.Layer): self.fc_out.build([None, None, self.intermediate_size]) -class TFGPTJBlock(tf.keras.layers.Layer): +class TFGPTJBlock(keras.layers.Layer): def __init__(self, config: GPTJConfig, **kwargs): super().__init__(**kwargs) inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFGPTJAttention(config, name="attn") self.mlp = TFGPTJMLP(inner_dim, config, name="mlp") self.config = config @@ -379,7 +380,7 @@ class TFGPTJBlock(tf.keras.layers.Layer): @keras_serializable -class TFGPTJMainLayer(tf.keras.layers.Layer): +class TFGPTJMainLayer(keras.layers.Layer): config_class = GPTJConfig def __init__(self, config: GPTJConfig, *inputs, **kwargs): @@ -399,9 +400,9 @@ class TFGPTJMainLayer(tf.keras.layers.Layer): self.wte = TFSharedEmbeddings( config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" ) - self.drop = tf.keras.layers.Dropout(config.embd_pdrop) + self.drop = keras.layers.Dropout(config.embd_pdrop) self.h = [TFGPTJBlock(config, name=f"h_._{i}") for i in range(config.n_layer)] - self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") + self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") self.embed_dim = config.n_embd def get_input_embeddings(self): @@ -580,7 +581,7 @@ GPTJ_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -752,7 +753,7 @@ class TFGPTJForCausalLM(TFGPTJPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPTJMainLayer(config, name="transformer") - self.lm_head = tf.keras.layers.Dense( + self.lm_head = keras.layers.Dense( config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="lm_head" ) self.config = config @@ -888,7 +889,7 @@ class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassific super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFGPTJMainLayer(config, name="transformer") - self.score = tf.keras.layers.Dense( + self.score = keras.layers.Dense( self.num_labels, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), @@ -1014,7 +1015,7 @@ class TFGPTJForQuestionAnswering(TFGPTJPreTrainedModel, TFQuestionAnsweringLoss) super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFGPTJMainLayer(config, name="transformer") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( self.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index 7620c08cab..d04f9afb7d 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -92,7 +93,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: return tf.math.reduce_mean( - tf.keras.metrics.sparse_categorical_crossentropy( + keras.metrics.sparse_categorical_crossentropy( y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True ) ) @@ -264,13 +265,13 @@ class TFGroupViTModelOutput(ModelOutput): ) -class TFGroupViTCrossAttentionLayer(tf.keras.layers.Layer): +class TFGroupViTCrossAttentionLayer(keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, **kwargs): super().__init__(**kwargs) self.attn = TFGroupViTAttention(config, name="attn") - self.norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2") + self.norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2") self.mlp = TFGroupViTMLP(config, name="mlp") - self.norm_post = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post") + self.norm_post = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post") self.config = config def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -298,15 +299,15 @@ class TFGroupViTCrossAttentionLayer(tf.keras.layers.Layer): self.norm_post.build([None, None, self.config.hidden_size]) -class TFGroupViTAssignAttention(tf.keras.layers.Layer): +class TFGroupViTAssignAttention(keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, **kwargs): super().__init__(**kwargs) self.scale = config.hidden_size**-0.5 - self.q_proj = tf.keras.layers.Dense(config.hidden_size, name="q_proj") - self.k_proj = tf.keras.layers.Dense(config.hidden_size, name="k_proj") - self.v_proj = tf.keras.layers.Dense(config.hidden_size, name="v_proj") - self.proj = tf.keras.layers.Dense(config.hidden_size, name="proj") + self.q_proj = keras.layers.Dense(config.hidden_size, name="q_proj") + self.k_proj = keras.layers.Dense(config.hidden_size, name="k_proj") + self.v_proj = keras.layers.Dense(config.hidden_size, name="v_proj") + self.proj = keras.layers.Dense(config.hidden_size, name="proj") self.assign_eps = config.assign_eps self.config = config @@ -364,12 +365,12 @@ class TFGroupViTAssignAttention(tf.keras.layers.Layer): self.proj.build([None, None, self.config.hidden_size]) -class TFGroupViTTokenAssign(tf.keras.layers.Layer): +class TFGroupViTTokenAssign(keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_output_group: int, **kwargs): super().__init__(**kwargs) self.num_output_group = num_output_group # norm on group_tokens - self.norm_tokens = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_tokens") + self.norm_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_tokens") assign_mlp_ratio = ( config.assign_mlp_ratio if isinstance(config.assign_mlp_ratio, collections.abc.Iterable) @@ -377,15 +378,13 @@ class TFGroupViTTokenAssign(tf.keras.layers.Layer): ) tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio] self.mlp_inter = TFGroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group, name="mlp_inter") - self.norm_post_tokens = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="norm_post_tokens" - ) + self.norm_post_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post_tokens") # norm on x - self.norm_x = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_x") + self.norm_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_x") self.pre_assign_attn = TFGroupViTCrossAttentionLayer(config, name="pre_assign_attn") self.assign = TFGroupViTAssignAttention(config, name="assign") - self.norm_new_x = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_new_x") + self.norm_new_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_new_x") self.mlp_channels = TFGroupViTMLP( config, config.hidden_size, channels_dim, config.hidden_size, name="mlp_channels" ) @@ -454,7 +453,7 @@ class TFGroupViTTokenAssign(tf.keras.layers.Layer): # Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT -class TFGroupViTPatchEmbeddings(tf.keras.layers.Layer): +class TFGroupViTPatchEmbeddings(keras.layers.Layer): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -477,7 +476,7 @@ class TFGroupViTPatchEmbeddings(tf.keras.layers.Layer): self.num_channels = num_channels self.config = config - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( filters=self.hidden_size, kernel_size=patch_size, strides=patch_size, @@ -506,7 +505,7 @@ class TFGroupViTPatchEmbeddings(tf.keras.layers.Layer): f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -533,7 +532,7 @@ class TFGroupViTPatchEmbeddings(tf.keras.layers.Layer): # Adapted from transformers.vit.modeling_tf_vit.TFViTEmbeddings -class TFGroupViTVisionEmbeddings(tf.keras.layers.Layer): +class TFGroupViTVisionEmbeddings(keras.layers.Layer): """ Construct the position and patch embeddings. @@ -543,8 +542,8 @@ class TFGroupViTVisionEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.patch_embeddings = TFGroupViTPatchEmbeddings(config, name="patch_embeddings") - self.dropout = tf.keras.layers.Dropout(rate=config.dropout, name="dropout") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.dropout = keras.layers.Dropout(rate=config.dropout, name="dropout") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.config = config def build(self, input_shape=None): @@ -615,7 +614,7 @@ class TFGroupViTVisionEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->GroupViT -class TFGroupViTTextEmbeddings(tf.keras.layers.Layer): +class TFGroupViTTextEmbeddings(keras.layers.Layer): def __init__(self, config: GroupViTTextConfig, **kwargs): super().__init__(**kwargs) @@ -673,7 +672,7 @@ class TFGroupViTTextEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFGroupViTStage(tf.keras.layers.Layer): +class TFGroupViTStage(keras.layers.Layer): """This corresponds to the `GroupingLayer` class in the GroupViT implementation.""" def __init__( @@ -703,7 +702,7 @@ class TFGroupViTStage(tf.keras.layers.Layer): if num_prev_group_token > 0 and num_group_token > 0: self.group_projector = [ - tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="group_projector.0"), + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="group_projector.0"), TFGroupViTMixerMLP( config, num_prev_group_token, config.hidden_size // 2, num_group_token, name="group_projector.1" ), @@ -803,7 +802,7 @@ class TFGroupViTStage(tf.keras.layers.Layer): return outputs -class TFGroupViTMLP(tf.keras.layers.Layer): +class TFGroupViTMLP(keras.layers.Layer): def __init__( self, config: GroupViTVisionConfig, @@ -818,8 +817,8 @@ class TFGroupViTMLP(tf.keras.layers.Layer): hidden_size = hidden_size if hidden_size is not None else config.hidden_size intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size output_size = output_size if output_size is not None else hidden_size - self.fc1 = tf.keras.layers.Dense(intermediate_size, name="fc1") - self.fc2 = tf.keras.layers.Dense(output_size, name="fc2") + self.fc1 = keras.layers.Dense(intermediate_size, name="fc1") + self.fc2 = keras.layers.Dense(output_size, name="fc2") self.intermediate_size = intermediate_size self.hidden_size = hidden_size @@ -848,7 +847,7 @@ class TFGroupViTMixerMLP(TFGroupViTMLP): # Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPAttention -class TFGroupViTAttention(tf.keras.layers.Layer): +class TFGroupViTAttention(keras.layers.Layer): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: GroupViTConfig, **kwargs): @@ -869,19 +868,19 @@ class TFGroupViTAttention(tf.keras.layers.Layer): self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.q_proj = tf.keras.layers.Dense( + self.q_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj" ) - self.k_proj = tf.keras.layers.Dense( + self.k_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj" ) - self.v_proj = tf.keras.layers.Dense( + self.v_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout) + self.dropout = keras.layers.Dropout(rate=config.attention_dropout) - self.out_proj = tf.keras.layers.Dense( + self.out_proj = keras.layers.Dense( units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj" ) @@ -973,15 +972,15 @@ class TFGroupViTAttention(tf.keras.layers.Layer): # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT -class TFGroupViTEncoderLayer(tf.keras.layers.Layer): +class TFGroupViTEncoderLayer(keras.layers.Layer): def __init__(self, config: GroupViTConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.hidden_size self.self_attn = TFGroupViTAttention(config, name="self_attn") - self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") self.mlp = TFGroupViTMLP(config, name="mlp") - self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") def call( self, @@ -1043,7 +1042,7 @@ class TFGroupViTEncoderLayer(tf.keras.layers.Layer): # Adapted from transformers.models.clip.modeling_tf_clip.TFGroupViTTextEncoder -class TFGroupViTTextEncoder(tf.keras.layers.Layer): +class TFGroupViTTextEncoder(keras.layers.Layer): def __init__(self, config: GroupViTTextConfig, **kwargs): super().__init__(**kwargs) @@ -1096,7 +1095,7 @@ class TFGroupViTTextEncoder(tf.keras.layers.Layer): layer.build(None) -class TFGroupViTVisionEncoder(tf.keras.layers.Layer): +class TFGroupViTVisionEncoder(keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, **kwargs) -> None: super().__init__(**kwargs) @@ -1157,15 +1156,13 @@ class TFGroupViTVisionEncoder(tf.keras.layers.Layer): # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder -class TFGroupViTTextTransformer(tf.keras.layers.Layer): +class TFGroupViTTextTransformer(keras.layers.Layer): def __init__(self, config: GroupViTTextConfig, **kwargs): super().__init__(**kwargs) self.embeddings = TFGroupViTTextEmbeddings(config, name="embeddings") self.encoder = TFGroupViTTextEncoder(config, name="encoder") - self.final_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="final_layer_norm" - ) + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") # For `pooled_output` computation self.eos_token_id = config.eos_token_id @@ -1276,13 +1273,13 @@ class TFGroupViTTextTransformer(tf.keras.layers.Layer): # Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer -class TFGroupViTVisionTransformer(tf.keras.layers.Layer): +class TFGroupViTVisionTransformer(keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, **kwargs): super().__init__(**kwargs) self.embeddings = TFGroupViTVisionEmbeddings(config, name="embeddings") self.encoder = TFGroupViTVisionEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.embed_dim = config.hidden_size def call( @@ -1335,7 +1332,7 @@ class TFGroupViTVisionTransformer(tf.keras.layers.Layer): @keras_serializable # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextMainLayer with CLIP->GroupViT -class TFGroupViTTextMainLayer(tf.keras.layers.Layer): +class TFGroupViTTextMainLayer(keras.layers.Layer): config_class = GroupViTTextConfig def __init__(self, config: GroupViTTextConfig, **kwargs): @@ -1343,7 +1340,7 @@ class TFGroupViTTextMainLayer(tf.keras.layers.Layer): self.config = config self.text_model = TFGroupViTTextTransformer(config, name="text_model") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.text_model.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -1392,7 +1389,7 @@ class TFGroupViTTextMainLayer(tf.keras.layers.Layer): @keras_serializable # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPVisionMainLayer with CLIP->GroupViT -class TFGroupViTVisionMainLayer(tf.keras.layers.Layer): +class TFGroupViTVisionMainLayer(keras.layers.Layer): config_class = GroupViTVisionConfig def __init__(self, config: GroupViTVisionConfig, **kwargs): @@ -1400,7 +1397,7 @@ class TFGroupViTVisionMainLayer(tf.keras.layers.Layer): self.config = config self.vision_model = TFGroupViTVisionTransformer(config, name="vision_model") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.vision_model.embeddings @unpack_inputs @@ -1436,7 +1433,7 @@ class TFGroupViTVisionMainLayer(tf.keras.layers.Layer): @keras_serializable # Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPMainLayer -class TFGroupViTMainLayer(tf.keras.layers.Layer): +class TFGroupViTMainLayer(keras.layers.Layer): config_class = GroupViTConfig def __init__(self, config: GroupViTConfig, **kwargs): @@ -1468,22 +1465,22 @@ class TFGroupViTMainLayer(tf.keras.layers.Layer): self.vision_model = TFGroupViTVisionTransformer(vision_config, name="vision_model") self.visual_projection = [ - tf.keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"), - tf.keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.9, epsilon=1e-5), - tf.keras.layers.ReLU(name="visual_projection.2"), - tf.keras.layers.Dense(self.projection_dim, name="visual_projection.3"), + keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"), + keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.9, epsilon=1e-5), + keras.layers.ReLU(name="visual_projection.2"), + keras.layers.Dense(self.projection_dim, name="visual_projection.3"), ] self.text_projection = [ - tf.keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"), - tf.keras.layers.BatchNormalization(name="text_projection.1", momentum=0.9, epsilon=1e-5), - tf.keras.layers.ReLU(name="text_projection.2"), - tf.keras.layers.Dense(self.projection_dim, name="text_projection.3"), + keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"), + keras.layers.BatchNormalization(name="text_projection.1", momentum=0.9, epsilon=1e-5), + keras.layers.ReLU(name="text_projection.2"), + keras.layers.Dense(self.projection_dim, name="text_projection.3"), ] def build(self, input_shape=None): self.logit_scale = self.add_weight( shape=(1,), - initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), trainable=True, name="logit_scale", ) @@ -1718,7 +1715,7 @@ GROUPVIT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1729,7 +1726,7 @@ GROUPVIT_START_DOCSTRING = r""" - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. - This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the + This second option is useful when using [`keras.Model.fit`] method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index fc8e99e057..258763beb1 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -27,6 +27,7 @@ from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput from ...modeling_tf_utils import ( TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -169,7 +170,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNorm with Wav2Vec2->Hubert -class TFHubertGroupNorm(tf.keras.layers.Layer): +class TFHubertGroupNorm(keras.layers.Layer): """ From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization """ @@ -181,12 +182,12 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): epsilon: float = 1e-3, center: bool = True, scale: bool = True, - beta_initializer: tf.keras.initializers.Initializer = "zeros", - gamma_initializer: tf.keras.initializers.Initializer = "ones", - beta_regularizer: tf.keras.regularizers.Regularizer = None, - gamma_regularizer: tf.keras.regularizers.Regularizer = None, - beta_constraint: tf.keras.constraints.Constraint = None, - gamma_constraint: tf.keras.constraints.Constraint = None, + beta_initializer: keras.initializers.Initializer = "zeros", + gamma_initializer: keras.initializers.Initializer = "ones", + beta_regularizer: keras.regularizers.Regularizer = None, + gamma_regularizer: keras.regularizers.Regularizer = None, + beta_constraint: keras.constraints.Constraint = None, + gamma_constraint: keras.constraints.Constraint = None, **kwargs, ): super().__init__(**kwargs) @@ -196,12 +197,12 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): self.epsilon = epsilon self.center = center self.scale = scale - self.beta_initializer = tf.keras.initializers.get(beta_initializer) - self.gamma_initializer = tf.keras.initializers.get(gamma_initializer) - self.beta_regularizer = tf.keras.regularizers.get(beta_regularizer) - self.gamma_regularizer = tf.keras.regularizers.get(gamma_regularizer) - self.beta_constraint = tf.keras.constraints.get(beta_constraint) - self.gamma_constraint = tf.keras.constraints.get(gamma_constraint) + self.beta_initializer = keras.initializers.get(beta_initializer) + self.gamma_initializer = keras.initializers.get(gamma_initializer) + self.beta_regularizer = keras.regularizers.get(beta_regularizer) + self.gamma_regularizer = keras.regularizers.get(gamma_regularizer) + self.beta_constraint = keras.constraints.get(beta_constraint) + self.gamma_constraint = keras.constraints.get(gamma_constraint) self._check_axis() def build(self, input_shape): @@ -216,7 +217,7 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) + input_shape = keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape) @@ -238,12 +239,12 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): "epsilon": self.epsilon, "center": self.center, "scale": self.scale, - "beta_initializer": tf.keras.initializers.serialize(self.beta_initializer), - "gamma_initializer": tf.keras.initializers.serialize(self.gamma_initializer), - "beta_regularizer": tf.keras.regularizers.serialize(self.beta_regularizer), - "gamma_regularizer": tf.keras.regularizers.serialize(self.gamma_regularizer), - "beta_constraint": tf.keras.constraints.serialize(self.beta_constraint), - "gamma_constraint": tf.keras.constraints.serialize(self.gamma_constraint), + "beta_initializer": keras.initializers.serialize(self.beta_initializer), + "gamma_initializer": keras.initializers.serialize(self.gamma_initializer), + "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer), + "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer), + "beta_constraint": keras.constraints.serialize(self.beta_constraint), + "gamma_constraint": keras.constraints.serialize(self.gamma_constraint), } base_config = super().get_config() return {**base_config, **config} @@ -264,7 +265,7 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) + group_shape = keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -342,7 +343,7 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): def _create_input_spec(self, input_shape): dim = input_shape[self.axis] - self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) + self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): dim = input_shape[self.axis] @@ -386,7 +387,7 @@ class TFHubertGroupNorm(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2WeightNormConv1D with Wav2Vec2->Hubert -class TFHubertWeightNormConv1D(tf.keras.layers.Conv1D): +class TFHubertWeightNormConv1D(keras.layers.Conv1D): """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm""" def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs): @@ -443,13 +444,13 @@ class TFHubertWeightNormConv1D(tf.keras.layers.Conv1D): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2NoLayerNormConvLayer with Wav2Vec2->Hubert -class TFHubertNoLayerNormConvLayer(tf.keras.layers.Layer): +class TFHubertNoLayerNormConvLayer(keras.layers.Layer): def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None: super().__init__(**kwargs) self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] - self.conv = tf.keras.layers.Conv1D( + self.conv = keras.layers.Conv1D( filters=self.out_conv_dim, kernel_size=config.conv_kernel[layer_id], strides=config.conv_stride[layer_id], @@ -473,20 +474,20 @@ class TFHubertNoLayerNormConvLayer(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2LayerNormConvLayer with Wav2Vec2->Hubert -class TFHubertLayerNormConvLayer(tf.keras.layers.Layer): +class TFHubertLayerNormConvLayer(keras.layers.Layer): def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None: super().__init__(**kwargs) self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] - self.conv = tf.keras.layers.Conv1D( + self.conv = keras.layers.Conv1D( filters=self.out_conv_dim, kernel_size=config.conv_kernel[layer_id], strides=config.conv_stride[layer_id], use_bias=config.conv_bias, name="conv", ) - self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps) + self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps) self.activation = get_tf_activation(config.feat_extract_activation) def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -508,13 +509,13 @@ class TFHubertLayerNormConvLayer(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNormConvLayer with Wav2Vec2->Hubert -class TFHubertGroupNormConvLayer(tf.keras.layers.Layer): +class TFHubertGroupNormConvLayer(keras.layers.Layer): def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None: super().__init__(**kwargs) self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] - self.conv = tf.keras.layers.Conv1D( + self.conv = keras.layers.Conv1D( filters=self.out_conv_dim, kernel_size=config.conv_kernel[layer_id], strides=config.conv_stride[layer_id], @@ -543,7 +544,7 @@ class TFHubertGroupNormConvLayer(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2PositionalConvEmbedding with Wav2Vec2->Hubert -class TFHubertPositionalConvEmbedding(tf.keras.layers.Layer): +class TFHubertPositionalConvEmbedding(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs: Any) -> None: super().__init__(**kwargs) self.conv = TFHubertWeightNormConv1D( @@ -573,7 +574,7 @@ class TFHubertPositionalConvEmbedding(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2SamePadLayer with Wav2Vec2->Hubert -class TFHubertSamePadLayer(tf.keras.layers.Layer): +class TFHubertSamePadLayer(keras.layers.Layer): def __init__(self, num_conv_pos_embeddings, **kwargs): super().__init__(**kwargs) self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 @@ -584,7 +585,7 @@ class TFHubertSamePadLayer(tf.keras.layers.Layer): return hidden_states -class TFHubertFeatureEncoder(tf.keras.layers.Layer): +class TFHubertFeatureEncoder(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs: Any) -> None: super().__init__(**kwargs) @@ -630,18 +631,18 @@ class TFHubertFeatureExtractor(TFHubertFeatureEncoder): ) -class TFHubertFeatureProjection(tf.keras.layers.Layer): +class TFHubertFeatureProjection(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.projection = tf.keras.layers.Dense( + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.projection = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="projection", ) - self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout) + self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout) self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -663,7 +664,7 @@ class TFHubertFeatureProjection(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFHubert -class TFHubertAttention(tf.keras.layers.Layer): +class TFHubertAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -679,7 +680,7 @@ class TFHubertAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -689,10 +690,10 @@ class TFHubertAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -834,13 +835,13 @@ class TFHubertAttention(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2FeedForward with Wav2Vec2->Hubert -class TFHubertFeedForward(tf.keras.layers.Layer): +class TFHubertFeedForward(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) - self.intermediate_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout) - self.intermediate_dense = tf.keras.layers.Dense( + self.intermediate_dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", @@ -848,13 +849,13 @@ class TFHubertFeedForward(tf.keras.layers.Layer): ) self.intermediate_act_fn = get_tf_activation(config.hidden_act) - self.output_dense = tf.keras.layers.Dense( + self.output_dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="output_dense", ) - self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.output_dropout = keras.layers.Dropout(config.hidden_dropout) self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -879,7 +880,7 @@ class TFHubertFeedForward(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayer with Wav2Vec2->Hubert -class TFHubertEncoderLayer(tf.keras.layers.Layer): +class TFHubertEncoderLayer(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) self.attention = TFHubertAttention( @@ -889,12 +890,10 @@ class TFHubertEncoderLayer(tf.keras.layers.Layer): is_decoder=False, name="attention", ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.feed_forward = TFHubertFeedForward(config, name="feed_forward") - self.final_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="final_layer_norm" - ) + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") self.config = config def call( @@ -941,7 +940,7 @@ class TFHubertEncoderLayer(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayerStableLayerNorm with Wav2Vec2->Hubert -class TFHubertEncoderLayerStableLayerNorm(tf.keras.layers.Layer): +class TFHubertEncoderLayerStableLayerNorm(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) self.attention = TFHubertAttention( @@ -951,12 +950,10 @@ class TFHubertEncoderLayerStableLayerNorm(tf.keras.layers.Layer): is_decoder=False, name="attention", ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.feed_forward = TFHubertFeedForward(config, name="feed_forward") - self.final_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="final_layer_norm" - ) + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") self.config = config def call( @@ -1001,13 +998,13 @@ class TFHubertEncoderLayerStableLayerNorm(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2Encoder with Wav2Vec2->Hubert -class TFHubertEncoder(tf.keras.layers.Layer): +class TFHubertEncoder(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) self.config = config self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) self.layer = [TFHubertEncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)] def call( @@ -1082,13 +1079,13 @@ class TFHubertEncoder(tf.keras.layers.Layer): # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderStableLayerNorm with Wav2Vec2->Hubert -class TFHubertEncoderStableLayerNorm(tf.keras.layers.Layer): +class TFHubertEncoderStableLayerNorm(keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) self.config = config self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) self.layer = [ TFHubertEncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers) ] @@ -1165,7 +1162,7 @@ class TFHubertEncoderStableLayerNorm(tf.keras.layers.Layer): @keras_serializable -class TFHubertMainLayer(tf.keras.layers.Layer): +class TFHubertMainLayer(keras.layers.Layer): config_class = HubertConfig def __init__(self, config: HubertConfig, **kwargs): @@ -1339,7 +1336,7 @@ HUBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1522,8 +1519,8 @@ class TFHubertForCTC(TFHubertPreTrainedModel): super().__init__(config, *inputs, **kwargs) self.hubert = TFHubertMainLayer(config, name="hubert") - self.dropout = tf.keras.layers.Dropout(config.final_dropout) - self.lm_head = tf.keras.layers.Dense(config.vocab_size, name="lm_head") + self.dropout = keras.layers.Dropout(config.final_dropout) + self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head") self.output_hidden_size = ( config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size ) diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index f5edb52520..21e7c64069 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -41,6 +41,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -59,7 +60,7 @@ TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFLayoutLMEmbeddings(tf.keras.layers.Layer): +class TFLayoutLMEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: LayoutLMConfig, **kwargs): @@ -70,8 +71,8 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): self.max_position_embeddings = config.max_position_embeddings self.max_2d_position_embeddings = config.max_2d_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -194,7 +195,7 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->LayoutLM -class TFLayoutLMSelfAttention(tf.keras.layers.Layer): +class TFLayoutLMSelfAttention(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) @@ -209,16 +210,16 @@ class TFLayoutLMSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -327,15 +328,15 @@ class TFLayoutLMSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->LayoutLM -class TFLayoutLMSelfOutput(tf.keras.layers.Layer): +class TFLayoutLMSelfOutput(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -358,7 +359,7 @@ class TFLayoutLMSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->LayoutLM -class TFLayoutLMAttention(tf.keras.layers.Layer): +class TFLayoutLMAttention(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) @@ -410,11 +411,11 @@ class TFLayoutLMAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->LayoutLM -class TFLayoutLMIntermediate(tf.keras.layers.Layer): +class TFLayoutLMIntermediate(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -440,15 +441,15 @@ class TFLayoutLMIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->LayoutLM -class TFLayoutLMOutput(tf.keras.layers.Layer): +class TFLayoutLMOutput(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -471,7 +472,7 @@ class TFLayoutLMOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->LayoutLM -class TFLayoutLMLayer(tf.keras.layers.Layer): +class TFLayoutLMLayer(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) @@ -575,7 +576,7 @@ class TFLayoutLMLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->LayoutLM -class TFLayoutLMEncoder(tf.keras.layers.Layer): +class TFLayoutLMEncoder(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -654,11 +655,11 @@ class TFLayoutLMEncoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->LayoutLM -class TFLayoutLMPooler(tf.keras.layers.Layer): +class TFLayoutLMPooler(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -684,11 +685,11 @@ class TFLayoutLMPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->LayoutLM -class TFLayoutLMPredictionHeadTransform(tf.keras.layers.Layer): +class TFLayoutLMPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: LayoutLMConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -699,7 +700,7 @@ class TFLayoutLMPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -722,8 +723,8 @@ class TFLayoutLMPredictionHeadTransform(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->LayoutLM -class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: LayoutLMConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFLayoutLMLMPredictionHead(keras.layers.Layer): + def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -745,7 +746,7 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -771,8 +772,8 @@ class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->LayoutLM -class TFLayoutLMMLMHead(tf.keras.layers.Layer): - def __init__(self, config: LayoutLMConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFLayoutLMMLMHead(keras.layers.Layer): + def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFLayoutLMLMPredictionHead(config, input_embeddings, name="predictions") @@ -792,7 +793,7 @@ class TFLayoutLMMLMHead(tf.keras.layers.Layer): @keras_serializable -class TFLayoutLMMainLayer(tf.keras.layers.Layer): +class TFLayoutLMMainLayer(keras.layers.Layer): config_class = LayoutLMConfig def __init__(self, config: LayoutLMConfig, add_pooling_layer: bool = True, **kwargs): @@ -804,7 +805,7 @@ class TFLayoutLMMainLayer(tf.keras.layers.Layer): self.encoder = TFLayoutLMEncoder(config, name="encoder") self.pooler = TFLayoutLMPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -957,7 +958,7 @@ LAYOUTLM_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1161,7 +1162,7 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm") self.mlm = TFLayoutLMMLMHead(config, input_embeddings=self.layoutlm.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions def get_prefix_bias_name(self) -> str: @@ -1289,8 +1290,8 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC self.num_labels = config.num_labels self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1425,8 +1426,8 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif self.num_labels = config.num_labels self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1558,7 +1559,7 @@ class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnswer self.num_labels = config.num_labels self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py index 2ad140a78e..b52cfba54c 100644 --- a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -36,6 +36,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -65,7 +66,7 @@ TF_LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = [ LARGE_NEGATIVE = -1e8 -class TFLayoutLMv3PatchEmbeddings(tf.keras.layers.Layer): +class TFLayoutLMv3PatchEmbeddings(keras.layers.Layer): """LayoutLMv3 image (patch) embeddings.""" def __init__(self, config: LayoutLMv3Config, **kwargs): @@ -75,7 +76,7 @@ class TFLayoutLMv3PatchEmbeddings(tf.keras.layers.Layer): if isinstance(config.patch_size, collections.abc.Iterable) else (config.patch_size, config.patch_size) ) - self.proj = tf.keras.layers.Conv2D( + self.proj = keras.layers.Conv2D( filters=config.hidden_size, kernel_size=patch_sizes, strides=patch_sizes, @@ -90,7 +91,7 @@ class TFLayoutLMv3PatchEmbeddings(tf.keras.layers.Layer): self.config = config def call(self, pixel_values: tf.Tensor) -> tf.Tensor: - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1]) @@ -107,53 +108,53 @@ class TFLayoutLMv3PatchEmbeddings(tf.keras.layers.Layer): self.proj.build([None, None, None, self.config.num_channels]) -class TFLayoutLMv3TextEmbeddings(tf.keras.layers.Layer): +class TFLayoutLMv3TextEmbeddings(keras.layers.Layer): """ LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings. """ def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = tf.keras.layers.Embedding( + self.word_embeddings = keras.layers.Embedding( config.vocab_size, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), name="word_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( + self.token_type_embeddings = keras.layers.Embedding( config.type_vocab_size, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), name="token_type_embeddings", ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.padding_token_index = config.pad_token_id - self.position_embeddings = tf.keras.layers.Embedding( + self.position_embeddings = keras.layers.Embedding( config.max_position_embeddings, config.hidden_size, embeddings_initializer=get_initializer(config.initializer_range), name="position_embeddings", ) - self.x_position_embeddings = tf.keras.layers.Embedding( + self.x_position_embeddings = keras.layers.Embedding( config.max_2d_position_embeddings, config.coordinate_size, embeddings_initializer=get_initializer(config.initializer_range), name="x_position_embeddings", ) - self.y_position_embeddings = tf.keras.layers.Embedding( + self.y_position_embeddings = keras.layers.Embedding( config.max_2d_position_embeddings, config.coordinate_size, embeddings_initializer=get_initializer(config.initializer_range), name="y_position_embeddings", ) - self.h_position_embeddings = tf.keras.layers.Embedding( + self.h_position_embeddings = keras.layers.Embedding( config.max_2d_position_embeddings, config.shape_size, embeddings_initializer=get_initializer(config.initializer_range), name="h_position_embeddings", ) - self.w_position_embeddings = tf.keras.layers.Embedding( + self.w_position_embeddings = keras.layers.Embedding( config.max_2d_position_embeddings, config.shape_size, embeddings_initializer=get_initializer(config.initializer_range), @@ -300,7 +301,7 @@ class TFLayoutLMv3TextEmbeddings(tf.keras.layers.Layer): self.w_position_embeddings.build(None) -class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): +class TFLayoutLMv3SelfAttention(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -314,23 +315,23 @@ class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.attention_score_normaliser = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query", ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key", ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.has_relative_attention_bias = config.has_relative_attention_bias self.has_spatial_attention_bias = config.has_spatial_attention_bias self.config = config @@ -349,7 +350,7 @@ class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): def cogview_attention(self, attention_scores: tf.Tensor, alpha: Union[float, int] = 32): """ https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation - (PB-Relax). A replacement of the original tf.keras.layers.Softmax(axis=-1)(attention_scores). Seems the new + (PB-Relax). A replacement of the original keras.layers.Softmax(axis=-1)(attention_scores). Seems the new attention_probs will result in a slower speed and a little bias. Can use tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better. @@ -428,15 +429,15 @@ class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): # Copied from models.roberta.modeling_tf_roberta.TFRobertaSelfOutput -class TFLayoutLMv3SelfOutput(tf.keras.layers.Layer): +class TFLayoutLMv3SelfOutput(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -458,7 +459,7 @@ class TFLayoutLMv3SelfOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFLayoutLMv3Attention(tf.keras.layers.Layer): +class TFLayoutLMv3Attention(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) self.self_attention = TFLayoutLMv3SelfAttention(config, name="self") @@ -500,11 +501,11 @@ class TFLayoutLMv3Attention(tf.keras.layers.Layer): # Copied from models.roberta.modeling_tf_bert.TFRobertaIntermediate -class TFLayoutLMv3Intermediate(tf.keras.layers.Layer): +class TFLayoutLMv3Intermediate(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -530,15 +531,15 @@ class TFLayoutLMv3Intermediate(tf.keras.layers.Layer): # Copied from models.roberta.modeling_tf_bert.TFRobertaOutput -class TFLayoutLMv3Output(tf.keras.layers.Layer): +class TFLayoutLMv3Output(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -560,7 +561,7 @@ class TFLayoutLMv3Output(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFLayoutLMv3Layer(tf.keras.layers.Layer): +class TFLayoutLMv3Layer(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) self.attention = TFLayoutLMv3Attention(config, name="attention") @@ -608,7 +609,7 @@ class TFLayoutLMv3Layer(tf.keras.layers.Layer): self.bert_output.build(None) -class TFLayoutLMv3Encoder(tf.keras.layers.Layer): +class TFLayoutLMv3Encoder(keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) self.config = config @@ -620,7 +621,7 @@ class TFLayoutLMv3Encoder(tf.keras.layers.Layer): if self.has_relative_attention_bias: self.rel_pos_bins = config.rel_pos_bins self.max_rel_pos = config.max_rel_pos - self.rel_pos_bias = tf.keras.layers.Dense( + self.rel_pos_bias = keras.layers.Dense( units=config.num_attention_heads, kernel_initializer=get_initializer(config.initializer_range), use_bias=False, @@ -630,13 +631,13 @@ class TFLayoutLMv3Encoder(tf.keras.layers.Layer): if self.has_spatial_attention_bias: self.max_rel_2d_pos = config.max_rel_2d_pos self.rel_2d_pos_bins = config.rel_2d_pos_bins - self.rel_pos_x_bias = tf.keras.layers.Dense( + self.rel_pos_x_bias = keras.layers.Dense( units=config.num_attention_heads, kernel_initializer=get_initializer(config.initializer_range), use_bias=False, name="rel_pos_x_bias", ) - self.rel_pos_y_bias = tf.keras.layers.Dense( + self.rel_pos_y_bias = keras.layers.Dense( units=config.num_attention_heads, kernel_initializer=get_initializer(config.initializer_range), use_bias=False, @@ -670,7 +671,7 @@ class TFLayoutLMv3Encoder(tf.keras.layers.Layer): def _cal_pos_emb( self, - dense_layer: tf.keras.layers.Dense, + dense_layer: keras.layers.Dense, position_ids: tf.Tensor, num_buckets: int, max_distance: int, @@ -782,7 +783,7 @@ class TFLayoutLMv3Encoder(tf.keras.layers.Layer): @keras_serializable -class TFLayoutLMv3MainLayer(tf.keras.layers.Layer): +class TFLayoutLMv3MainLayer(keras.layers.Layer): config_class = LayoutLMv3Config def __init__(self, config: LayoutLMv3Config, **kwargs): @@ -795,14 +796,14 @@ class TFLayoutLMv3MainLayer(tf.keras.layers.Layer): if config.visual_embed: self.patch_embed = TFLayoutLMv3PatchEmbeddings(config, name="patch_embed") - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") if config.has_relative_attention_bias or config.has_spatial_attention_bias: image_size = config.input_size // config.patch_size self.init_visual_bbox(image_size=(image_size, image_size)) - self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="norm") + self.norm = keras.layers.LayerNormalization(epsilon=1e-6, name="norm") self.encoder = TFLayoutLMv3Encoder(config, name="encoder") @@ -846,7 +847,7 @@ class TFLayoutLMv3MainLayer(tf.keras.layers.Layer): with tf.name_scope(self.norm.name): self.norm.build([None, None, self.config.hidden_size]) - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings.word_embeddings def set_input_embeddings(self, value: tf.Variable): @@ -1141,7 +1142,7 @@ LAYOUTLMV3_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1339,14 +1340,14 @@ class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel): self.layoutlmv3.build(None) -class TFLayoutLMv3ClassificationHead(tf.keras.layers.Layer): +class TFLayoutLMv3ClassificationHead(keras.layers.Layer): """ Head for sentence-level classification tasks. Reference: RobertaClassificationHead """ def __init__(self, config: LayoutLMv3Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, activation="tanh", kernel_initializer=get_initializer(config.initializer_range), @@ -1355,11 +1356,11 @@ class TFLayoutLMv3ClassificationHead(tf.keras.layers.Layer): classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout( + self.dropout = keras.layers.Dropout( classifier_dropout, name="dropout", ) - self.out_proj = tf.keras.layers.Dense( + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj", @@ -1520,9 +1521,9 @@ class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenCla self.num_labels = config.num_labels self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") if config.num_labels < 10: - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index 95397cd18e..f64ed7758d 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -32,6 +32,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -113,7 +114,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFLEDLearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TFLEDLearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -131,7 +132,7 @@ class TFLEDLearnedPositionalEmbedding(tf.keras.layers.Embedding): # Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerSelfAttention with TFLongformer->TFLEDEncoder -class TFLEDEncoderSelfAttention(tf.keras.layers.Layer): +class TFLEDEncoderSelfAttention(keras.layers.Layer): def __init__(self, config, layer_id, **kwargs): super().__init__(**kwargs) self.config = config @@ -145,40 +146,40 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer): self.num_heads = config.num_attention_heads self.head_dim = int(config.hidden_size / config.num_attention_heads) self.embed_dim = config.hidden_size - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="query", ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="key", ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="value", ) # separate projection layers for tokens with global attention - self.query_global = tf.keras.layers.Dense( + self.query_global = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="query_global", ) - self.key_global = tf.keras.layers.Dense( + self.key_global = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="key_global", ) - self.value_global = tf.keras.layers.Dense( + self.value_global = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="value_global", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) - self.global_dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + self.global_dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.layer_id = layer_id attention_window = config.attention_window[self.layer_id] @@ -998,11 +999,11 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer): ) -class TFLEDEncoderAttention(tf.keras.layers.Layer): +class TFLEDEncoderAttention(keras.layers.Layer): def __init__(self, config, layer_id, **kwargs): super().__init__(**kwargs) self.longformer_self_attn = TFLEDEncoderSelfAttention(config, layer_id=layer_id, name="longformer_self_attn") - self.output_dense = tf.keras.layers.Dense(config.d_model, use_bias=True, name="output") + self.output_dense = keras.layers.Dense(config.d_model, use_bias=True, name="output") self.config = config def call(self, inputs, training=False): @@ -1037,7 +1038,7 @@ class TFLEDEncoderAttention(tf.keras.layers.Layer): self.output_dense.build([None, None, self.config.d_model]) -class TFLEDDecoderAttention(tf.keras.layers.Layer): +class TFLEDDecoderAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -1053,16 +1054,16 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -1205,18 +1206,18 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFLEDEncoderLayer(tf.keras.layers.Layer): +class TFLEDEncoderLayer(keras.layers.Layer): def __init__(self, config: LEDConfig, layer_id: int, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFLEDEncoderAttention(config, layer_id, name="self_attn") - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -1285,7 +1286,7 @@ class TFLEDEncoderLayer(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.embed_dim]) -class TFLEDDecoderLayer(tf.keras.layers.Layer): +class TFLEDDecoderLayer(keras.layers.Layer): def __init__(self, config: LEDConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -1296,11 +1297,11 @@ class TFLEDDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFLEDDecoderAttention( self.embed_dim, config.decoder_attention_heads, @@ -1308,10 +1309,10 @@ class TFLEDDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -1616,7 +1617,7 @@ LED_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1721,7 +1722,7 @@ LED_INPUTS_DOCSTRING = r""" @keras_serializable -class TFLEDEncoder(tf.keras.layers.Layer): +class TFLEDEncoder(keras.layers.Layer): config_class = LEDConfig """ Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a @@ -1731,10 +1732,10 @@ class TFLEDEncoder(tf.keras.layers.Layer): config: LEDConfig """ - def __init__(self, config: LEDConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: LEDConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) if config.encoder_layerdrop > 0: logger.warning("Layerdrop is currently disabled in TFLED models.") self.layerdrop = 0.0 @@ -1758,7 +1759,7 @@ class TFLEDEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFLEDEncoderLayer(config, i, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.embed_dim = config.d_model def get_embed_tokens(self): @@ -1991,7 +1992,7 @@ class TFLEDEncoder(tf.keras.layers.Layer): @keras_serializable -class TFLEDDecoder(tf.keras.layers.Layer): +class TFLEDDecoder(keras.layers.Layer): config_class = LEDConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFLEDDecoderLayer`] @@ -2001,7 +2002,7 @@ class TFLEDDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: LEDConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: LEDConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -2015,9 +2016,9 @@ class TFLEDDecoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFLEDDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def set_embed_tokens(self, embed_tokens): self.embed_tokens = embed_tokens @@ -2218,16 +2219,16 @@ class TFLEDDecoder(tf.keras.layers.Layer): @keras_serializable -class TFLEDMainLayer(tf.keras.layers.Layer): +class TFLEDMainLayer(keras.layers.Layer): config_class = LEDConfig def __init__(self, config: LEDConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="led.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -2434,9 +2435,9 @@ class TFLEDModel(TFLEDPreTrainedModel): # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ @@ -2635,9 +2636,7 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel): def hf_compute_loss(self, labels, logits): """CrossEntropyLoss that ignores pad tokens""" - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) if self.config.tf_legacy_loss: melted_labels = tf.reshape(labels, (-1,)) active_loss = tf.not_equal(melted_labels, self.config.pad_token_id) diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index c586157f9d..1cbfb28695 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -34,6 +34,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -416,7 +417,7 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Longformer -class TFLongformerLMHead(tf.keras.layers.Layer): +class TFLongformerLMHead(keras.layers.Layer): """Longformer Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): @@ -424,10 +425,10 @@ class TFLongformerLMHead(tf.keras.layers.Layer): self.config = config self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = get_tf_activation("gelu") # The output weights are the same as the input embeddings, but there is @@ -476,7 +477,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): return hidden_states -class TFLongformerEmbeddings(tf.keras.layers.Layer): +class TFLongformerEmbeddings(keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing and some extra casting. """ @@ -489,8 +490,8 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -583,11 +584,11 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Longformer -class TFLongformerIntermediate(tf.keras.layers.Layer): +class TFLongformerIntermediate(keras.layers.Layer): def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -613,15 +614,15 @@ class TFLongformerIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Longformer -class TFLongformerOutput(tf.keras.layers.Layer): +class TFLongformerOutput(keras.layers.Layer): def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -644,11 +645,11 @@ class TFLongformerOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Longformer -class TFLongformerPooler(tf.keras.layers.Layer): +class TFLongformerPooler(keras.layers.Layer): def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -674,15 +675,15 @@ class TFLongformerPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Longformer -class TFLongformerSelfOutput(tf.keras.layers.Layer): +class TFLongformerSelfOutput(keras.layers.Layer): def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -704,7 +705,7 @@ class TFLongformerSelfOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFLongformerSelfAttention(tf.keras.layers.Layer): +class TFLongformerSelfAttention(keras.layers.Layer): def __init__(self, config, layer_id, **kwargs): super().__init__(**kwargs) self.config = config @@ -718,40 +719,40 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): self.num_heads = config.num_attention_heads self.head_dim = int(config.hidden_size / config.num_attention_heads) self.embed_dim = config.hidden_size - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="query", ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="key", ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="value", ) # separate projection layers for tokens with global attention - self.query_global = tf.keras.layers.Dense( + self.query_global = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="query_global", ) - self.key_global = tf.keras.layers.Dense( + self.key_global = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="key_global", ) - self.value_global = tf.keras.layers.Dense( + self.value_global = keras.layers.Dense( self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="value_global", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) - self.global_dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + self.global_dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.layer_id = layer_id attention_window = config.attention_window[self.layer_id] @@ -1571,7 +1572,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ) -class TFLongformerAttention(tf.keras.layers.Layer): +class TFLongformerAttention(keras.layers.Layer): def __init__(self, config, layer_id=0, **kwargs): super().__init__(**kwargs) @@ -1612,7 +1613,7 @@ class TFLongformerAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFLongformerLayer(tf.keras.layers.Layer): +class TFLongformerLayer(keras.layers.Layer): def __init__(self, config, layer_id=0, **kwargs): super().__init__(**kwargs) @@ -1656,7 +1657,7 @@ class TFLongformerLayer(tf.keras.layers.Layer): self.longformer_output.build(None) -class TFLongformerEncoder(tf.keras.layers.Layer): +class TFLongformerEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -1744,7 +1745,7 @@ class TFLongformerEncoder(tf.keras.layers.Layer): @keras_serializable -class TFLongformerMainLayer(tf.keras.layers.Layer): +class TFLongformerMainLayer(keras.layers.Layer): config_class = LongformerConfig def __init__(self, config, add_pooling_layer=True, **kwargs): @@ -2006,7 +2007,7 @@ LONGFORMER_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -2288,7 +2289,7 @@ class TFLongformerForQuestionAnswering(TFLongformerPreTrainedModel, TFQuestionAn self.num_labels = config.num_labels self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", @@ -2414,19 +2415,19 @@ class TFLongformerForQuestionAnswering(TFLongformerPreTrainedModel, TFQuestionAn self.qa_outputs.build([None, None, self.config.hidden_size]) -class TFLongformerClassificationHead(tf.keras.layers.Layer): +class TFLongformerClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -2580,8 +2581,8 @@ class TFLongformerForMultipleChoice(TFLongformerPreTrainedModel, TFMultipleChoic super().__init__(config, *inputs, **kwargs) self.longformer = TFLongformerMainLayer(config, name="longformer") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -2708,8 +2709,8 @@ class TFLongformerForTokenClassification(TFLongformerPreTrainedModel, TFTokenCla self.num_labels = config.num_labels self.longformer = TFLongformerMainLayer(config=config, add_pooling_layer=False, name="longformer") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index af7b98fe60..22ce04a001 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, keras_serializable, shape_list, unpack_inputs, @@ -151,29 +152,27 @@ class TFLxmertForPreTrainingOutput(ModelOutput): cross_encoder_attentions: Tuple[tf.Tensor] | None = None -class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer): +class TFLxmertVisualFeatureEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) # Object feature encoding - self.visn_fc = tf.keras.layers.Dense( + self.visn_fc = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="visn_fc", ) - self.visn_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="visn_layer_norm" - ) + self.visn_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="visn_layer_norm") # Box position encoding - self.box_fc = tf.keras.layers.Dense( + self.box_fc = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="box_fc", ) - self.box_layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="box_layer_norm") + self.box_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="box_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.feat_dim = config.visual_feat_dim self.pos_dim = config.visual_pos_dim self.config = config @@ -208,7 +207,7 @@ class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer): self.box_layer_norm.build([None, None, self.config.hidden_size]) -class TFLxmertEmbeddings(tf.keras.layers.Layer): +class TFLxmertEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): @@ -218,8 +217,8 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -278,7 +277,7 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFLxmertAttention(tf.keras.layers.Layer): +class TFLxmertAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -292,23 +291,23 @@ class TFLxmertAttention(tf.keras.layers.Layer): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query", ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key", ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.ctx_dim = config.hidden_size self.config = config @@ -370,10 +369,10 @@ class TFLxmertAttention(tf.keras.layers.Layer): self.value.build([None, None, self.ctx_dim]) -class TFLxmertIntermediate(tf.keras.layers.Layer): +class TFLxmertIntermediate(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -398,17 +397,17 @@ class TFLxmertIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFLxmertOutput(tf.keras.layers.Layer): +class TFLxmertOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, input_tensor, training=False): @@ -429,16 +428,16 @@ class TFLxmertOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFLxmertAttentionOutput(tf.keras.layers.Layer): +class TFLxmertAttentionOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, input_tensor, training=False): @@ -459,7 +458,7 @@ class TFLxmertAttentionOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer): +class TFLxmertSelfAttentionLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.self = TFLxmertAttention(config, name="self") @@ -485,7 +484,7 @@ class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer): self.attention_output.build(None) -class TFLxmertCrossAttentionLayer(tf.keras.layers.Layer): +class TFLxmertCrossAttentionLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.att = TFLxmertAttention(config, name="att") @@ -518,7 +517,7 @@ class TFLxmertCrossAttentionLayer(tf.keras.layers.Layer): self.attention_output.build(None) -class TFLxmertLayer(tf.keras.layers.Layer): +class TFLxmertLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.attention = TFLxmertSelfAttentionLayer(config, name="attention") @@ -548,7 +547,7 @@ class TFLxmertLayer(tf.keras.layers.Layer): self.transformer_output.build(None) -class TFLxmertXLayer(tf.keras.layers.Layer): +class TFLxmertXLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.visual_attention = TFLxmertCrossAttentionLayer(config, name="visual_attention") @@ -679,7 +678,7 @@ class TFLxmertXLayer(tf.keras.layers.Layer): self.visn_output.build(None) -class TFLxmertEncoder(tf.keras.layers.Layer): +class TFLxmertEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -789,7 +788,7 @@ class TFLxmertEncoder(tf.keras.layers.Layer): @keras_serializable -class TFLxmertMainLayer(tf.keras.layers.Layer): +class TFLxmertMainLayer(keras.layers.Layer): config_class = LxmertConfig def __init__(self, config, **kwargs): @@ -991,7 +990,7 @@ LXMERT_START_DOCSTRING = r""" genome, using a combination of masked language modeling, region of interest feature regression, cross entropy loss for question answering attribute prediction, and object tag prediction. - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1145,10 +1144,10 @@ class TFLxmertModel(TFLxmertPreTrainedModel): self.lxmert.build(None) -class TFLxmertPooler(tf.keras.layers.Layer): +class TFLxmertPooler(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -1173,11 +1172,11 @@ class TFLxmertPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert -class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): +class TFLxmertPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: LxmertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -1188,7 +1187,7 @@ class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -1211,8 +1210,8 @@ class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert -class TFLxmertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFLxmertLMPredictionHead(keras.layers.Layer): + def __init__(self, config: LxmertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -1234,7 +1233,7 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -1260,8 +1259,8 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert -class TFLxmertMLMHead(tf.keras.layers.Layer): - def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFLxmertMLMHead(keras.layers.Layer): + def __init__(self, config: LxmertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions") @@ -1280,12 +1279,12 @@ class TFLxmertMLMHead(tf.keras.layers.Layer): self.predictions.build(None) -class TFLxmertPreTrainingHeads(tf.keras.layers.Layer): +class TFLxmertPreTrainingHeads(keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions") - self.seq_relationship = tf.keras.layers.Dense( + self.seq_relationship = keras.layers.Dense( 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship", @@ -1309,18 +1308,18 @@ class TFLxmertPreTrainingHeads(tf.keras.layers.Layer): self.seq_relationship.build([None, None, self.config.hidden_size]) -class TFLxmertVisualAnswerHead(tf.keras.layers.Layer): +class TFLxmertVisualAnswerHead(keras.layers.Layer): def __init__(self, config, num_labels, **kwargs): super().__init__(**kwargs) hid_dim = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( hid_dim * 2, kernel_initializer=get_initializer(config.initializer_range), name="logit_fc_._0", ) self.activation = get_tf_activation("gelu") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="logit_fc_._2") - self.dense_1 = tf.keras.layers.Dense( + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="logit_fc_._2") + self.dense_1 = keras.layers.Dense( num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logit_fc_._3", @@ -1350,7 +1349,7 @@ class TFLxmertVisualAnswerHead(tf.keras.layers.Layer): self.dense_1.build([None, None, self.hid_dim * 2]) -class TFLxmertVisualObjHead(tf.keras.layers.Layer): +class TFLxmertVisualObjHead(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.transform = TFLxmertPredictionHeadTransform(config, name="transform") @@ -1368,7 +1367,7 @@ class TFLxmertVisualObjHead(tf.keras.layers.Layer): # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder_dict = { - key: tf.keras.layers.Dense( + key: keras.layers.Dense( self.visual_losses[key]["num"], kernel_initializer=get_initializer(config.initializer_range), name=f"decoder_dict.{key}", @@ -1424,9 +1423,9 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel): # Loss functions self.loss_fcts = { - "l2": tf.keras.losses.Huber(delta=1.0, name="huber_loss"), - "visn_ce": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - "ce": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + "l2": keras.losses.Huber(delta=1.0, name="huber_loss"), + "visn_ce": keras.losses.SparseCategoricalCrossentropy(from_logits=True), + "ce": keras.losses.SparseCategoricalCrossentropy(from_logits=True), } visual_losses = {} diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 3dec9f537e..c6d5355f70 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -35,6 +35,7 @@ from ...modeling_tf_outputs import ( from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFPreTrainedModel, + keras, keras_serializable, unpack_inputs, ) @@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFMarianSinusoidalPositionalEmbedding(tf.keras.layers.Layer): +class TFMarianSinusoidalPositionalEmbedding(keras.layers.Layer): """This module produces sinusoidal positional embeddings of any length.""" def __init__(self, num_positions: int, embedding_dim: int, **kwargs): @@ -175,7 +176,7 @@ class TFMarianSinusoidalPositionalEmbedding(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Marian -class TFMarianAttention(tf.keras.layers.Layer): +class TFMarianAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -191,7 +192,7 @@ class TFMarianAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -201,10 +202,10 @@ class TFMarianAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -346,20 +347,20 @@ class TFMarianAttention(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->Marian -class TFMarianEncoderLayer(tf.keras.layers.Layer): +class TFMarianEncoderLayer(keras.layers.Layer): def __init__(self, config: MarianConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFMarianAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -424,7 +425,7 @@ class TFMarianEncoderLayer(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->Marian -class TFMarianDecoderLayer(tf.keras.layers.Layer): +class TFMarianDecoderLayer(keras.layers.Layer): def __init__(self, config: MarianConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -435,11 +436,11 @@ class TFMarianDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFMarianAttention( self.embed_dim, config.decoder_attention_heads, @@ -447,10 +448,10 @@ class TFMarianDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -570,7 +571,7 @@ MARIAN_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -713,7 +714,7 @@ MARIAN_INPUTS_DOCSTRING = r""" @keras_serializable -class TFMarianEncoder(tf.keras.layers.Layer): +class TFMarianEncoder(keras.layers.Layer): config_class = MarianConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -723,10 +724,10 @@ class TFMarianEncoder(tf.keras.layers.Layer): config: MarianConfig """ - def __init__(self, config: MarianConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: MarianConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -880,7 +881,7 @@ class TFMarianEncoder(tf.keras.layers.Layer): @keras_serializable -class TFMarianDecoder(tf.keras.layers.Layer): +class TFMarianDecoder(keras.layers.Layer): config_class = MarianConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFMarianDecoderLayer`] @@ -890,7 +891,7 @@ class TFMarianDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: MarianConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: MarianConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -904,7 +905,7 @@ class TFMarianDecoder(tf.keras.layers.Layer): self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TFMarianDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -1116,17 +1117,17 @@ class TFMarianDecoder(tf.keras.layers.Layer): @keras_serializable -class TFMarianMainLayer(tf.keras.layers.Layer): +class TFMarianMainLayer(keras.layers.Layer): config_class = MarianConfig def __init__(self, config: MarianConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1338,9 +1339,9 @@ class TFMarianModel(TFMarianPreTrainedModel): # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 8ba2e602bf..2c134b520d 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -35,6 +35,7 @@ from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFModelInputType, TFPreTrainedModel, + keras, keras_serializable, unpack_inputs, ) @@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # Copied from transformers.models.bart.modeling_tf_bart.TFBartLearnedPositionalEmbedding with Bart->MBart -class TFMBartLearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TFMBartLearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -144,7 +145,7 @@ class TFMBartLearnedPositionalEmbedding(tf.keras.layers.Embedding): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->MBart -class TFMBartAttention(tf.keras.layers.Layer): +class TFMBartAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -160,7 +161,7 @@ class TFMBartAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -170,10 +171,10 @@ class TFMBartAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -314,20 +315,20 @@ class TFMBartAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFMBartEncoderLayer(tf.keras.layers.Layer): +class TFMBartEncoderLayer(keras.layers.Layer): def __init__(self, config: MBartConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFMBartAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -391,7 +392,7 @@ class TFMBartEncoderLayer(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.embed_dim]) -class TFMBartDecoderLayer(tf.keras.layers.Layer): +class TFMBartDecoderLayer(keras.layers.Layer): def __init__(self, config: MBartConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -402,11 +403,11 @@ class TFMBartDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFMBartAttention( self.embed_dim, config.decoder_attention_heads, @@ -414,10 +415,10 @@ class TFMBartDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -537,7 +538,7 @@ MBART_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -703,7 +704,7 @@ MBART_GENERATION_EXAMPLE = r""" @keras_serializable -class TFMBartEncoder(tf.keras.layers.Layer): +class TFMBartEncoder(keras.layers.Layer): config_class = MBartConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -713,10 +714,10 @@ class TFMBartEncoder(tf.keras.layers.Layer): config: MBartConfig """ - def __init__(self, config: MBartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: MBartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -729,8 +730,8 @@ class TFMBartEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFMBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") self.embed_dim = config.d_model def get_embed_tokens(self): @@ -882,7 +883,7 @@ class TFMBartEncoder(tf.keras.layers.Layer): @keras_serializable -class TFMBartDecoder(tf.keras.layers.Layer): +class TFMBartDecoder(keras.layers.Layer): config_class = MBartConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFMBartDecoderLayer`] @@ -892,7 +893,7 @@ class TFMBartDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: MBartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: MBartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -905,10 +906,10 @@ class TFMBartDecoder(tf.keras.layers.Layer): ) self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TFMBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -1131,17 +1132,17 @@ class TFMBartDecoder(tf.keras.layers.Layer): @keras_serializable -class TFMBartMainLayer(tf.keras.layers.Layer): +class TFMBartMainLayer(keras.layers.Layer): config_class = MBartConfig def __init__(self, config: MBartConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1356,9 +1357,9 @@ class TFMBartModel(TFMBartPreTrainedModel): # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 7f40a6271e..6ccc996557 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -46,6 +46,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -98,9 +99,7 @@ class TFMobileBertPreTrainingLoss: """ def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0]) @@ -120,11 +119,11 @@ class TFMobileBertPreTrainingLoss: return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,)) -class TFMobileBertIntermediate(tf.keras.layers.Layer): +class TFMobileBertIntermediate(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.intermediate_size, name="dense") + self.dense = keras.layers.Dense(config.intermediate_size, name="dense") if isinstance(config.hidden_act, str): self.intermediate_act_fn = get_tf_activation(config.hidden_act) @@ -147,7 +146,7 @@ class TFMobileBertIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.true_hidden_size]) -class TFLayerNorm(tf.keras.layers.LayerNormalization): +class TFLayerNorm(keras.layers.LayerNormalization): def __init__(self, feat_size, *args, **kwargs): self.feat_size = feat_size super().__init__(*args, **kwargs) @@ -156,7 +155,7 @@ class TFLayerNorm(tf.keras.layers.LayerNormalization): super().build([None, None, self.feat_size]) -class TFNoNorm(tf.keras.layers.Layer): +class TFNoNorm(keras.layers.Layer): def __init__(self, feat_size, epsilon=None, **kwargs): super().__init__(**kwargs) self.feat_size = feat_size @@ -173,7 +172,7 @@ class TFNoNorm(tf.keras.layers.Layer): NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm} -class TFMobileBertEmbeddings(tf.keras.layers.Layer): +class TFMobileBertEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): @@ -185,14 +184,14 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation") + self.embedding_transformation = keras.layers.Dense(config.hidden_size, name="embedding_transformation") # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = NORM2FN[config.normalization_type]( config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.embedded_input_size = self.embedding_size * (3 if self.trigram_input else 1) def build(self, input_shape=None): @@ -277,7 +276,7 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFMobileBertSelfAttention(tf.keras.layers.Layer): +class TFMobileBertSelfAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -292,17 +291,17 @@ class TFMobileBertSelfAttention(tf.keras.layers.Layer): self.attention_head_size = int(config.true_hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.config = config def transpose_for_scores(self, x, batch_size): @@ -378,18 +377,18 @@ class TFMobileBertSelfAttention(tf.keras.layers.Layer): ) -class TFMobileBertSelfOutput(tf.keras.layers.Layer): +class TFMobileBertSelfOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.use_bottleneck = config.use_bottleneck - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.true_hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = NORM2FN[config.normalization_type]( config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) if not self.use_bottleneck: - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, residual_tensor, training=False): @@ -411,7 +410,7 @@ class TFMobileBertSelfOutput(tf.keras.layers.Layer): self.LayerNorm.build(None) -class TFMobileBertAttention(tf.keras.layers.Layer): +class TFMobileBertAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.self = TFMobileBertSelfAttention(config, name="self") @@ -451,14 +450,14 @@ class TFMobileBertAttention(tf.keras.layers.Layer): self.mobilebert_output.build(None) -class TFOutputBottleneck(tf.keras.layers.Layer): +class TFOutputBottleneck(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") + self.dense = keras.layers.Dense(config.hidden_size, name="dense") self.LayerNorm = NORM2FN[config.normalization_type]( config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def call(self, hidden_states, residual_tensor, training=False): @@ -479,18 +478,18 @@ class TFOutputBottleneck(tf.keras.layers.Layer): self.LayerNorm.build(None) -class TFMobileBertOutput(tf.keras.layers.Layer): +class TFMobileBertOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.use_bottleneck = config.use_bottleneck - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.true_hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = NORM2FN[config.normalization_type]( config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) if not self.use_bottleneck: - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) else: self.bottleneck = TFOutputBottleneck(config, name="bottleneck") self.config = config @@ -520,10 +519,10 @@ class TFMobileBertOutput(tf.keras.layers.Layer): self.bottleneck.build(None) -class TFBottleneckLayer(tf.keras.layers.Layer): +class TFBottleneckLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.intra_bottleneck_size, name="dense") + self.dense = keras.layers.Dense(config.intra_bottleneck_size, name="dense") self.LayerNorm = NORM2FN[config.normalization_type]( config.intra_bottleneck_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) @@ -546,7 +545,7 @@ class TFBottleneckLayer(tf.keras.layers.Layer): self.LayerNorm.build(None) -class TFBottleneck(tf.keras.layers.Layer): +class TFBottleneck(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.key_query_shared_bottleneck = config.key_query_shared_bottleneck @@ -593,10 +592,10 @@ class TFBottleneck(tf.keras.layers.Layer): self.attention.build(None) -class TFFFNOutput(tf.keras.layers.Layer): +class TFFFNOutput(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.true_hidden_size, name="dense") + self.dense = keras.layers.Dense(config.true_hidden_size, name="dense") self.LayerNorm = NORM2FN[config.normalization_type]( config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) @@ -619,7 +618,7 @@ class TFFFNOutput(tf.keras.layers.Layer): self.LayerNorm.build(None) -class TFFFNLayer(tf.keras.layers.Layer): +class TFFFNLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.intermediate = TFMobileBertIntermediate(config, name="intermediate") @@ -642,7 +641,7 @@ class TFFFNLayer(tf.keras.layers.Layer): self.mobilebert_output.build(None) -class TFMobileBertLayer(tf.keras.layers.Layer): +class TFMobileBertLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.use_bottleneck = config.use_bottleneck @@ -723,7 +722,7 @@ class TFMobileBertLayer(tf.keras.layers.Layer): layer.build(None) -class TFMobileBertEncoder(tf.keras.layers.Layer): +class TFMobileBertEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions @@ -775,12 +774,12 @@ class TFMobileBertEncoder(tf.keras.layers.Layer): layer.build(None) -class TFMobileBertPooler(tf.keras.layers.Layer): +class TFMobileBertPooler(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.do_activate = config.classifier_activation if self.do_activate: - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -807,10 +806,10 @@ class TFMobileBertPooler(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFMobileBertPredictionHeadTransform(tf.keras.layers.Layer): +class TFMobileBertPredictionHeadTransform(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -838,7 +837,7 @@ class TFMobileBertPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm.build(None) -class TFMobileBertLMPredictionHead(tf.keras.layers.Layer): +class TFMobileBertLMPredictionHead(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.transform = TFMobileBertPredictionHeadTransform(config, name="transform") @@ -887,7 +886,7 @@ class TFMobileBertLMPredictionHead(tf.keras.layers.Layer): return hidden_states -class TFMobileBertMLMHead(tf.keras.layers.Layer): +class TFMobileBertMLMHead(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.predictions = TFMobileBertLMPredictionHead(config, name="predictions") @@ -906,7 +905,7 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer): @keras_serializable -class TFMobileBertMainLayer(tf.keras.layers.Layer): +class TFMobileBertMainLayer(keras.layers.Layer): config_class = MobileBertConfig def __init__(self, config, add_pooling_layer=True, **kwargs): @@ -1082,7 +1081,7 @@ MOBILEBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1434,10 +1433,10 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel return (tf_weight,) -class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): +class TFMobileBertOnlyNSPHead(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship") + self.seq_relationship = keras.layers.Dense(2, name="seq_relationship") self.config = config def call(self, pooled_output): @@ -1571,8 +1570,8 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1670,7 +1669,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn self.num_labels = config.num_labels self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config @@ -1780,8 +1779,8 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic super().__init__(config, *inputs, **kwargs) self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1898,8 +1897,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py index 9493172329..2024979936 100644 --- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -35,7 +35,13 @@ from ...modeling_tf_outputs import ( TFImageClassifierOutputWithNoAttention, TFSemanticSegmenterOutputWithNoAttention, ) -from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs +from ...modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_mobilevit import MobileViTConfig @@ -81,7 +87,7 @@ def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None return int(new_value) -class TFMobileViTConvLayer(tf.keras.layers.Layer): +class TFMobileViTConvLayer(keras.layers.Layer): def __init__( self, config: MobileViTConfig, @@ -103,12 +109,12 @@ class TFMobileViTConvLayer(tf.keras.layers.Layer): ) padding = int((kernel_size - 1) / 2) * dilation - self.padding = tf.keras.layers.ZeroPadding2D(padding) + self.padding = keras.layers.ZeroPadding2D(padding) if out_channels % groups != 0: raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.") - self.convolution = tf.keras.layers.Conv2D( + self.convolution = keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, @@ -120,7 +126,7 @@ class TFMobileViTConvLayer(tf.keras.layers.Layer): ) if use_normalization: - self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization") + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization") else: self.normalization = None @@ -158,7 +164,7 @@ class TFMobileViTConvLayer(tf.keras.layers.Layer): self.normalization.build([None, None, None, self.out_channels]) -class TFMobileViTInvertedResidual(tf.keras.layers.Layer): +class TFMobileViTInvertedResidual(keras.layers.Layer): """ Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 """ @@ -222,7 +228,7 @@ class TFMobileViTInvertedResidual(tf.keras.layers.Layer): self.reduce_1x1.build(None) -class TFMobileViTMobileNetLayer(tf.keras.layers.Layer): +class TFMobileViTMobileNetLayer(keras.layers.Layer): def __init__( self, config: MobileViTConfig, @@ -261,7 +267,7 @@ class TFMobileViTMobileNetLayer(tf.keras.layers.Layer): layer_module.build(None) -class TFMobileViTSelfAttention(tf.keras.layers.Layer): +class TFMobileViTSelfAttention(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: super().__init__(**kwargs) @@ -277,11 +283,11 @@ class TFMobileViTSelfAttention(tf.keras.layers.Layer): scale = tf.cast(self.attention_head_size, dtype=tf.float32) self.scale = tf.math.sqrt(scale) - self.query = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="query") - self.key = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="key") - self.value = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="value") + self.query = keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="query") + self.key = keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="key") + self.value = keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="value") - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.hidden_size = hidden_size def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: @@ -328,11 +334,11 @@ class TFMobileViTSelfAttention(tf.keras.layers.Layer): self.value.build([None, None, self.hidden_size]) -class TFMobileViTSelfOutput(tf.keras.layers.Layer): +class TFMobileViTSelfOutput(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(hidden_size, name="dense") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dense = keras.layers.Dense(hidden_size, name="dense") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -349,7 +355,7 @@ class TFMobileViTSelfOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.hidden_size]) -class TFMobileViTAttention(tf.keras.layers.Layer): +class TFMobileViTAttention(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: super().__init__(**kwargs) self.attention = TFMobileViTSelfAttention(config, hidden_size, name="attention") @@ -375,10 +381,10 @@ class TFMobileViTAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFMobileViTIntermediate(tf.keras.layers.Layer): +class TFMobileViTIntermediate(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(intermediate_size, name="dense") + self.dense = keras.layers.Dense(intermediate_size, name="dense") if isinstance(config.hidden_act, str): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: @@ -399,11 +405,11 @@ class TFMobileViTIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.hidden_size]) -class TFMobileViTOutput(tf.keras.layers.Layer): +class TFMobileViTOutput(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(hidden_size, name="dense") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dense = keras.layers.Dense(hidden_size, name="dense") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.intermediate_size = intermediate_size def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -421,18 +427,14 @@ class TFMobileViTOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.intermediate_size]) -class TFMobileViTTransformerLayer(tf.keras.layers.Layer): +class TFMobileViTTransformerLayer(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: super().__init__(**kwargs) self.attention = TFMobileViTAttention(config, hidden_size, name="attention") self.intermediate = TFMobileViTIntermediate(config, hidden_size, intermediate_size, name="intermediate") self.mobilevit_output = TFMobileViTOutput(config, hidden_size, intermediate_size, name="output") - self.layernorm_before = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_before" - ) - self.layernorm_after = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_after" - ) + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -465,7 +467,7 @@ class TFMobileViTTransformerLayer(tf.keras.layers.Layer): self.layernorm_after.build([None, None, self.hidden_size]) -class TFMobileViTTransformer(tf.keras.layers.Layer): +class TFMobileViTTransformer(keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, num_stages: int, **kwargs) -> None: super().__init__(**kwargs) @@ -494,7 +496,7 @@ class TFMobileViTTransformer(tf.keras.layers.Layer): layer_module.build(None) -class TFMobileViTLayer(tf.keras.layers.Layer): +class TFMobileViTLayer(keras.layers.Layer): """ MobileViT block: https://arxiv.org/abs/2110.02178 """ @@ -549,7 +551,7 @@ class TFMobileViTLayer(tf.keras.layers.Layer): config, hidden_size=hidden_size, num_stages=num_stages, name="transformer" ) - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.conv_projection = TFMobileViTConvLayer( config, in_channels=hidden_size, out_channels=in_channels, kernel_size=1, name="conv_projection" @@ -688,7 +690,7 @@ class TFMobileViTLayer(tf.keras.layers.Layer): self.downsampling_layer.build(None) -class TFMobileViTEncoder(tf.keras.layers.Layer): +class TFMobileViTEncoder(keras.layers.Layer): def __init__(self, config: MobileViTConfig, **kwargs) -> None: super().__init__(**kwargs) self.config = config @@ -798,7 +800,7 @@ class TFMobileViTEncoder(tf.keras.layers.Layer): @keras_serializable -class TFMobileViTMainLayer(tf.keras.layers.Layer): +class TFMobileViTMainLayer(keras.layers.Layer): config_class = MobileViTConfig def __init__(self, config: MobileViTConfig, expand_output: bool = True, **kwargs): @@ -826,7 +828,7 @@ class TFMobileViTMainLayer(tf.keras.layers.Layer): name="conv_1x1_exp", ) - self.pooler = tf.keras.layers.GlobalAveragePooling2D(data_format="channels_first", name="pooler") + self.pooler = keras.layers.GlobalAveragePooling2D(data_format="channels_first", name="pooler") def _prune_heads(self, heads_to_prune): """ @@ -848,7 +850,7 @@ class TFMobileViTMainLayer(tf.keras.layers.Layer): ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -931,7 +933,7 @@ MOBILEVIT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1038,9 +1040,9 @@ class TFMobileViTForImageClassification(TFMobileViTPreTrainedModel, TFSequenceCl self.mobilevit = TFMobileViTMainLayer(config, name="mobilevit") # Classifier head - self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) + self.dropout = keras.layers.Dropout(config.classifier_dropout_prob) self.classifier = ( - tf.keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 else tf.identity + keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 else tf.identity ) self.config = config @@ -1096,11 +1098,11 @@ class TFMobileViTForImageClassification(TFMobileViTPreTrainedModel, TFSequenceCl self.classifier.build([None, None, self.config.neck_hidden_sizes[-1]]) -class TFMobileViTASPPPooling(tf.keras.layers.Layer): +class TFMobileViTASPPPooling(keras.layers.Layer): def __init__(self, config: MobileViTConfig, in_channels: int, out_channels: int, **kwargs) -> None: super().__init__(**kwargs) - self.global_pool = tf.keras.layers.GlobalAveragePooling2D(keepdims=True, name="global_pool") + self.global_pool = keras.layers.GlobalAveragePooling2D(keepdims=True, name="global_pool") self.conv_1x1 = TFMobileViTConvLayer( config, @@ -1132,7 +1134,7 @@ class TFMobileViTASPPPooling(tf.keras.layers.Layer): self.conv_1x1.build(None) -class TFMobileViTASPP(tf.keras.layers.Layer): +class TFMobileViTASPP(keras.layers.Layer): """ ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587 """ @@ -1187,7 +1189,7 @@ class TFMobileViTASPP(tf.keras.layers.Layer): name="project", ) - self.dropout = tf.keras.layers.Dropout(config.aspp_dropout_prob) + self.dropout = keras.layers.Dropout(config.aspp_dropout_prob) def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: # since the hidden states were transposed to have `(batch_size, channels, height, width)` @@ -1215,7 +1217,7 @@ class TFMobileViTASPP(tf.keras.layers.Layer): conv.build(None) -class TFMobileViTDeepLabV3(tf.keras.layers.Layer): +class TFMobileViTDeepLabV3(keras.layers.Layer): """ DeepLabv3 architecture: https://arxiv.org/abs/1706.05587 """ @@ -1224,7 +1226,7 @@ class TFMobileViTDeepLabV3(tf.keras.layers.Layer): super().__init__(**kwargs) self.aspp = TFMobileViTASPP(config, name="aspp") - self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) + self.dropout = keras.layers.Dropout(config.classifier_dropout_prob) self.classifier = TFMobileViTConvLayer( config, @@ -1276,7 +1278,7 @@ class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel): upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear") # compute weighted loss - loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") + loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") def masked_loss(real, pred): unmasked_loss = loss_fct(real, pred) diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 589c706b7f..fe2825c76c 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -44,6 +44,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -77,7 +78,7 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel): base_model_prefix = "mpnet" -class TFMPNetEmbeddings(tf.keras.layers.Layer): +class TFMPNetEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position embeddings.""" def __init__(self, config, **kwargs): @@ -88,8 +89,8 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -160,11 +161,11 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->MPNet -class TFMPNetPooler(tf.keras.layers.Layer): +class TFMPNetPooler(keras.layers.Layer): def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -189,7 +190,7 @@ class TFMPNetPooler(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFMPNetSelfAttention(tf.keras.layers.Layer): +class TFMPNetSelfAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -204,19 +205,19 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.q = tf.keras.layers.Dense( + self.q = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="q" ) - self.k = tf.keras.layers.Dense( + self.k = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="k" ) - self.v = tf.keras.layers.Dense( + self.v = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="v" ) - self.o = tf.keras.layers.Dense( + self.o = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o" ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.config = config def transpose_for_scores(self, x, batch_size): @@ -280,13 +281,13 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): self.o.build([None, None, self.config.hidden_size]) -class TFMPNetAttention(tf.keras.layers.Layer): +class TFMPNetAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.attn = TFMPNetSelfAttention(config, name="attn") - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.config = config def prune_heads(self, heads): @@ -313,11 +314,11 @@ class TFMPNetAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet -class TFMPNetIntermediate(tf.keras.layers.Layer): +class TFMPNetIntermediate(keras.layers.Layer): def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -343,15 +344,15 @@ class TFMPNetIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet -class TFMPNetOutput(tf.keras.layers.Layer): +class TFMPNetOutput(keras.layers.Layer): def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -373,7 +374,7 @@ class TFMPNetOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFMPNetLayer(tf.keras.layers.Layer): +class TFMPNetLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -409,7 +410,7 @@ class TFMPNetLayer(tf.keras.layers.Layer): self.out.build(None) -class TFMPNetEncoder(tf.keras.layers.Layer): +class TFMPNetEncoder(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -526,7 +527,7 @@ class TFMPNetEncoder(tf.keras.layers.Layer): @keras_serializable -class TFMPNetMainLayer(tf.keras.layers.Layer): +class TFMPNetMainLayer(keras.layers.Layer): config_class = MPNetConfig def __init__(self, config, **kwargs): @@ -544,7 +545,7 @@ class TFMPNetMainLayer(tf.keras.layers.Layer): self.embeddings = TFMPNetEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings @@ -666,7 +667,7 @@ MPNET_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -800,7 +801,7 @@ class TFMPNetModel(TFMPNetPreTrainedModel): self.mpnet.build(None) -class TFMPNetLMHead(tf.keras.layers.Layer): +class TFMPNetLMHead(keras.layers.Layer): """MPNet head for masked and permuted language modeling""" def __init__(self, config, input_embeddings, **kwargs): @@ -808,10 +809,10 @@ class TFMPNetLMHead(tf.keras.layers.Layer): self.config = config self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = get_tf_activation("gelu") # The output weights are the same as the input embeddings, but there is @@ -942,19 +943,19 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): self.lm_head.build(None) -class TFMPNetClassificationHead(tf.keras.layers.Layer): +class TFMPNetClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -1074,8 +1075,8 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss): super().__init__(config, *inputs, **kwargs) self.mpnet = TFMPNetMainLayer(config, name="mpnet") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1175,8 +1176,8 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio self.num_labels = config.num_labels self.mpnet = TFMPNetMainLayer(config, name="mpnet") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1261,7 +1262,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos self.num_labels = config.num_labels self.mpnet = TFMPNetMainLayer(config, name="mpnet") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index ea9651c6a0..8c213bcebd 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -34,6 +34,7 @@ from ...modeling_tf_utils import ( TFSequenceSummary, TFSharedEmbeddings, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -60,7 +61,7 @@ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFAttention(tf.keras.layers.Layer): +class TFAttention(keras.layers.Layer): def __init__(self, nx, config, scale=False, **kwargs): super().__init__(**kwargs) @@ -76,8 +77,8 @@ class TFAttention(tf.keras.layers.Layer): self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") - self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) - self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.attn_dropout = keras.layers.Dropout(config.attn_pdrop) + self.resid_dropout = keras.layers.Dropout(config.resid_pdrop) self.n_state = n_state self.pruned_heads = set() @@ -166,14 +167,14 @@ class TFAttention(tf.keras.layers.Layer): self.c_proj.build([None, None, self.n_state]) -class TFMLP(tf.keras.layers.Layer): +class TFMLP(keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = get_tf_activation("gelu") - self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.dropout = keras.layers.Dropout(config.resid_pdrop) self.nx = nx self.n_state = n_state @@ -195,14 +196,14 @@ class TFMLP(tf.keras.layers.Layer): self.c_proj.build([None, None, self.nx]) -class TFBlock(tf.keras.layers.Layer): +class TFBlock(keras.layers.Layer): def __init__(self, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.attn = TFAttention(nx, config, scale, name="attn") - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.mlp = TFMLP(4 * nx, config, name="mlp") - self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") + self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") self.nx = nx def call(self, x, attention_mask, head_mask, output_attentions, training=False): @@ -235,7 +236,7 @@ class TFBlock(tf.keras.layers.Layer): @keras_serializable -class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): +class TFOpenAIGPTMainLayer(keras.layers.Layer): config_class = OpenAIGPTConfig def __init__(self, config, *inputs, **kwargs): @@ -253,7 +254,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): self.tokens_embed = TFSharedEmbeddings( config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed" ) - self.drop = tf.keras.layers.Dropout(config.embd_pdrop) + self.drop = keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] def build(self, input_shape=None): @@ -445,7 +446,7 @@ OPENAI_GPT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -833,7 +834,7 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.score = tf.keras.layers.Dense( + self.score = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="score", diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index e435808ec1..8dbad97e08 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, TFSharedEmbeddings, + keras, keras_serializable, unpack_inputs, ) @@ -91,7 +92,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFOPTLearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TFOPTLearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -116,7 +117,7 @@ class TFOPTLearnedPositionalEmbedding(tf.keras.layers.Embedding): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->OPT -class TFOPTAttention(tf.keras.layers.Layer): +class TFOPTAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -132,7 +133,7 @@ class TFOPTAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -142,10 +143,10 @@ class TFOPTAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -286,7 +287,7 @@ class TFOPTAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFOPTDecoderLayer(tf.keras.layers.Layer): +class TFOPTDecoderLayer(keras.layers.Layer): def __init__(self, config: OPTConfig, **kwargs): super().__init__(**kwargs) self.do_layer_norm_before = config.do_layer_norm_before @@ -298,13 +299,13 @@ class TFOPTDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -398,7 +399,7 @@ OPT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -499,7 +500,7 @@ OPT_INPUTS_DOCSTRING = r""" @keras_serializable -class TFOPTDecoder(tf.keras.layers.Layer): +class TFOPTDecoder(keras.layers.Layer): config_class = OPTConfig def __init__(self, config: OPTConfig, **kwargs): @@ -521,20 +522,20 @@ class TFOPTDecoder(tf.keras.layers.Layer): # with checkpoints that have been fine-tuned before transformers v4.20.1 # see https://github.com/facebookresearch/metaseq/pull/164 if config.do_layer_norm_before and not config._remove_final_layer_norm: - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") else: self.final_layer_norm = None if config.word_embed_proj_dim != config.hidden_size: - self.project_out = tf.keras.layers.Dense(config.word_embed_proj_dim, name="project_out", use_bias=False) - self.project_in = tf.keras.layers.Dense(config.hidden_size, name="project_in", use_bias=False) + self.project_out = keras.layers.Dense(config.word_embed_proj_dim, name="project_out", use_bias=False) + self.project_in = keras.layers.Dense(config.hidden_size, name="project_in", use_bias=False) else: self.project_in = None self.project_out = None self.layers = [TFOPTDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)] - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -760,7 +761,7 @@ class TFOPTDecoder(tf.keras.layers.Layer): @keras_serializable -class TFOPTMainLayer(tf.keras.layers.Layer): +class TFOPTMainLayer(keras.layers.Layer): config_class = OPTConfig def __init__(self, config: OPTConfig, **kwargs): diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 4ec50905d7..a3acdc027f 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -36,6 +36,7 @@ from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFModelInputType, TFPreTrainedModel, + keras, keras_serializable, unpack_inputs, ) @@ -118,7 +119,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # Copied from transformers.models.marian.modeling_tf_marian.TFMarianSinusoidalPositionalEmbedding with Marian->Pegasus -class TFPegasusSinusoidalPositionalEmbedding(tf.keras.layers.Layer): +class TFPegasusSinusoidalPositionalEmbedding(keras.layers.Layer): """This module produces sinusoidal positional embeddings of any length.""" def __init__(self, num_positions: int, embedding_dim: int, **kwargs): @@ -177,7 +178,7 @@ class TFPegasusSinusoidalPositionalEmbedding(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Pegasus -class TFPegasusAttention(tf.keras.layers.Layer): +class TFPegasusAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -193,7 +194,7 @@ class TFPegasusAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -203,10 +204,10 @@ class TFPegasusAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -348,20 +349,20 @@ class TFPegasusAttention(tf.keras.layers.Layer): # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus -class TFPegasusEncoderLayer(tf.keras.layers.Layer): +class TFPegasusEncoderLayer(keras.layers.Layer): def __init__(self, config: PegasusConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFPegasusAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -426,7 +427,7 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer): # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus -class TFPegasusDecoderLayer(tf.keras.layers.Layer): +class TFPegasusDecoderLayer(keras.layers.Layer): def __init__(self, config: PegasusConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -437,11 +438,11 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFPegasusAttention( self.embed_dim, config.decoder_attention_heads, @@ -449,10 +450,10 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -572,7 +573,7 @@ PEGASUS_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -716,7 +717,7 @@ PEGASUS_INPUTS_DOCSTRING = r""" @keras_serializable -class TFPegasusEncoder(tf.keras.layers.Layer): +class TFPegasusEncoder(keras.layers.Layer): config_class = PegasusConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -726,10 +727,10 @@ class TFPegasusEncoder(tf.keras.layers.Layer): config: PegasusConfig """ - def __init__(self, config: PegasusConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: PegasusConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -742,7 +743,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFPegasusEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") def get_embed_tokens(self): return self.embed_tokens @@ -889,7 +890,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer): @keras_serializable -class TFPegasusDecoder(tf.keras.layers.Layer): +class TFPegasusDecoder(keras.layers.Layer): config_class = PegasusConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFPegasusDecoderLayer`] @@ -899,7 +900,7 @@ class TFPegasusDecoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: PegasusConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: PegasusConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -912,9 +913,9 @@ class TFPegasusDecoder(tf.keras.layers.Layer): ) self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TFPegasusDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -1131,17 +1132,17 @@ class TFPegasusDecoder(tf.keras.layers.Layer): @keras_serializable -class TFPegasusMainLayer(tf.keras.layers.Layer): +class TFPegasusMainLayer(keras.layers.Layer): config_class = PegasusConfig def __init__(self, config: PegasusConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1353,9 +1354,9 @@ class TFPegasusModel(TFPegasusPreTrainedModel): # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index 7ebf1bb146..e586bed87c 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFModelInputType, TFPreTrainedModel, + keras, shape_list, unpack_inputs, ) @@ -406,7 +407,7 @@ RAG_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a Tensorflow [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) + This model is also a Tensorflow [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1275,9 +1276,9 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss """CrossEntropyLoss that ignores pad tokens""" # Matt: As written, this loss is not XLA-compatible, but it's doing some very weird things # and I don't feel comfortable converting it. - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + loss_fn = keras.losses.SparseCategoricalCrossentropy( from_logits=True, - reduction=tf.keras.losses.Reduction.SUM, + reduction=keras.losses.Reduction.SUM, ) if from_logits is False: # convert to logits diff --git a/src/transformers/models/regnet/modeling_tf_regnet.py b/src/transformers/models/regnet/modeling_tf_regnet.py index 28115d718b..bca515fbf3 100644 --- a/src/transformers/models/regnet/modeling_tf_regnet.py +++ b/src/transformers/models/regnet/modeling_tf_regnet.py @@ -25,7 +25,13 @@ from ...modeling_tf_outputs import ( TFBaseModelOutputWithPoolingAndNoAttention, TFSequenceClassifierOutput, ) -from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs +from ...modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) from ...tf_utils import shape_list from ...utils import logging from .configuration_regnet import RegNetConfig @@ -50,7 +56,7 @@ TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFRegNetConvLayer(tf.keras.layers.Layer): +class TFRegNetConvLayer(keras.layers.Layer): def __init__( self, in_channels: int, @@ -64,8 +70,8 @@ class TFRegNetConvLayer(tf.keras.layers.Layer): super().__init__(**kwargs) # The padding and conv has been verified in # https://colab.research.google.com/gist/sayakpaul/854bc10eeaf21c9ee2119e0b9f3841a7/scratchpad.ipynb - self.padding = tf.keras.layers.ZeroPadding2D(padding=kernel_size // 2) - self.convolution = tf.keras.layers.Conv2D( + self.padding = keras.layers.ZeroPadding2D(padding=kernel_size // 2) + self.convolution = keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, @@ -74,7 +80,7 @@ class TFRegNetConvLayer(tf.keras.layers.Layer): use_bias=False, name="convolution", ) - self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") self.activation = ACT2FN[activation] if activation is not None else tf.identity self.in_channels = in_channels self.out_channels = out_channels @@ -97,7 +103,7 @@ class TFRegNetConvLayer(tf.keras.layers.Layer): self.normalization.build([None, None, None, self.out_channels]) -class TFRegNetEmbeddings(tf.keras.layers.Layer): +class TFRegNetEmbeddings(keras.layers.Layer): """ RegNet Embeddings (stem) composed of a single aggressive convolution. """ @@ -121,7 +127,7 @@ class TFRegNetEmbeddings(tf.keras.layers.Layer): "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -137,7 +143,7 @@ class TFRegNetEmbeddings(tf.keras.layers.Layer): self.embedder.build(None) -class TFRegNetShortCut(tf.keras.layers.Layer): +class TFRegNetShortCut(keras.layers.Layer): """ RegNet shortcut, used to project the residual features to the correct size. If needed, it is also used to downsample the input using `stride=2`. @@ -145,10 +151,10 @@ class TFRegNetShortCut(tf.keras.layers.Layer): def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs): super().__init__(**kwargs) - self.convolution = tf.keras.layers.Conv2D( + self.convolution = keras.layers.Conv2D( filters=out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution" ) - self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") self.in_channels = in_channels self.out_channels = out_channels @@ -167,17 +173,17 @@ class TFRegNetShortCut(tf.keras.layers.Layer): self.normalization.build([None, None, None, self.out_channels]) -class TFRegNetSELayer(tf.keras.layers.Layer): +class TFRegNetSELayer(keras.layers.Layer): """ Squeeze and Excitation layer (SE) proposed in [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507). """ def __init__(self, in_channels: int, reduced_channels: int, **kwargs): super().__init__(**kwargs) - self.pooler = tf.keras.layers.GlobalAveragePooling2D(keepdims=True, name="pooler") + self.pooler = keras.layers.GlobalAveragePooling2D(keepdims=True, name="pooler") self.attention = [ - tf.keras.layers.Conv2D(filters=reduced_channels, kernel_size=1, activation="relu", name="attention.0"), - tf.keras.layers.Conv2D(filters=in_channels, kernel_size=1, activation="sigmoid", name="attention.2"), + keras.layers.Conv2D(filters=reduced_channels, kernel_size=1, activation="relu", name="attention.0"), + keras.layers.Conv2D(filters=in_channels, kernel_size=1, activation="sigmoid", name="attention.2"), ] self.in_channels = in_channels self.reduced_channels = reduced_channels @@ -204,7 +210,7 @@ class TFRegNetSELayer(tf.keras.layers.Layer): self.attention[1].build([None, None, None, self.reduced_channels]) -class TFRegNetXLayer(tf.keras.layers.Layer): +class TFRegNetXLayer(keras.layers.Layer): """ RegNet's layer composed by three `3x3` convolutions, same as a ResNet bottleneck layer with reduction = 1. """ @@ -216,7 +222,7 @@ class TFRegNetXLayer(tf.keras.layers.Layer): self.shortcut = ( TFRegNetShortCut(in_channels, out_channels, stride=stride, name="shortcut") if should_apply_shortcut - else tf.keras.layers.Activation("linear", name="shortcut") + else keras.layers.Activation("linear", name="shortcut") ) # `self.layers` instead of `self.layer` because that is a reserved argument. self.layers = [ @@ -250,7 +256,7 @@ class TFRegNetXLayer(tf.keras.layers.Layer): layer.build(None) -class TFRegNetYLayer(tf.keras.layers.Layer): +class TFRegNetYLayer(keras.layers.Layer): """ RegNet's Y layer: an X layer with Squeeze and Excitation. """ @@ -262,7 +268,7 @@ class TFRegNetYLayer(tf.keras.layers.Layer): self.shortcut = ( TFRegNetShortCut(in_channels, out_channels, stride=stride, name="shortcut") if should_apply_shortcut - else tf.keras.layers.Activation("linear", name="shortcut") + else keras.layers.Activation("linear", name="shortcut") ) self.layers = [ TFRegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"), @@ -296,7 +302,7 @@ class TFRegNetYLayer(tf.keras.layers.Layer): layer.build(None) -class TFRegNetStage(tf.keras.layers.Layer): +class TFRegNetStage(keras.layers.Layer): """ A RegNet stage composed by stacked layers. """ @@ -328,7 +334,7 @@ class TFRegNetStage(tf.keras.layers.Layer): layer.build(None) -class TFRegNetEncoder(tf.keras.layers.Layer): +class TFRegNetEncoder(keras.layers.Layer): def __init__(self, config: RegNetConfig, **kwargs): super().__init__(**kwargs) self.stages = [] @@ -376,7 +382,7 @@ class TFRegNetEncoder(tf.keras.layers.Layer): @keras_serializable -class TFRegNetMainLayer(tf.keras.layers.Layer): +class TFRegNetMainLayer(keras.layers.Layer): config_class = RegNetConfig def __init__(self, config, **kwargs): @@ -384,7 +390,7 @@ class TFRegNetMainLayer(tf.keras.layers.Layer): self.config = config self.embedder = TFRegNetEmbeddings(config, name="embedder") self.encoder = TFRegNetEncoder(config, name="encoder") - self.pooler = tf.keras.layers.GlobalAveragePooling2D(keepdims=True, name="pooler") + self.pooler = keras.layers.GlobalAveragePooling2D(keepdims=True, name="pooler") @unpack_inputs def call( @@ -457,7 +463,7 @@ class TFRegNetPreTrainedModel(TFPreTrainedModel): REGNET_START_DOCSTRING = r""" This model is a Tensorflow - [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and behavior. @@ -548,8 +554,8 @@ class TFRegNetForImageClassification(TFRegNetPreTrainedModel, TFSequenceClassifi self.regnet = TFRegNetMainLayer(config, name="regnet") # classification head self.classifier = [ - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(config.num_labels, name="classifier.1") if config.num_labels > 0 else tf.identity, + keras.layers.Flatten(), + keras.layers.Dense(config.num_labels, name="classifier.1") if config.num_labels > 0 else tf.identity, ] @unpack_inputs diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 17779d1f62..58b13bc35b 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -44,6 +44,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -67,7 +68,7 @@ TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFRemBertEmbeddings(tf.keras.layers.Layer): +class TFRemBertEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: RemBertConfig, **kwargs): @@ -77,8 +78,8 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): self.input_embedding_size = config.input_embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -150,7 +151,7 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RemBert -class TFRemBertSelfAttention(tf.keras.layers.Layer): +class TFRemBertSelfAttention(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) @@ -165,16 +166,16 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -283,15 +284,15 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RemBert -class TFRemBertSelfOutput(tf.keras.layers.Layer): +class TFRemBertSelfOutput(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -314,7 +315,7 @@ class TFRemBertSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->RemBert -class TFRemBertAttention(tf.keras.layers.Layer): +class TFRemBertAttention(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) @@ -366,11 +367,11 @@ class TFRemBertAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RemBert -class TFRemBertIntermediate(tf.keras.layers.Layer): +class TFRemBertIntermediate(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -396,15 +397,15 @@ class TFRemBertIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RemBert -class TFRemBertOutput(tf.keras.layers.Layer): +class TFRemBertOutput(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -427,7 +428,7 @@ class TFRemBertOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RemBert -class TFRemBertLayer(tf.keras.layers.Layer): +class TFRemBertLayer(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) @@ -530,12 +531,12 @@ class TFRemBertLayer(tf.keras.layers.Layer): self.crossattention.build(None) -class TFRemBertEncoder(tf.keras.layers.Layer): +class TFRemBertEncoder(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.embedding_hidden_mapping_in = tf.keras.layers.Dense( + self.embedding_hidden_mapping_in = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="embedding_hidden_mapping_in", @@ -619,11 +620,11 @@ class TFRemBertEncoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RemBert -class TFRemBertPooler(tf.keras.layers.Layer): +class TFRemBertPooler(keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -648,21 +649,21 @@ class TFRemBertPooler(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFRemBertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFRemBertLMPredictionHead(keras.layers.Layer): + def __init__(self, config: RemBertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config self.initializer_range = config.initializer_range self.output_embedding_size = config.output_embedding_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.output_embedding_size, kernel_initializer=get_initializer(self.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.activation = get_tf_activation(config.hidden_act) else: self.activation = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def build(self, input_shape=None): self.decoder = self.add_weight( @@ -684,7 +685,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.LayerNorm.name): self.LayerNorm.build([None, self.config.output_embedding_size]) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self def set_output_embeddings(self, value): @@ -711,8 +712,8 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->RemBert -class TFRemBertMLMHead(tf.keras.layers.Layer): - def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFRemBertMLMHead(keras.layers.Layer): + def __init__(self, config: RemBertConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFRemBertLMPredictionHead(config, input_embeddings, name="predictions") @@ -732,7 +733,7 @@ class TFRemBertMLMHead(tf.keras.layers.Layer): @keras_serializable -class TFRemBertMainLayer(tf.keras.layers.Layer): +class TFRemBertMainLayer(keras.layers.Layer): config_class = RemBertConfig def __init__(self, config: RemBertConfig, add_pooling_layer: bool = True, **kwargs): @@ -745,7 +746,7 @@ class TFRemBertMainLayer(tf.keras.layers.Layer): self.encoder = TFRemBertEncoder(config, name="encoder") self.pooler = TFRemBertPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -949,7 +950,7 @@ REMBERT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1137,7 +1138,7 @@ class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLos self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False) self.mlm = TFRemBertMLMHead(config, input_embeddings=self.rembert.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions @unpack_inputs @@ -1219,7 +1220,7 @@ class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLos self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False) self.mlm = TFRemBertMLMHead(config, input_embeddings=self.rembert.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation @@ -1346,8 +1347,8 @@ class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceCla self.num_labels = config.num_labels self.rembert = TFRemBertMainLayer(config, name="rembert") - self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", @@ -1433,8 +1434,8 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss) super().__init__(config, *inputs, **kwargs) self.rembert = TFRemBertMainLayer(config, name="rembert") - self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob) + self.classifier = keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1543,8 +1544,8 @@ class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassific self.num_labels = config.num_labels self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1628,7 +1629,7 @@ class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnswerin self.num_labels = config.num_labels self.rembert = TFRemBertMainLayer(config, add_pooling_layer=False, name="rembert") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py index 9a34b5f385..faf5c635ba 100644 --- a/src/transformers/models/resnet/modeling_tf_resnet.py +++ b/src/transformers/models/resnet/modeling_tf_resnet.py @@ -24,7 +24,13 @@ from ...modeling_tf_outputs import ( TFBaseModelOutputWithPoolingAndNoAttention, TFImageClassifierOutputWithNoAttention, ) -from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs +from ...modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) from ...tf_utils import shape_list from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_resnet import ResNetConfig @@ -49,7 +55,7 @@ TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFResNetConvLayer(tf.keras.layers.Layer): +class TFResNetConvLayer(keras.layers.Layer): def __init__( self, in_channels: int, @@ -61,12 +67,12 @@ class TFResNetConvLayer(tf.keras.layers.Layer): ) -> None: super().__init__(**kwargs) self.pad_value = kernel_size // 2 - self.conv = tf.keras.layers.Conv2D( + self.conv = keras.layers.Conv2D( out_channels, kernel_size=kernel_size, strides=stride, padding="valid", use_bias=False, name="convolution" ) # Use same default momentum and epsilon as PyTorch equivalent - self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") - self.activation = ACT2FN[activation] if activation is not None else tf.keras.layers.Activation("linear") + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.activation = ACT2FN[activation] if activation is not None else keras.layers.Activation("linear") self.in_channels = in_channels self.out_channels = out_channels @@ -95,7 +101,7 @@ class TFResNetConvLayer(tf.keras.layers.Layer): self.normalization.build([None, None, None, self.out_channels]) -class TFResNetEmbeddings(tf.keras.layers.Layer): +class TFResNetEmbeddings(keras.layers.Layer): """ ResNet Embeddings (stem) composed of a single aggressive convolution. """ @@ -110,7 +116,7 @@ class TFResNetEmbeddings(tf.keras.layers.Layer): activation=config.hidden_act, name="embedder", ) - self.pooler = tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding="valid", name="pooler") + self.pooler = keras.layers.MaxPool2D(pool_size=3, strides=2, padding="valid", name="pooler") self.num_channels = config.num_channels def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -137,7 +143,7 @@ class TFResNetEmbeddings(tf.keras.layers.Layer): self.pooler.build(None) -class TFResNetShortCut(tf.keras.layers.Layer): +class TFResNetShortCut(keras.layers.Layer): """ ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to downsample the input using `stride=2`. @@ -145,11 +151,11 @@ class TFResNetShortCut(tf.keras.layers.Layer): def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs) -> None: super().__init__(**kwargs) - self.convolution = tf.keras.layers.Conv2D( + self.convolution = keras.layers.Conv2D( out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution" ) # Use same default momentum and epsilon as PyTorch equivalent - self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") self.in_channels = in_channels self.out_channels = out_channels @@ -171,7 +177,7 @@ class TFResNetShortCut(tf.keras.layers.Layer): self.normalization.build([None, None, None, self.out_channels]) -class TFResNetBasicLayer(tf.keras.layers.Layer): +class TFResNetBasicLayer(keras.layers.Layer): """ A classic ResNet's residual layer composed by two `3x3` convolutions. """ @@ -186,7 +192,7 @@ class TFResNetBasicLayer(tf.keras.layers.Layer): self.shortcut = ( TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut") if should_apply_shortcut - else tf.keras.layers.Activation("linear", name="shortcut") + else keras.layers.Activation("linear", name="shortcut") ) self.activation = ACT2FN[activation] @@ -214,7 +220,7 @@ class TFResNetBasicLayer(tf.keras.layers.Layer): self.shortcut.build(None) -class TFResNetBottleNeckLayer(tf.keras.layers.Layer): +class TFResNetBottleNeckLayer(keras.layers.Layer): """ A classic ResNet's bottleneck layer composed by three `3x3` convolutions. @@ -240,7 +246,7 @@ class TFResNetBottleNeckLayer(tf.keras.layers.Layer): self.shortcut = ( TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut") if should_apply_shortcut - else tf.keras.layers.Activation("linear", name="shortcut") + else keras.layers.Activation("linear", name="shortcut") ) self.activation = ACT2FN[activation] @@ -272,7 +278,7 @@ class TFResNetBottleNeckLayer(tf.keras.layers.Layer): self.shortcut.build(None) -class TFResNetStage(tf.keras.layers.Layer): +class TFResNetStage(keras.layers.Layer): """ A ResNet stage composed of stacked layers. """ @@ -306,7 +312,7 @@ class TFResNetStage(tf.keras.layers.Layer): layer.build(None) -class TFResNetEncoder(tf.keras.layers.Layer): +class TFResNetEncoder(keras.layers.Layer): def __init__(self, config: ResNetConfig, **kwargs) -> None: super().__init__(**kwargs) # based on `downsample_in_first_stage` the first layer of the first stage may or may not downsample the input @@ -375,7 +381,7 @@ class TFResNetPreTrainedModel(TFPreTrainedModel): RESNET_START_DOCSTRING = r""" This model is a TensorFlow - [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a regular TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior. @@ -401,7 +407,7 @@ RESNET_INPUTS_DOCSTRING = r""" @keras_serializable -class TFResNetMainLayer(tf.keras.layers.Layer): +class TFResNetMainLayer(keras.layers.Layer): config_class = ResNetConfig def __init__(self, config: ResNetConfig, **kwargs) -> None: @@ -409,7 +415,7 @@ class TFResNetMainLayer(tf.keras.layers.Layer): self.config = config self.embedder = TFResNetEmbeddings(config, name="embedder") self.encoder = TFResNetEncoder(config, name="encoder") - self.pooler = tf.keras.layers.GlobalAveragePooling2D(keepdims=True) + self.pooler = keras.layers.GlobalAveragePooling2D(keepdims=True) @unpack_inputs def call( @@ -530,14 +536,14 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi self.resnet = TFResNetMainLayer(config, name="resnet") # classification head self.classifier_layer = ( - tf.keras.layers.Dense(config.num_labels, name="classifier.1") + keras.layers.Dense(config.num_labels, name="classifier.1") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="classifier.1") + else keras.layers.Activation("linear", name="classifier.1") ) self.config = config def classifier(self, x: tf.Tensor) -> tf.Tensor: - x = tf.keras.layers.Flatten()(x) + x = keras.layers.Flatten()(x) logits = self.classifier_layer(x) return logits diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 6fb846c775..afe773ec97 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -46,6 +46,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -73,7 +74,7 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFRobertaEmbeddings(tf.keras.layers.Layer): +class TFRobertaEmbeddings(keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -86,8 +87,8 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -179,11 +180,11 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Roberta -class TFRobertaPooler(tf.keras.layers.Layer): +class TFRobertaPooler(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -209,7 +210,7 @@ class TFRobertaPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta -class TFRobertaSelfAttention(tf.keras.layers.Layer): +class TFRobertaSelfAttention(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) @@ -224,16 +225,16 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -342,15 +343,15 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta -class TFRobertaSelfOutput(tf.keras.layers.Layer): +class TFRobertaSelfOutput(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -373,7 +374,7 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta -class TFRobertaAttention(tf.keras.layers.Layer): +class TFRobertaAttention(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) @@ -425,11 +426,11 @@ class TFRobertaAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta -class TFRobertaIntermediate(tf.keras.layers.Layer): +class TFRobertaIntermediate(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -455,15 +456,15 @@ class TFRobertaIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta -class TFRobertaOutput(tf.keras.layers.Layer): +class TFRobertaOutput(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -486,7 +487,7 @@ class TFRobertaOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta -class TFRobertaLayer(tf.keras.layers.Layer): +class TFRobertaLayer(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) @@ -590,7 +591,7 @@ class TFRobertaLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta -class TFRobertaEncoder(tf.keras.layers.Layer): +class TFRobertaEncoder(keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -669,7 +670,7 @@ class TFRobertaEncoder(tf.keras.layers.Layer): @keras_serializable -class TFRobertaMainLayer(tf.keras.layers.Layer): +class TFRobertaMainLayer(keras.layers.Layer): config_class = RobertaConfig def __init__(self, config, add_pooling_layer=True, **kwargs): @@ -689,7 +690,7 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): self.embeddings = TFRobertaEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings @@ -895,7 +896,7 @@ ROBERTA_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1068,7 +1069,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel): self.roberta.build(None) -class TFRobertaLMHead(tf.keras.layers.Layer): +class TFRobertaLMHead(keras.layers.Layer): """Roberta Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): @@ -1076,10 +1077,10 @@ class TFRobertaLMHead(tf.keras.layers.Layer): self.config = config self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = get_tf_activation("gelu") # The output weights are the same as the input embeddings, but there is @@ -1350,12 +1351,12 @@ class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLos self.lm_head.build(None) -class TFRobertaClassificationHead(tf.keras.layers.Layer): +class TFRobertaClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -1364,8 +1365,8 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -1493,8 +1494,8 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1599,8 +1600,8 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1690,7 +1691,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py index f82f75c088..6d111deaab 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -46,6 +46,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -78,7 +79,7 @@ TF_ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->RobertaPreLayerNorm -class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer): +class TFRobertaPreLayerNormEmbeddings(keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -91,8 +92,8 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -184,11 +185,11 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RobertaPreLayerNorm -class TFRobertaPreLayerNormPooler(tf.keras.layers.Layer): +class TFRobertaPreLayerNormPooler(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -214,7 +215,7 @@ class TFRobertaPreLayerNormPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm -class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer): +class TFRobertaPreLayerNormSelfAttention(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) @@ -229,16 +230,16 @@ class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -346,14 +347,14 @@ class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer): self.value.build([None, None, self.config.hidden_size]) -class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer): +class TFRobertaPreLayerNormSelfOutput(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -372,13 +373,13 @@ class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer): +class TFRobertaPreLayerNormAttention(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self") self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output") - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads @@ -430,12 +431,12 @@ class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer): +class TFRobertaPreLayerNormIntermediate(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dense = tf.keras.layers.Dense( + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -464,14 +465,14 @@ class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer): +class TFRobertaPreLayerNormOutput(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -491,7 +492,7 @@ class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm -class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer): +class TFRobertaPreLayerNormLayer(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) @@ -595,7 +596,7 @@ class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm -class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer): +class TFRobertaPreLayerNormEncoder(keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -674,7 +675,7 @@ class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer): @keras_serializable -class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer): +class TFRobertaPreLayerNormMainLayer(keras.layers.Layer): config_class = RobertaPreLayerNormConfig def __init__(self, config, add_pooling_layer=True, **kwargs): @@ -689,12 +690,12 @@ class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer): self.output_hidden_states = config.output_hidden_states self.return_dict = config.use_return_dict self.encoder = TFRobertaPreLayerNormEncoder(config, name="encoder") - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.pooler = TFRobertaPreLayerNormPooler(config, name="pooler") if add_pooling_layer else None # The embeddings must be the last declaration in order to follow the weights order self.embeddings = TFRobertaPreLayerNormEmbeddings(config, name="embeddings") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -900,7 +901,7 @@ ROBERTA_PRELAYERNORM_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1075,7 +1076,7 @@ class TFRobertaPreLayerNormModel(TFRobertaPreLayerNormPreTrainedModel): # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm -class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): +class TFRobertaPreLayerNormLMHead(keras.layers.Layer): """RobertaPreLayerNorm Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): @@ -1083,10 +1084,10 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): self.config = config self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = get_tf_activation("gelu") # The output weights are the same as the input embeddings, but there is @@ -1371,12 +1372,12 @@ class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFC # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm -class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer): +class TFRobertaPreLayerNormClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -1385,8 +1386,8 @@ class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer): classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -1518,8 +1519,8 @@ class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedMode super().__init__(config, *inputs, **kwargs) self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1628,8 +1629,8 @@ class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTraine classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1720,7 +1721,7 @@ class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedM self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer( config, add_pooling_layer=False, name="roberta_prelayernorm" ) - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index baf0daca31..eb52a09934 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -45,6 +45,7 @@ from ...modeling_tf_utils import ( TFSequenceSummary, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -74,7 +75,7 @@ TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFRoFormerSinusoidalPositionalEmbedding(tf.keras.layers.Layer): +class TFRoFormerSinusoidalPositionalEmbedding(keras.layers.Layer): """This module produces sinusoidal positional embeddings of any length.""" def __init__(self, num_positions: int, embedding_dim: int, **kwargs): @@ -130,7 +131,7 @@ class TFRoFormerSinusoidalPositionalEmbedding(tf.keras.layers.Layer): return tf.gather(self.weight, positions) -class TFRoFormerEmbeddings(tf.keras.layers.Layer): +class TFRoFormerEmbeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: RoFormerConfig, **kwargs): @@ -139,8 +140,8 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): self.config = config self.embedding_size = config.embedding_size self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -197,7 +198,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): return final_embeddings -class TFRoFormerSelfAttention(tf.keras.layers.Layer): +class TFRoFormerSelfAttention(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) @@ -212,16 +213,16 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.rotary_value = config.rotary_value self.config = config @@ -329,15 +330,15 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RoFormer -class TFRoFormerSelfOutput(tf.keras.layers.Layer): +class TFRoFormerSelfOutput(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -359,7 +360,7 @@ class TFRoFormerSelfOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFRoFormerAttention(tf.keras.layers.Layer): +class TFRoFormerAttention(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) @@ -406,11 +407,11 @@ class TFRoFormerAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RoFormer -class TFRoFormerIntermediate(tf.keras.layers.Layer): +class TFRoFormerIntermediate(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -436,15 +437,15 @@ class TFRoFormerIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RoFormer -class TFRoFormerOutput(tf.keras.layers.Layer): +class TFRoFormerOutput(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -466,7 +467,7 @@ class TFRoFormerOutput(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.hidden_size]) -class TFRoFormerLayer(tf.keras.layers.Layer): +class TFRoFormerLayer(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) @@ -515,7 +516,7 @@ class TFRoFormerLayer(tf.keras.layers.Layer): self.roformer_output.build(None) -class TFRoFormerEncoder(tf.keras.layers.Layer): +class TFRoFormerEncoder(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) self.embed_positions = TFRoFormerSinusoidalPositionalEmbedding( @@ -582,11 +583,11 @@ class TFRoFormerEncoder(tf.keras.layers.Layer): layer.build(None) -class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer): +class TFRoFormerPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -597,7 +598,7 @@ class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -619,8 +620,8 @@ class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm.build([None, None, self.config.embedding_size]) -class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFRoFormerLMPredictionHead(keras.layers.Layer): + def __init__(self, config: RoFormerConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -642,7 +643,7 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -668,8 +669,8 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->RoFormer -class TFRoFormerMLMHead(tf.keras.layers.Layer): - def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFRoFormerMLMHead(keras.layers.Layer): + def __init__(self, config: RoFormerConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFRoFormerLMPredictionHead(config, input_embeddings, name="predictions") @@ -689,7 +690,7 @@ class TFRoFormerMLMHead(tf.keras.layers.Layer): @keras_serializable -class TFRoFormerMainLayer(tf.keras.layers.Layer): +class TFRoFormerMainLayer(keras.layers.Layer): config_class = RoFormerConfig def __init__(self, config: RoFormerConfig, add_pooling_layer: bool = True, **kwargs): @@ -699,11 +700,11 @@ class TFRoFormerMainLayer(tf.keras.layers.Layer): self.embeddings = TFRoFormerEmbeddings(config, name="embeddings") if config.embedding_size != config.hidden_size: - self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") + self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project") self.encoder = TFRoFormerEncoder(config, name="encoder") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -833,7 +834,7 @@ ROFORMER_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -986,7 +987,7 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL self.roformer = TFRoFormerMainLayer(config, name="roformer") self.mlm = TFRoFormerMLMHead(config, input_embeddings=self.roformer.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions @unpack_inputs @@ -1066,7 +1067,7 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL self.roformer = TFRoFormerMainLayer(config, name="roformer") self.mlm = TFRoFormerMLMHead(config, input_embeddings=self.roformer.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions @unpack_inputs @@ -1137,17 +1138,17 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL self.mlm.build(None) -class TFRoFormerClassificationHead(tf.keras.layers.Layer): +class TFRoFormerClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config: RoFormerConfig, *inputs, **kwargs): super().__init__(*inputs, **kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.out_proj = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) @@ -1271,7 +1272,7 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos self.roformer = TFRoFormerMainLayer(config, name="roformer") self.sequence_summary = TFSequenceSummary(config, config.initializer_range, name="sequence_summary") - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1379,8 +1380,8 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif self.num_labels = config.num_labels self.roformer = TFRoFormerMainLayer(config, name="roformer") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1462,7 +1463,7 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer self.num_labels = config.num_labels self.roformer = TFRoFormerMainLayer(config, name="roformer") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/sam/modeling_tf_sam.py b/src/transformers/models/sam/modeling_tf_sam.py index 7e79da1cb9..db7b9d32cd 100644 --- a/src/transformers/models/sam/modeling_tf_sam.py +++ b/src/transformers/models/sam/modeling_tf_sam.py @@ -29,7 +29,7 @@ import tensorflow as tf from ...activations_tf import ACT2FN from ...modeling_tf_outputs import TFBaseModelOutput -from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, shape_list, unpack_inputs +from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, keras, shape_list, unpack_inputs from ...tf_utils import flatten, functional_layernorm from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_sam import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig @@ -114,7 +114,7 @@ class TFSamImageSegmentationOutput(ModelOutput): mask_decoder_attentions: Tuple[tf.Tensor, ...] | None = None -class TFSamPatchEmbeddings(tf.keras.layers.Layer): +class TFSamPatchEmbeddings(keras.layers.Layer): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -133,7 +133,7 @@ class TFSamPatchEmbeddings(tf.keras.layers.Layer): self.num_channels = num_channels self.num_patches = num_patches - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( hidden_size, kernel_size=patch_size, strides=patch_size, name="projection" ) @@ -159,11 +159,11 @@ class TFSamPatchEmbeddings(tf.keras.layers.Layer): self.projection.build([None, None, None, self.num_channels]) -class TFSamMLPBlock(tf.keras.layers.Layer): +class TFSamMLPBlock(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.lin1 = tf.keras.layers.Dense(config.mlp_dim, name="lin1") - self.lin2 = tf.keras.layers.Dense(config.hidden_size, name="lin2") + self.lin1 = keras.layers.Dense(config.mlp_dim, name="lin1") + self.lin2 = keras.layers.Dense(config.hidden_size, name="lin2") self.act = ACT2FN[config.hidden_act] self.config = config @@ -185,7 +185,7 @@ class TFSamMLPBlock(tf.keras.layers.Layer): self.lin2.build([None, None, self.config.mlp_dim]) -class TFSamLayerNorm(tf.keras.layers.Layer): +class TFSamLayerNorm(keras.layers.Layer): r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). @@ -212,7 +212,7 @@ class TFSamLayerNorm(tf.keras.layers.Layer): return x -class TFSamAttention(tf.keras.layers.Layer): +class TFSamAttention(keras.layers.Layer): """ SAM's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and values. @@ -229,10 +229,10 @@ class TFSamAttention(tf.keras.layers.Layer): if self.internal_dim % config.num_attention_heads != 0: raise ValueError("num_attention_heads must divide hidden_size.") - self.q_proj = tf.keras.layers.Dense(self.internal_dim, name="q_proj") - self.k_proj = tf.keras.layers.Dense(self.internal_dim, name="k_proj") - self.v_proj = tf.keras.layers.Dense(self.internal_dim, name="v_proj") - self.out_proj = tf.keras.layers.Dense(self.hidden_size, name="out_proj") + self.q_proj = keras.layers.Dense(self.internal_dim, name="q_proj") + self.k_proj = keras.layers.Dense(self.internal_dim, name="k_proj") + self.v_proj = keras.layers.Dense(self.internal_dim, name="v_proj") + self.out_proj = keras.layers.Dense(self.hidden_size, name="out_proj") def _separate_heads(self, hidden_states: tf.Tensor, num_attention_heads: int) -> tf.Tensor: batch, point_batch_size, n_tokens, channel = shape_list(hidden_states) @@ -295,7 +295,7 @@ class TFSamAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.internal_dim]) -class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer): +class TFSamTwoWayAttentionBlock(keras.layers.Layer): def __init__(self, config, attention_downsample_rate: int = 2, skip_first_layer_pe: bool = False, **kwargs): """ A transformer block with four layers: @@ -316,17 +316,17 @@ class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer): self.layer_norm_eps = config.layer_norm_eps self.self_attn = TFSamAttention(config, downsample_rate=1, name="self_attn") - self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm1") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm1") self.cross_attn_token_to_image = TFSamAttention( config, downsample_rate=attention_downsample_rate, name="cross_attn_token_to_image" ) - self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm2") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm2") self.mlp = TFSamMLPBlock(config, name="mlp") - self.layer_norm3 = tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm3") + self.layer_norm3 = keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm3") - self.layer_norm4 = tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm4") + self.layer_norm4 = keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="layer_norm4") self.cross_attn_image_to_token = TFSamAttention( config, downsample_rate=attention_downsample_rate, name="cross_attn_image_to_token" ) @@ -412,7 +412,7 @@ class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer): self.cross_attn_image_to_token.build(None) -class TFSamTwoWayTransformer(tf.keras.layers.Layer): +class TFSamTwoWayTransformer(keras.layers.Layer): def __init__(self, config: SamMaskDecoderConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -424,7 +424,7 @@ class TFSamTwoWayTransformer(tf.keras.layers.Layer): self.layers.append(TFSamTwoWayAttentionBlock(config, skip_first_layer_pe=(i == 0), name=f"layers_._{i}")) self.final_attn_token_to_image = TFSamAttention(config, name="final_attn_token_to_image") - self.layer_norm_final_attn = tf.keras.layers.LayerNormalization( + self.layer_norm_final_attn = keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layer_norm_final_attn" ) @@ -493,17 +493,17 @@ class TFSamTwoWayTransformer(tf.keras.layers.Layer): layer.build(None) -class TFSamFeedForward(tf.keras.layers.Layer): +class TFSamFeedForward(keras.layers.Layer): def __init__( self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, sigmoid_output: bool = False, **kwargs ): super().__init__(**kwargs) self.num_layers = num_layers - self.activation = tf.keras.layers.ReLU() - self.proj_in = tf.keras.layers.Dense(hidden_dim, input_shape=(input_dim,), name="proj_in") - self.proj_out = tf.keras.layers.Dense(output_dim, input_shape=(hidden_dim,), name="proj_out") + self.activation = keras.layers.ReLU() + self.proj_in = keras.layers.Dense(hidden_dim, input_shape=(input_dim,), name="proj_in") + self.proj_out = keras.layers.Dense(output_dim, input_shape=(hidden_dim,), name="proj_out") self.layers = [ - tf.keras.layers.Dense(hidden_dim, input_shape=(hidden_dim,), name=f"layers_._{i}") + keras.layers.Dense(hidden_dim, input_shape=(hidden_dim,), name=f"layers_._{i}") for i in range(num_layers - 2) ] self.sigmoid_output = sigmoid_output @@ -537,7 +537,7 @@ class TFSamFeedForward(tf.keras.layers.Layer): layer.build([None, None, self.hidden_dim]) -class TFSamMaskDecoder(tf.keras.layers.Layer): +class TFSamMaskDecoder(keras.layers.Layer): def __init__(self, config: SamMaskDecoderConfig, **kwargs): super().__init__(**kwargs) @@ -548,10 +548,10 @@ class TFSamMaskDecoder(tf.keras.layers.Layer): self.transformer = TFSamTwoWayTransformer(config, name="transformer") - self.upscale_conv1 = tf.keras.layers.Conv2DTranspose( + self.upscale_conv1 = keras.layers.Conv2DTranspose( self.hidden_size // 4, kernel_size=2, strides=2, name="upscale_conv1", data_format="channels_first" ) - self.upscale_conv2 = tf.keras.layers.Conv2DTranspose( + self.upscale_conv2 = keras.layers.Conv2DTranspose( self.hidden_size // 8, kernel_size=2, strides=2, name="upscale_conv2", data_format="channels_first" ) self.upscale_layer_norm = TFSamLayerNorm( @@ -685,7 +685,7 @@ class TFSamMaskDecoder(tf.keras.layers.Layer): return outputs -class TFSamPositionalEmbedding(tf.keras.layers.Layer): +class TFSamPositionalEmbedding(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.scale = config.hidden_size // 2 @@ -696,7 +696,7 @@ class TFSamPositionalEmbedding(tf.keras.layers.Layer): self.positional_embedding = self.add_weight( name="positional_embedding", shape=(2, self.config.num_pos_feats), - initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=self.scale), + initializer=keras.initializers.RandomNormal(mean=0.0, stddev=self.scale), trainable=False, ) super().build(input_shape) @@ -723,14 +723,14 @@ class TFSamPositionalEmbedding(tf.keras.layers.Layer): return tf.concat([tf.sin(coordinates), tf.cos(coordinates)], axis=-1) -class TFSamMaskEmbedding(tf.keras.layers.Layer): +class TFSamMaskEmbedding(keras.layers.Layer): def __init__(self, config: SamPromptEncoderConfig, **kwargs): super().__init__(**kwargs) self.mask_input_channels = config.mask_input_channels // 4 self.activation = ACT2FN[config.hidden_act] - self.conv1 = tf.keras.layers.Conv2D(self.mask_input_channels, kernel_size=2, strides=2, name="conv1") - self.conv2 = tf.keras.layers.Conv2D(config.mask_input_channels, kernel_size=2, strides=2, name="conv2") - self.conv3 = tf.keras.layers.Conv2D(config.hidden_size, kernel_size=1, name="conv3") + self.conv1 = keras.layers.Conv2D(self.mask_input_channels, kernel_size=2, strides=2, name="conv1") + self.conv2 = keras.layers.Conv2D(config.mask_input_channels, kernel_size=2, strides=2, name="conv2") + self.conv3 = keras.layers.Conv2D(config.hidden_size, kernel_size=1, name="conv3") self.layer_norm1 = TFSamLayerNorm(self.mask_input_channels, config.layer_norm_eps, name="layer_norm1") self.layer_norm2 = TFSamLayerNorm(self.mask_input_channels * 4, config.layer_norm_eps, name="layer_norm2") self.config = config @@ -765,7 +765,7 @@ class TFSamMaskEmbedding(tf.keras.layers.Layer): self.layer_norm2.build([None, None, None, self.mask_input_channels * 4]) -class TFSamPromptEncoder(tf.keras.layers.Layer): +class TFSamPromptEncoder(keras.layers.Layer): def __init__(self, config: SamPromptEncoderConfig, shared_patch_embedding, **kwargs): super().__init__(**kwargs) self.shared_embedding = shared_patch_embedding @@ -784,14 +784,14 @@ class TFSamPromptEncoder(tf.keras.layers.Layer): self.no_mask_embed = self.add_weight( name="no_mask_embed.weight", shape=(1, self.hidden_size), - initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.02), + initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), trainable=True, ) self.point_embed = [ self.add_weight( name=f"point_embed_._{i}.weight", shape=(1, self.hidden_size), - initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.02), + initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), trainable=True, ) for i in range(self.config.num_point_embeddings) @@ -799,7 +799,7 @@ class TFSamPromptEncoder(tf.keras.layers.Layer): self.not_a_point_embed = self.add_weight( name="not_a_point_embed.weight", shape=(1, self.hidden_size), - initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.02), + initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), trainable=True, ) with tf.name_scope("mask_embed"): @@ -907,7 +907,7 @@ class TFSamPromptEncoder(tf.keras.layers.Layer): return sparse_embeddings, dense_embeddings -class TFSamVisionAttention(tf.keras.layers.Layer): +class TFSamVisionAttention(keras.layers.Layer): """Multi-head Attention block with relative position embeddings.""" def __init__(self, config, window_size, **kwargs): @@ -925,8 +925,8 @@ class TFSamVisionAttention(tf.keras.layers.Layer): self.scale = head_dim**-0.5 self.dropout = config.attention_dropout - self.qkv = tf.keras.layers.Dense(config.hidden_size * 3, use_bias=config.qkv_bias, name="qkv") - self.proj = tf.keras.layers.Dense(config.hidden_size, name="proj") + self.qkv = keras.layers.Dense(config.hidden_size * 3, use_bias=config.qkv_bias, name="qkv") + self.proj = keras.layers.Dense(config.hidden_size, name="proj") self.use_rel_pos = config.use_rel_pos if self.use_rel_pos: @@ -1072,12 +1072,12 @@ class TFSamVisionAttention(tf.keras.layers.Layer): return outputs -class TFSamVisionLayer(tf.keras.layers.Layer): +class TFSamVisionLayer(keras.layers.Layer): def __init__(self, config, window_size, **kwargs): super().__init__(**kwargs) - self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") self.attn = TFSamVisionAttention(config, window_size, name="attn") - self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") self.mlp = TFSamMLPBlock(config, name="mlp") self.window_size = window_size self.config = config @@ -1166,19 +1166,19 @@ class TFSamVisionLayer(tf.keras.layers.Layer): self.mlp.build(None) -class TFSamVisionNeck(tf.keras.layers.Layer): +class TFSamVisionNeck(keras.layers.Layer): def __init__(self, config: SamVisionConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.conv1 = tf.keras.layers.Conv2D( + self.conv1 = keras.layers.Conv2D( config.output_channels, kernel_size=1, use_bias=False, name="conv1", ) self.layer_norm1 = TFSamLayerNorm(config.output_channels, name="layer_norm1") - self.conv2 = tf.keras.layers.Conv2D( + self.conv2 = keras.layers.Conv2D( config.output_channels, kernel_size=3, padding="same", @@ -1214,7 +1214,7 @@ class TFSamVisionNeck(tf.keras.layers.Layer): self.layer_norm2.build(None) -class TFSamVisionEncoder(tf.keras.layers.Layer): +class TFSamVisionEncoder(keras.layers.Layer): def __init__(self, config: SamVisionConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -1332,7 +1332,7 @@ SAM_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a TensorFlow [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) + This model is also a TensorFlow [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TensorFlow Model and refer to the TensorFlow documentation for all matter related to general usage and behavior. diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 3f0d0bf8ff..75c8ee2b39 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -30,7 +30,13 @@ from ...file_utils import ( replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput -from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs +from ...modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_segformer import SegformerConfig @@ -56,7 +62,7 @@ TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer -class TFSegformerDropPath(tf.keras.layers.Layer): +class TFSegformerDropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). References: (1) github.com:rwightman/pytorch-image-models @@ -76,17 +82,17 @@ class TFSegformerDropPath(tf.keras.layers.Layer): return x -class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer): +class TFSegformerOverlapPatchEmbeddings(keras.layers.Layer): """Construct the overlapping patch embeddings.""" def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs): super().__init__(**kwargs) - self.padding = tf.keras.layers.ZeroPadding2D(padding=patch_size // 2) - self.proj = tf.keras.layers.Conv2D( + self.padding = keras.layers.ZeroPadding2D(padding=patch_size // 2) + self.proj = keras.layers.Conv2D( filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") self.num_channels = num_channels self.hidden_size = hidden_size @@ -113,7 +119,7 @@ class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer): self.layer_norm.build([None, None, self.hidden_size]) -class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): +class TFSegformerEfficientSelfAttention(keras.layers.Layer): """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT paper](https://arxiv.org/abs/2102.12122).""" @@ -139,18 +145,18 @@ class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense(self.all_head_size, name="query") - self.key = tf.keras.layers.Dense(self.all_head_size, name="key") - self.value = tf.keras.layers.Dense(self.all_head_size, name="value") + self.query = keras.layers.Dense(self.all_head_size, name="query") + self.key = keras.layers.Dense(self.all_head_size, name="key") + self.value = keras.layers.Dense(self.all_head_size, name="value") - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) self.sr_ratio = sequence_reduction_ratio if sequence_reduction_ratio > 1: - self.sr = tf.keras.layers.Conv2D( + self.sr = keras.layers.Conv2D( filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] @@ -230,11 +236,11 @@ class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): self.layer_norm.build([None, None, self.hidden_size]) -class TFSegformerSelfOutput(tf.keras.layers.Layer): +class TFSegformerSelfOutput(keras.layers.Layer): def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(hidden_size, name="dense") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dense = keras.layers.Dense(hidden_size, name="dense") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -251,7 +257,7 @@ class TFSegformerSelfOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.hidden_size]) -class TFSegformerAttention(tf.keras.layers.Layer): +class TFSegformerAttention(keras.layers.Layer): def __init__( self, config: SegformerConfig, @@ -291,10 +297,10 @@ class TFSegformerAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFSegformerDWConv(tf.keras.layers.Layer): +class TFSegformerDWConv(keras.layers.Layer): def __init__(self, dim: int = 768, **kwargs): super().__init__(**kwargs) - self.depthwise_convolution = tf.keras.layers.Conv2D( + self.depthwise_convolution = keras.layers.Conv2D( filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" ) self.dim = dim @@ -320,7 +326,7 @@ class TFSegformerDWConv(tf.keras.layers.Layer): self.depthwise_convolution.build([None, None, None, self.dim]) -class TFSegformerMixFFN(tf.keras.layers.Layer): +class TFSegformerMixFFN(keras.layers.Layer): def __init__( self, config: SegformerConfig, @@ -331,14 +337,14 @@ class TFSegformerMixFFN(tf.keras.layers.Layer): ): super().__init__(**kwargs) out_features = out_features or in_features - self.dense1 = tf.keras.layers.Dense(hidden_features, name="dense1") + self.dense1 = keras.layers.Dense(hidden_features, name="dense1") self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv") if isinstance(config.hidden_act, str): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.dense2 = tf.keras.layers.Dense(out_features, name="dense2") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dense2 = keras.layers.Dense(out_features, name="dense2") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.hidden_features = hidden_features self.in_features = in_features @@ -366,7 +372,7 @@ class TFSegformerMixFFN(tf.keras.layers.Layer): self.dense2.build([None, None, self.hidden_features]) -class TFSegformerLayer(tf.keras.layers.Layer): +class TFSegformerLayer(keras.layers.Layer): """This corresponds to the Block class in the original implementation.""" def __init__( @@ -380,7 +386,7 @@ class TFSegformerLayer(tf.keras.layers.Layer): **kwargs, ): super().__init__(**kwargs) - self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") + self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") self.attention = TFSegformerAttention( config, hidden_size=hidden_size, @@ -388,8 +394,8 @@ class TFSegformerLayer(tf.keras.layers.Layer): sequence_reduction_ratio=sequence_reduction_ratio, name="attention", ) - self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear") - self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") + self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else keras.layers.Activation("linear") + self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") mlp_hidden_size = int(hidden_size * mlp_ratio) self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") self.hidden_size = hidden_size @@ -444,7 +450,7 @@ class TFSegformerLayer(tf.keras.layers.Layer): self.mlp.build(None) -class TFSegformerEncoder(tf.keras.layers.Layer): +class TFSegformerEncoder(keras.layers.Layer): def __init__(self, config: SegformerConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -492,7 +498,7 @@ class TFSegformerEncoder(tf.keras.layers.Layer): # Layer norms self.layer_norms = [ - tf.keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}") + keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}") for i in range(config.num_encoder_blocks) ] @@ -566,7 +572,7 @@ class TFSegformerEncoder(tf.keras.layers.Layer): @keras_serializable -class TFSegformerMainLayer(tf.keras.layers.Layer): +class TFSegformerMainLayer(keras.layers.Layer): config_class = SegformerConfig def __init__(self, config: SegformerConfig, **kwargs): @@ -591,7 +597,7 @@ class TFSegformerMainLayer(tf.keras.layers.Layer): ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -653,7 +659,7 @@ SEGFORMER_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -752,7 +758,7 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl self.segformer = TFSegformerMainLayer(config, name="segformer") # Classifier head - self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + self.classifier = keras.layers.Dense(config.num_labels, name="classifier") self.config = config @unpack_inputs @@ -812,14 +818,14 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl self.classifier.build([None, None, self.config.hidden_sizes[-1]]) -class TFSegformerMLP(tf.keras.layers.Layer): +class TFSegformerMLP(keras.layers.Layer): """ Linear Embedding. """ def __init__(self, input_dim: int, config: SegformerConfig, **kwargs): super().__init__(**kwargs) - self.proj = tf.keras.layers.Dense(config.decoder_hidden_size, name="proj") + self.proj = keras.layers.Dense(config.decoder_hidden_size, name="proj") self.input_dim = input_dim def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -850,14 +856,14 @@ class TFSegformerDecodeHead(TFSegformerPreTrainedModel): self.mlps = mlps # the following 3 layers implement the ConvModule of the original implementation - self.linear_fuse = tf.keras.layers.Conv2D( + self.linear_fuse = keras.layers.Conv2D( filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse" ) - self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm") - self.activation = tf.keras.layers.Activation("relu") + self.batch_norm = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm") + self.activation = keras.layers.Activation("relu") - self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) - self.classifier = tf.keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier") + self.dropout = keras.layers.Dropout(config.classifier_dropout_prob) + self.classifier = keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier") self.config = config @@ -931,7 +937,7 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel): upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear") # compute weighted loss - loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") + loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") def masked_loss(real, pred): unmasked_loss = loss_fct(real, pred) diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index d9a86c2dda..927d8e09ba 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -35,6 +35,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, TFSharedEmbeddings, + keras, keras_serializable, unpack_inputs, ) @@ -121,7 +122,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFConv1dSubsampler(tf.keras.layers.Layer): +class TFConv1dSubsampler(keras.layers.Layer): """ Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation via gated linear units (https://arxiv.org/abs/1911.08460) @@ -137,7 +138,7 @@ class TFConv1dSubsampler(tf.keras.layers.Layer): self.kernel_sizes = config.conv_kernel_sizes self.conv_layers = [ - tf.keras.layers.Conv1D( + keras.layers.Conv1D( filters=self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2, kernel_size=k, strides=2, @@ -176,7 +177,7 @@ class TFConv1dSubsampler(tf.keras.layers.Layer): layer.build([None, None, self.in_channels] if i == 0 else [None, None, self.mid_channels // 2]) -class TFSpeech2TextSinusoidalPositionalEmbedding(tf.keras.layers.Layer): +class TFSpeech2TextSinusoidalPositionalEmbedding(keras.layers.Layer): """This module produces sinusoidal positional embeddings of any length.""" def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None, **kwargs): @@ -236,7 +237,7 @@ class TFSpeech2TextSinusoidalPositionalEmbedding(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Speech2Text -class TFSpeech2TextAttention(tf.keras.layers.Layer): +class TFSpeech2TextAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -252,7 +253,7 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -262,10 +263,10 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -406,20 +407,20 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer): +class TFSpeech2TextEncoderLayer(keras.layers.Layer): def __init__(self, config: Speech2TextConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFSpeech2TextAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -482,7 +483,7 @@ class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.embed_dim]) -class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer): +class TFSpeech2TextDecoderLayer(keras.layers.Layer): def __init__(self, config: Speech2TextConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -494,11 +495,11 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFSpeech2TextAttention( self.embed_dim, config.decoder_attention_heads, @@ -506,10 +507,10 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -655,7 +656,7 @@ SPEECH_TO_TEXT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -777,7 +778,7 @@ SPEECH_TO_TEXT_INPUTS_DOCSTRING = r""" @keras_serializable -class TFSpeech2TextEncoder(tf.keras.layers.Layer): +class TFSpeech2TextEncoder(keras.layers.Layer): config_class = Speech2TextConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -791,7 +792,7 @@ class TFSpeech2TextEncoder(tf.keras.layers.Layer): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop embed_dim = config.d_model @@ -808,7 +809,7 @@ class TFSpeech2TextEncoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TFSpeech2TextEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor): """ @@ -964,7 +965,7 @@ class TFSpeech2TextEncoder(tf.keras.layers.Layer): @keras_serializable -class TFSpeech2TextDecoder(tf.keras.layers.Layer): +class TFSpeech2TextDecoder(keras.layers.Layer): config_class = Speech2TextConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFSpeech2TextDecoderLayer`] @@ -991,9 +992,9 @@ class TFSpeech2TextDecoder(tf.keras.layers.Layer): ) self.layers = [TFSpeech2TextDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -1204,7 +1205,7 @@ class TFSpeech2TextDecoder(tf.keras.layers.Layer): @keras_serializable -class TFSpeech2TextMainLayer(tf.keras.layers.Layer): +class TFSpeech2TextMainLayer(keras.layers.Layer): config_class = Speech2TextConfig def __init__(self, config: Speech2TextConfig, **kwargs): @@ -1417,7 +1418,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus def __init__(self, config: Speech2TextConfig): super().__init__(config) self.model = TFSpeech2TextMainLayer(config, name="model") - self.lm_head = tf.keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head") + self.lm_head = keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head") # TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate self.supports_xla_generation = False self.config = config diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index f26da27790..6632759f68 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -267,7 +268,7 @@ def drop_path( return input * random_tensor -class TFSwinEmbeddings(tf.keras.layers.Layer): +class TFSwinEmbeddings(keras.layers.Layer): """ Construct the patch and position embeddings. Optionally, also the mask token. """ @@ -281,8 +282,8 @@ class TFSwinEmbeddings(tf.keras.layers.Layer): self.use_mask_token = use_mask_token self.use_absolute_embeddings = config.use_absolute_embeddings - self.norm = tf.keras.layers.LayerNormalization(name="norm", epsilon=1e-5) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.norm = keras.layers.LayerNormalization(name="norm", epsilon=1e-5) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") self.config = config def build(self, input_shape: tf.TensorShape) -> None: @@ -335,7 +336,7 @@ class TFSwinEmbeddings(tf.keras.layers.Layer): return embeddings, output_dimensions -class TFSwinPatchEmbeddings(tf.keras.layers.Layer): +class TFSwinPatchEmbeddings(keras.layers.Layer): """ Image to Patch Embedding. """ @@ -353,7 +354,7 @@ class TFSwinPatchEmbeddings(tf.keras.layers.Layer): self.num_patches = num_patches self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( filters=hidden_size, kernel_size=self.patch_size, strides=self.patch_size, @@ -403,7 +404,7 @@ class TFSwinPatchEmbeddings(tf.keras.layers.Layer): self.projection.build([None, None, None, self.num_channels]) -class TFSwinPatchMerging(tf.keras.layers.Layer): +class TFSwinPatchMerging(keras.layers.Layer): """ Patch Merging Layer. @@ -412,7 +413,7 @@ class TFSwinPatchMerging(tf.keras.layers.Layer): Resolution of input feature. dim (`int`): Number of input channels. - norm_layer (`tf.keras.layer.Layer`, *optional*, defaults to `tf.keras.layers.LayerNormalization`): + norm_layer (`keras.layer.Layer`, *optional*, defaults to `keras.layers.LayerNormalization`): Normalization layer class. """ @@ -422,10 +423,10 @@ class TFSwinPatchMerging(tf.keras.layers.Layer): super().__init__(**kwargs) self.input_resolution = input_resolution self.dim = dim - self.reduction = tf.keras.layers.Dense(2 * dim, use_bias=False, name="reduction") + self.reduction = keras.layers.Dense(2 * dim, use_bias=False, name="reduction") if norm_layer is None: # Use same default epsilon as PyTorch - self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="norm") + self.norm = keras.layers.LayerNormalization(epsilon=1e-5, name="norm") else: self.norm = norm_layer(name="norm") @@ -476,7 +477,7 @@ class TFSwinPatchMerging(tf.keras.layers.Layer): self.norm.build([None, None, 4 * self.dim]) -class TFSwinDropPath(tf.keras.layers.Layer): +class TFSwinDropPath(keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" def __init__(self, drop_prob: float = None, scale_by_keep: bool = True, **kwargs) -> None: @@ -488,7 +489,7 @@ class TFSwinDropPath(tf.keras.layers.Layer): return drop_path(input, self.drop_prob, training, self.scale_by_keep) -class TFSwinSelfAttention(tf.keras.layers.Layer): +class TFSwinSelfAttention(keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None: super().__init__(**kwargs) if dim % num_heads != 0: @@ -504,26 +505,26 @@ class TFSwinSelfAttention(tf.keras.layers.Layer): window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) ) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), use_bias=config.qkv_bias, name="query", ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), use_bias=config.qkv_bias, name="key", ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), use_bias=config.qkv_bias, name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) def build(self, input_shape: tf.TensorShape) -> None: self.relative_position_bias_table = self.add_weight( @@ -636,11 +637,11 @@ class TFSwinSelfAttention(tf.keras.layers.Layer): return outputs -class TFSwinSelfOutput(tf.keras.layers.Layer): +class TFSwinSelfOutput(keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(dim, name="dense") - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout") + self.dense = keras.layers.Dense(dim, name="dense") + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout") self.dim = dim def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -660,7 +661,7 @@ class TFSwinSelfOutput(tf.keras.layers.Layer): self.dropout.build(None) -class TFSwinAttention(tf.keras.layers.Layer): +class TFSwinAttention(keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None: super().__init__(**kwargs) self.self = TFSwinSelfAttention(config, dim, num_heads, name="self") @@ -699,10 +700,10 @@ class TFSwinAttention(tf.keras.layers.Layer): self.self_output.build(None) -class TFSwinIntermediate(tf.keras.layers.Layer): +class TFSwinIntermediate(keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(int(config.mlp_ratio * dim), name="dense") + self.dense = keras.layers.Dense(int(config.mlp_ratio * dim), name="dense") if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: @@ -723,11 +724,11 @@ class TFSwinIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.dim]) -class TFSwinOutput(tf.keras.layers.Layer): +class TFSwinOutput(keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense(dim, name="dense") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, "dropout") + self.dense = keras.layers.Dense(dim, name="dense") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, "dropout") self.config = config self.dim = dim @@ -745,7 +746,7 @@ class TFSwinOutput(tf.keras.layers.Layer): self.dense.build([None, None, int(self.config.mlp_ratio * self.dim)]) -class TFSwinLayer(tf.keras.layers.Layer): +class TFSwinLayer(keras.layers.Layer): def __init__( self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs ) -> None: @@ -756,18 +757,14 @@ class TFSwinLayer(tf.keras.layers.Layer): self.shift_size = 0 if min_res <= self.window_size else shift_size self.input_resolution = input_resolution - self.layernorm_before = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_before" - ) + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") self.attention = TFSwinAttention(config, dim, num_heads, name="attention") self.drop_path = ( TFSwinDropPath(config.drop_path_rate, name="drop_path") if config.drop_path_rate > 0.0 - else tf.keras.layers.Activation("linear", name="drop_path") - ) - self.layernorm_after = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_after" + else keras.layers.Activation("linear", name="drop_path") ) + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") self.intermediate = TFSwinIntermediate(config, dim, name="intermediate") self.swin_output = TFSwinOutput(config, dim, name="output") self.dim = dim @@ -900,7 +897,7 @@ class TFSwinLayer(tf.keras.layers.Layer): self.swin_output.build(None) -class TFSwinStage(tf.keras.layers.Layer): +class TFSwinStage(keras.layers.Layer): def __init__( self, config: SwinConfig, @@ -932,7 +929,7 @@ class TFSwinStage(tf.keras.layers.Layer): self.downsample = downsample( input_resolution, dim=dim, - norm_layer=partial(tf.keras.layers.LayerNormalization, epsilon=1e-5), + norm_layer=partial(keras.layers.LayerNormalization, epsilon=1e-5), name="downsample", ) else: @@ -984,7 +981,7 @@ class TFSwinStage(tf.keras.layers.Layer): layer.build(None) -class TFSwinEncoder(tf.keras.layers.Layer): +class TFSwinEncoder(keras.layers.Layer): def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs): super().__init__(**kwargs) self.num_layers = len(config.depths) @@ -1086,7 +1083,7 @@ class TFSwinPreTrainedModel(TFPreTrainedModel): SWIN_START_DOCSTRING = r""" This model is a Tensorflow - [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and behavior. @@ -1124,7 +1121,7 @@ def normalize_data_format(value: str) -> str: https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71 """ if value is None: - value = tf.keras.backend.image_data_format() + value = keras.backend.image_data_format() data_format = value.lower() if data_format not in {"channels_first", "channels_last"}: raise ValueError( @@ -1133,7 +1130,7 @@ def normalize_data_format(value: str) -> str: return data_format -class AdaptiveAveragePooling1D(tf.keras.layers.Layer): +class AdaptiveAveragePooling1D(keras.layers.Layer): """ Args: Average 1D Pooling with adaptive kernel size. @@ -1197,7 +1194,7 @@ class AdaptiveAveragePooling1D(tf.keras.layers.Layer): @keras_serializable -class TFSwinMainLayer(tf.keras.layers.Layer): +class TFSwinMainLayer(keras.layers.Layer): config_class = SwinConfig def __init__( @@ -1211,7 +1208,7 @@ class TFSwinMainLayer(tf.keras.layers.Layer): self.embeddings = TFSwinEmbeddings(config, use_mask_token=use_mask_token, name="embeddings") self.encoder = TFSwinEncoder(config, self.embeddings.patch_grid, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.pooler = AdaptiveAveragePooling1D(output_size=(1,)) if add_pooling_layer else None def get_input_embeddings(self) -> TFSwinPatchEmbeddings: @@ -1371,7 +1368,7 @@ class TFSwinModel(TFSwinPreTrainedModel): self.swin.build(None) -class TFSwinPixelShuffle(tf.keras.layers.Layer): +class TFSwinPixelShuffle(keras.layers.Layer): """TF layer implementation of torch.nn.PixelShuffle""" def __init__(self, upscale_factor: int, **kwargs) -> None: @@ -1397,10 +1394,10 @@ class TFSwinPixelShuffle(tf.keras.layers.Layer): return hidden_states -class TFSwinDecoder(tf.keras.layers.Layer): +class TFSwinDecoder(keras.layers.Layer): def __init__(self, config: SwinConfig, **kwargs): super().__init__(**kwargs) - self.conv2d = tf.keras.layers.Conv2D( + self.conv2d = keras.layers.Conv2D( filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, strides=1, name="0" ) self.pixel_shuffle = TFSwinPixelShuffle(config.encoder_stride, name="1") @@ -1514,7 +1511,7 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel): mask = tf.expand_dims(mask, 1) mask = tf.cast(mask, tf.float32) - reconstruction_loss = tf.keras.losses.mean_absolute_error( + reconstruction_loss = keras.losses.mean_absolute_error( # Swap axes as metric calculation reduces over the final dimension tf.transpose(pixel_values, (1, 2, 3, 0)), tf.transpose(reconstructed_pixel_values, (1, 2, 3, 0)), @@ -1565,9 +1562,9 @@ class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificati # Classifier head self.classifier = ( - tf.keras.layers.Dense(config.num_labels, name="classifier") + keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 - else tf.keras.layers.Activation("linear", name="classifier") + else keras.layers.Activation("linear", name="classifier") ) @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index b6a1c16238..c0a05a8a39 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -40,6 +40,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -68,12 +69,12 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ #################################################### # TF 2.0 Models are constructed using Keras imperative API by sub-classing -# - tf.keras.layers.Layer for the layers and -# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) +# - keras.layers.Layer for the layers and +# - TFPreTrainedModel for the models (it-self a sub-class of keras.Model) #################################################### -class TFT5LayerNorm(tf.keras.layers.Layer): +class TFT5LayerNorm(keras.layers.Layer): def __init__(self, hidden_size, epsilon=1e-6, **kwargs): """ Construct a layernorm module in the T5 style No bias and no subtraction of mean. @@ -93,22 +94,22 @@ class TFT5LayerNorm(tf.keras.layers.Layer): return self.weight * hidden_states -class TFT5DenseActDense(tf.keras.layers.Layer): +class TFT5DenseActDense(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - wi_initializer = tf.keras.initializers.RandomNormal( + wi_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (config.d_model**-0.5) ) - wo_initializer = tf.keras.initializers.RandomNormal( + wo_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (config.d_ff**-0.5) ) - self.wi = tf.keras.layers.Dense( + self.wi = keras.layers.Dense( config.d_ff, use_bias=False, name="wi", kernel_initializer=wi_initializer ) # Update init weights as in flax - self.wo = tf.keras.layers.Dense( + self.wo = keras.layers.Dense( config.d_model, use_bias=False, name="wo", kernel_initializer=wo_initializer ) # Update init weights as in flax - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) self.act = get_tf_activation(config.dense_act_fn) self.config = config @@ -131,25 +132,25 @@ class TFT5DenseActDense(tf.keras.layers.Layer): self.wo.build([None, None, self.config.d_ff]) -class TFT5DenseGatedActDense(tf.keras.layers.Layer): +class TFT5DenseGatedActDense(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - wi_initializer = tf.keras.initializers.RandomNormal( + wi_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (config.d_model**-0.5) ) - wo_initializer = tf.keras.initializers.RandomNormal( + wo_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (config.d_ff**-0.5) ) - self.wi_0 = tf.keras.layers.Dense( + self.wi_0 = keras.layers.Dense( config.d_ff, use_bias=False, name="wi_0", kernel_initializer=wi_initializer ) # Update init weights as in flax - self.wi_1 = tf.keras.layers.Dense( + self.wi_1 = keras.layers.Dense( config.d_ff, use_bias=False, name="wi_1", kernel_initializer=wi_initializer ) # Update init weights as in flax - self.wo = tf.keras.layers.Dense( + self.wo = keras.layers.Dense( config.d_model, use_bias=False, name="wo", kernel_initializer=wo_initializer ) # Update init weights as in flax - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) self.act = get_tf_activation(config.dense_act_fn) self.config = config @@ -176,7 +177,7 @@ class TFT5DenseGatedActDense(tf.keras.layers.Layer): self.wo.build([None, None, self.config.d_ff]) -class TFT5LayerFF(tf.keras.layers.Layer): +class TFT5LayerFF(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.is_gated_act: @@ -185,7 +186,7 @@ class TFT5LayerFF(tf.keras.layers.Layer): self.DenseReluDense = TFT5DenseActDense(config, name="DenseReluDense") self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, training=False): normed_hidden_states = self.layer_norm(hidden_states) @@ -205,7 +206,7 @@ class TFT5LayerFF(tf.keras.layers.Layer): self.DenseReluDense.build(None) -class TFT5Attention(tf.keras.layers.Layer): +class TFT5Attention(keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, config, has_relative_attention_bias=False, **kwargs): @@ -224,35 +225,35 @@ class TFT5Attention(tf.keras.layers.Layer): self.inner_dim = self.n_heads * self.key_value_proj_dim # Mesh TensorFlow initialization to avoid scaling before softmax - q_initializer = tf.keras.initializers.RandomNormal( + q_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * ((self.inner_dim * self.key_value_proj_dim) ** -0.5) ) - k_initializer = tf.keras.initializers.RandomNormal( + k_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) - v_initializer = tf.keras.initializers.RandomNormal( + v_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) - o_initializer = tf.keras.initializers.RandomNormal( + o_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) - self.relative_attention_bias_initializer = tf.keras.initializers.RandomNormal( + self.relative_attention_bias_initializer = keras.initializers.RandomNormal( mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) - self.q = tf.keras.layers.Dense( + self.q = keras.layers.Dense( self.inner_dim, use_bias=False, name="q", kernel_initializer=q_initializer ) # Update init weights as in flax - self.k = tf.keras.layers.Dense( + self.k = keras.layers.Dense( self.inner_dim, use_bias=False, name="k", kernel_initializer=k_initializer ) # Update init weights as in flax - self.v = tf.keras.layers.Dense( + self.v = keras.layers.Dense( self.inner_dim, use_bias=False, name="v", kernel_initializer=v_initializer ) # Update init weights as in flax - self.o = tf.keras.layers.Dense( + self.o = keras.layers.Dense( self.d_model, use_bias=False, name="o", kernel_initializer=o_initializer ) # Update init weights as in flax - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) self.pruned_heads = set() @@ -482,7 +483,7 @@ class TFT5Attention(tf.keras.layers.Layer): return outputs -class TFT5LayerSelfAttention(tf.keras.layers.Layer): +class TFT5LayerSelfAttention(keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.SelfAttention = TFT5Attention( @@ -491,7 +492,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): name="SelfAttention", ) self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) def call( self, @@ -531,7 +532,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): self.layer_norm.build(None) -class TFT5LayerCrossAttention(tf.keras.layers.Layer): +class TFT5LayerCrossAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.EncDecAttention = TFT5Attention( @@ -540,7 +541,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): name="EncDecAttention", ) self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) def call( self, @@ -584,7 +585,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): self.layer_norm.build(None) -class TFT5Block(tf.keras.layers.Layer): +class TFT5Block(keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.is_decoder = config.is_decoder @@ -698,10 +699,10 @@ class TFT5Block(tf.keras.layers.Layer): #################################################### # The full model without a specific pretrained or finetuning head is -# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer" +# provided as a keras.layers.Layer usually called "TFT5MainLayer" #################################################### @keras_serializable -class TFT5MainLayer(tf.keras.layers.Layer): +class TFT5MainLayer(keras.layers.Layer): config_class = T5Config def __init__(self, config, embed_tokens=None, **kwargs): @@ -725,7 +726,7 @@ class TFT5MainLayer(tf.keras.layers.Layer): self.final_layer_norm = TFT5LayerNorm( config.d_model, epsilon=config.layer_norm_epsilon, name="final_layer_norm" ) - self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.dropout = keras.layers.Dropout(config.dropout_rate) def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -936,7 +937,7 @@ class TFT5MainLayer(tf.keras.layers.Layer): #################################################### -# TFT5PreTrainedModel is a sub-class of tf.keras.Model +# TFT5PreTrainedModel is a sub-class of keras.Model # which take care of loading and saving pretrained weights # and various common utilities. # Here you just need to specify a few (self-explanatory) @@ -1006,7 +1007,7 @@ T5_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1182,10 +1183,10 @@ class TFT5Model(TFT5PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(self.config.initializer_factor), + embeddings_initializer=keras.initializers.TruncatedNormal(self.config.initializer_factor), name="shared", ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -1331,7 +1332,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.model_dim = config.d_model - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( config.vocab_size, config.d_model, name="shared", @@ -1350,8 +1351,8 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling self.decoder = TFT5MainLayer(decoder_config, self.shared, name="decoder") if not config.tie_word_embeddings: - lm_head_initializer = tf.keras.initializers.RandomNormal(mean=0, stddev=config.initializer_factor) - self.lm_head = tf.keras.layers.Dense( + lm_head_initializer = keras.initializers.RandomNormal(mean=0, stddev=config.initializer_factor) + self.lm_head = keras.layers.Dense( config.vocab_size, use_bias=False, name="lm_head", kernel_initializer=lm_head_initializer ) # Update init weights as in flax self.config = config @@ -1368,8 +1369,8 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling if self.config.tie_word_embeddings: self.set_input_embeddings(value) else: - lm_head_initializer = tf.keras.initializers.RandomNormal(mean=0, stddev=self.config.initializer_factor) - self.lm_head = tf.keras.layers.Dense( + lm_head_initializer = keras.initializers.RandomNormal(mean=0, stddev=self.config.initializer_factor) + self.lm_head = keras.layers.Dense( shape_list(value)[0], use_bias=False, name="lm_head", kernel_initializer=lm_head_initializer ) # Update init weights as in flax # in a dense layer the kernel has a shape (last_dim, units), for us (dim, num_tokens) @@ -1603,7 +1604,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling class TFT5EncoderModel(TFT5PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( config.vocab_size, config.d_model, name="shared", diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index 237b7b5b76..79b1a9ebfc 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -38,6 +38,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -142,7 +143,7 @@ class TFTableQuestionAnsweringOutput(ModelOutput): attentions: Tuple[tf.Tensor] | None = None -class TFTapasEmbeddings(tf.keras.layers.Layer): +class TFTapasEmbeddings(keras.layers.Layer): """ Construct the embeddings from word, position and token_type embeddings. Same as BertEmbeddings but with a number of additional token type embeddings to encode tabular structure. @@ -157,8 +158,8 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -257,7 +258,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Tapas -class TFTapasSelfAttention(tf.keras.layers.Layer): +class TFTapasSelfAttention(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) @@ -272,16 +273,16 @@ class TFTapasSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -390,15 +391,15 @@ class TFTapasSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas -class TFTapasSelfOutput(tf.keras.layers.Layer): +class TFTapasSelfOutput(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -421,7 +422,7 @@ class TFTapasSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas -class TFTapasAttention(tf.keras.layers.Layer): +class TFTapasAttention(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) @@ -473,11 +474,11 @@ class TFTapasAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas -class TFTapasIntermediate(tf.keras.layers.Layer): +class TFTapasIntermediate(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -503,15 +504,15 @@ class TFTapasIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas -class TFTapasOutput(tf.keras.layers.Layer): +class TFTapasOutput(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -534,7 +535,7 @@ class TFTapasOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas -class TFTapasLayer(tf.keras.layers.Layer): +class TFTapasLayer(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) @@ -638,7 +639,7 @@ class TFTapasLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas -class TFTapasEncoder(tf.keras.layers.Layer): +class TFTapasEncoder(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -717,11 +718,11 @@ class TFTapasEncoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas -class TFTapasPooler(tf.keras.layers.Layer): +class TFTapasPooler(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -747,11 +748,11 @@ class TFTapasPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas -class TFTapasPredictionHeadTransform(tf.keras.layers.Layer): +class TFTapasPredictionHeadTransform(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -762,7 +763,7 @@ class TFTapasPredictionHeadTransform(tf.keras.layers.Layer): else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -785,8 +786,8 @@ class TFTapasPredictionHeadTransform(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas -class TFTapasLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFTapasLMPredictionHead(keras.layers.Layer): + def __init__(self, config: TapasConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.config = config @@ -808,7 +809,7 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer): with tf.name_scope(self.transform.name): self.transform.build(None) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -834,8 +835,8 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Tapas -class TFTapasMLMHead(tf.keras.layers.Layer): - def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TFTapasMLMHead(keras.layers.Layer): + def __init__(self, config: TapasConfig, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFTapasLMPredictionHead(config, input_embeddings, name="predictions") @@ -855,7 +856,7 @@ class TFTapasMLMHead(tf.keras.layers.Layer): @keras_serializable -class TFTapasMainLayer(tf.keras.layers.Layer): +class TFTapasMainLayer(keras.layers.Layer): config_class = TapasConfig def __init__(self, config: TapasConfig, add_pooling_layer: bool = True, **kwargs): @@ -868,7 +869,7 @@ class TFTapasMainLayer(tf.keras.layers.Layer): self.encoder = TFTapasEncoder(config, name="encoder") self.pooler = TFTapasPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings def set_input_embeddings(self, value: tf.Variable): @@ -1015,7 +1016,7 @@ TAPAS_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1194,7 +1195,7 @@ class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss): self.tapas = TFTapasMainLayer(config, add_pooling_layer=False, name="tapas") self.lm_head = TFTapasMLMHead(config, input_embeddings=self.tapas.embeddings, name="cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.lm_head.predictions @unpack_inputs @@ -1287,7 +1288,7 @@ class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss): self.lm_head.build(None) -class TFTapasComputeTokenLogits(tf.keras.layers.Layer): +class TFTapasComputeTokenLogits(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) @@ -1301,7 +1302,7 @@ class TFTapasComputeTokenLogits(tf.keras.layers.Layer): trainable=True, initializer=tf.zeros_initializer() if config.init_cell_selection_weights_to_zero - else tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range), + else keras.initializers.TruncatedNormal(stddev=config.initializer_range), ) self.output_bias = self.add_weight( name="output_bias", shape=(), trainable=True, initializer=tf.zeros_initializer() @@ -1323,7 +1324,7 @@ class TFTapasComputeTokenLogits(tf.keras.layers.Layer): return logits -class TFTapasComputeColumnLogits(tf.keras.layers.Layer): +class TFTapasComputeColumnLogits(keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): super().__init__(**kwargs) @@ -1335,7 +1336,7 @@ class TFTapasComputeColumnLogits(tf.keras.layers.Layer): trainable=True, initializer=tf.zeros_initializer() if config.init_cell_selection_weights_to_zero - else tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range), + else keras.initializers.TruncatedNormal(stddev=config.initializer_range), ) self.column_output_bias = self.add_weight( name="column_output_bias", shape=(), trainable=True, initializer=tf.zeros_initializer() @@ -1400,14 +1401,14 @@ class TFTapasForQuestionAnswering(TFTapasPreTrainedModel): self.tapas = TFTapasMainLayer(config, name="tapas") # dropout - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) self.compute_token_logits = TFTapasComputeTokenLogits(config, name="compute_token_logits") self.compute_column_logits = TFTapasComputeColumnLogits(config, name="compute_column_logits") if config.num_aggregation_labels > 0: - self.aggregation_classifier = tf.keras.layers.Dense( + self.aggregation_classifier = keras.layers.Dense( config.num_aggregation_labels, kernel_initializer=get_initializer(config.initializer_range), name="aggregation_classifier", @@ -1740,8 +1741,8 @@ class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassif self.num_labels = config.num_labels self.tapas = TFTapasMainLayer(config, name="tapas") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index a74fe7d62e..a323c0607f 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -26,7 +26,7 @@ import tensorflow as tf from ...configuration_utils import PretrainedConfig from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput -from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, unpack_inputs +from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, keras, unpack_inputs from ...tf_utils import shape_list from ...utils import ( ModelOutput, @@ -74,7 +74,7 @@ VISION_ENCODER_DECODER_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -242,7 +242,7 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos self.encoder.config.hidden_size != self.decoder.config.hidden_size and self.decoder.config.cross_attention_hidden_size is None ): - self.enc_to_dec_proj = tf.keras.layers.Dense( + self.enc_to_dec_proj = keras.layers.Dense( units=self.decoder.config.hidden_size, kernel_initializer=get_initializer(config.encoder.initializer_range), name="enc_to_dec_proj", @@ -445,7 +445,7 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) - # Make sure these 2 `tf.keras.Model` have fixed names so `from_pretrained` could load model weights correctly. + # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly. if encoder.name != "encoder": raise ValueError("encoder model must be created with the name `encoder`.") if decoder.name != "decoder": diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py index 165b309fb5..3f3cc81795 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py @@ -21,10 +21,9 @@ import re from typing import Optional, Tuple, Union import tensorflow as tf -from tensorflow.keras.layers import Dense from ...configuration_utils import PretrainedConfig -from ...modeling_tf_utils import TFPreTrainedModel, unpack_inputs +from ...modeling_tf_utils import TFPreTrainedModel, keras, unpack_inputs from ...tf_utils import shape_list from ...utils import ( DUMMY_INPUTS, @@ -159,7 +158,7 @@ VISION_TEXT_DUAL_ENCODER_INPUTS_DOCSTRING = r""" # Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: return tf.math.reduce_mean( - tf.keras.metrics.sparse_categorical_crossentropy( + keras.metrics.sparse_categorical_crossentropy( y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True ) ) @@ -217,8 +216,8 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel): self.text_embed_dim = config.text_config.hidden_size self.projection_dim = config.projection_dim - self.visual_projection = Dense(self.projection_dim, use_bias=False, name="visual_projection") - self.text_projection = Dense(self.projection_dim, use_bias=False, name="text_projection") + self.visual_projection = keras.layers.Dense(self.projection_dim, use_bias=False, name="visual_projection") + self.text_projection = keras.layers.Dense(self.projection_dim, use_bias=False, name="text_projection") self.logit_scale = None self.config = config @@ -227,7 +226,7 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel): return self.built = True # Build in the build() method to make sure the names are right - initializer = tf.keras.initializers.Constant(self.config.logit_scale_init_value) + initializer = keras.initializers.Constant(self.config.logit_scale_init_value) self.logit_scale = self.add_weight(shape=(1,), initializer=initializer, name="logit_scale") if getattr(self, "visual_projection", None) is not None: diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index 4ac81e24ee..ac5cf691e9 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSequenceClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -53,7 +54,7 @@ _IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" _IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" -class TFViTEmbeddings(tf.keras.layers.Layer): +class TFViTEmbeddings(keras.layers.Layer): """ Construct the CLS token, position and patch embeddings. @@ -63,7 +64,7 @@ class TFViTEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.patch_embeddings = TFViTPatchEmbeddings(config, name="patch_embeddings") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def build(self, input_shape=None): @@ -147,7 +148,7 @@ class TFViTEmbeddings(tf.keras.layers.Layer): # Based on timm implementation, which can be found here: # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -class TFViTPatchEmbeddings(tf.keras.layers.Layer): +class TFViTPatchEmbeddings(keras.layers.Layer): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -168,7 +169,7 @@ class TFViTPatchEmbeddings(tf.keras.layers.Layer): self.num_channels = num_channels self.config = config - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( filters=hidden_size, kernel_size=patch_size, strides=patch_size, @@ -196,7 +197,7 @@ class TFViTPatchEmbeddings(tf.keras.layers.Layer): f" ({self.image_size[0]}*{self.image_size[1]})." ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -219,7 +220,7 @@ class TFViTPatchEmbeddings(tf.keras.layers.Layer): self.projection.build([None, None, None, self.num_channels]) -class TFViTSelfAttention(tf.keras.layers.Layer): +class TFViTSelfAttention(keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) @@ -234,16 +235,16 @@ class TFViTSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: @@ -309,7 +310,7 @@ class TFViTSelfAttention(tf.keras.layers.Layer): self.value.build([None, None, self.config.hidden_size]) -class TFViTSelfOutput(tf.keras.layers.Layer): +class TFViTSelfOutput(keras.layers.Layer): """ The residual connection is defined in TFViTLayer instead of here (as is the case with other models), due to the layernorm applied before each block. @@ -318,10 +319,10 @@ class TFViTSelfOutput(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -339,7 +340,7 @@ class TFViTSelfOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFViTAttention(tf.keras.layers.Layer): +class TFViTAttention(keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) @@ -378,11 +379,11 @@ class TFViTAttention(tf.keras.layers.Layer): self.dense_output.build(None) -class TFViTIntermediate(tf.keras.layers.Layer): +class TFViTIntermediate(keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -407,14 +408,14 @@ class TFViTIntermediate(tf.keras.layers.Layer): self.dense.build([None, None, self.config.hidden_size]) -class TFViTOutput(tf.keras.layers.Layer): +class TFViTOutput(keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -433,7 +434,7 @@ class TFViTOutput(tf.keras.layers.Layer): self.dense.build([None, None, self.config.intermediate_size]) -class TFViTLayer(tf.keras.layers.Layer): +class TFViTLayer(keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" def __init__(self, config: ViTConfig, **kwargs): @@ -443,12 +444,8 @@ class TFViTLayer(tf.keras.layers.Layer): self.intermediate = TFViTIntermediate(config, name="intermediate") self.vit_output = TFViTOutput(config, name="output") - self.layernorm_before = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_before" - ) - self.layernorm_after = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_after" - ) + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") self.config = config def call( @@ -504,7 +501,7 @@ class TFViTLayer(tf.keras.layers.Layer): self.layernorm_after.build([None, None, self.config.hidden_size]) -class TFViTEncoder(tf.keras.layers.Layer): +class TFViTEncoder(keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) @@ -559,7 +556,7 @@ class TFViTEncoder(tf.keras.layers.Layer): @keras_serializable -class TFViTMainLayer(tf.keras.layers.Layer): +class TFViTMainLayer(keras.layers.Layer): config_class = ViTConfig def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, **kwargs): @@ -569,10 +566,10 @@ class TFViTMainLayer(tf.keras.layers.Layer): self.embeddings = TFViTEmbeddings(config, name="embeddings") self.encoder = TFViTEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.pooler = TFViTPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): @@ -670,7 +667,7 @@ VIT_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -787,11 +784,11 @@ class TFViTModel(TFViTPreTrainedModel): self.vit.build(None) -class TFViTPooler(tf.keras.layers.Layer): +class TFViTPooler(keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -839,7 +836,7 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification self.vit = TFViTMainLayer(config, add_pooling_layer=False, name="vit") # Classifier head - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", diff --git a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py index fe7be4f086..fff8234e06 100644 --- a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py @@ -38,6 +38,7 @@ from ...modeling_tf_utils import ( TFModelInputType, TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -199,7 +200,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): return emb -class TFViTMAEEmbeddings(tf.keras.layers.Layer): +class TFViTMAEEmbeddings(keras.layers.Layer): """ Construct the CLS token, position and patch embeddings. @@ -298,7 +299,7 @@ class TFViTMAEEmbeddings(tf.keras.layers.Layer): return embeddings, mask, ids_restore -class TFViTMAEPatchEmbeddings(tf.keras.layers.Layer): +class TFViTMAEPatchEmbeddings(keras.layers.Layer): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -318,7 +319,7 @@ class TFViTMAEPatchEmbeddings(tf.keras.layers.Layer): self.num_channels = num_channels self.config = config - self.projection = tf.keras.layers.Conv2D( + self.projection = keras.layers.Conv2D( filters=hidden_size, kernel_size=patch_size, strides=patch_size, @@ -343,7 +344,7 @@ class TFViTMAEPatchEmbeddings(tf.keras.layers.Layer): f" ({self.image_size[0]}*{self.image_size[1]})." ) - # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. # So change the input format from `NCHW` to `NHWC`. # shape = (batch_size, in_height, in_width, in_channels=num_channels) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) @@ -367,7 +368,7 @@ class TFViTMAEPatchEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->ViTMAE -class TFViTMAESelfAttention(tf.keras.layers.Layer): +class TFViTMAESelfAttention(keras.layers.Layer): def __init__(self, config: ViTMAEConfig, **kwargs): super().__init__(**kwargs) @@ -382,16 +383,16 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: @@ -458,7 +459,7 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->ViTMAE -class TFViTMAESelfOutput(tf.keras.layers.Layer): +class TFViTMAESelfOutput(keras.layers.Layer): """ The residual connection is defined in TFViTMAELayer instead of here (as is the case with other models), due to the layernorm applied before each block. @@ -467,10 +468,10 @@ class TFViTMAESelfOutput(tf.keras.layers.Layer): def __init__(self, config: ViTMAEConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -489,7 +490,7 @@ class TFViTMAESelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->ViTMAE -class TFViTMAEAttention(tf.keras.layers.Layer): +class TFViTMAEAttention(keras.layers.Layer): def __init__(self, config: ViTMAEConfig, **kwargs): super().__init__(**kwargs) @@ -529,11 +530,11 @@ class TFViTMAEAttention(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->ViTMAE -class TFViTMAEIntermediate(tf.keras.layers.Layer): +class TFViTMAEIntermediate(keras.layers.Layer): def __init__(self, config: ViTMAEConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -559,14 +560,14 @@ class TFViTMAEIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->ViTMAE -class TFViTMAEOutput(tf.keras.layers.Layer): +class TFViTMAEOutput(keras.layers.Layer): def __init__(self, config: ViTMAEConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -586,7 +587,7 @@ class TFViTMAEOutput(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTLayer with ViT->ViTMAE -class TFViTMAELayer(tf.keras.layers.Layer): +class TFViTMAELayer(keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" def __init__(self, config: ViTMAEConfig, **kwargs): @@ -596,12 +597,8 @@ class TFViTMAELayer(tf.keras.layers.Layer): self.intermediate = TFViTMAEIntermediate(config, name="intermediate") self.vit_output = TFViTMAEOutput(config, name="output") - self.layernorm_before = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_before" - ) - self.layernorm_after = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="layernorm_after" - ) + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") self.config = config def call( @@ -658,7 +655,7 @@ class TFViTMAELayer(tf.keras.layers.Layer): # Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->ViTMAE -class TFViTMAEEncoder(tf.keras.layers.Layer): +class TFViTMAEEncoder(keras.layers.Layer): def __init__(self, config: ViTMAEConfig, **kwargs): super().__init__(**kwargs) @@ -713,7 +710,7 @@ class TFViTMAEEncoder(tf.keras.layers.Layer): @keras_serializable -class TFViTMAEMainLayer(tf.keras.layers.Layer): +class TFViTMAEMainLayer(keras.layers.Layer): config_class = ViTMAEConfig def __init__(self, config: ViTMAEConfig, **kwargs): @@ -723,9 +720,9 @@ class TFViTMAEMainLayer(tf.keras.layers.Layer): self.embeddings = TFViTMAEEmbeddings(config, name="embeddings") self.encoder = TFViTMAEEncoder(config, name="encoder") - self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): @@ -814,7 +811,7 @@ VIT_MAE_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -948,10 +945,10 @@ class TFViTMAEModel(TFViTMAEPreTrainedModel): self.vit.build(None) -class TFViTMAEDecoder(tf.keras.layers.Layer): +class TFViTMAEDecoder(keras.layers.Layer): def __init__(self, config, num_patches, **kwargs): super().__init__(**kwargs) - self.decoder_embed = tf.keras.layers.Dense(config.decoder_hidden_size, name="decoder_embed") + self.decoder_embed = keras.layers.Dense(config.decoder_hidden_size, name="decoder_embed") decoder_config = deepcopy(config) decoder_config.hidden_size = config.decoder_hidden_size @@ -962,8 +959,8 @@ class TFViTMAEDecoder(tf.keras.layers.Layer): TFViTMAELayer(decoder_config, name=f"decoder_layers.{j}") for j in range(config.decoder_num_hidden_layers) ] - self.decoder_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="decoder_norm") - self.decoder_pred = tf.keras.layers.Dense( + self.decoder_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="decoder_norm") + self.decoder_pred = keras.layers.Dense( config.patch_size**2 * config.num_channels, kernel_initializer=get_initializer(config.initializer_range), name="decoder_pred", diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 9f2f5ab86f..e6a6cb4a75 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -29,6 +29,7 @@ from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput, TFSequen from ...modeling_tf_utils import ( TFPreTrainedModel, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -204,7 +205,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): +class TFWav2Vec2GroupNorm(keras.layers.Layer): """ From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization """ @@ -216,12 +217,12 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): epsilon: float = 1e-3, center: bool = True, scale: bool = True, - beta_initializer: tf.keras.initializers.Initializer = "zeros", - gamma_initializer: tf.keras.initializers.Initializer = "ones", - beta_regularizer: tf.keras.regularizers.Regularizer = None, - gamma_regularizer: tf.keras.regularizers.Regularizer = None, - beta_constraint: tf.keras.constraints.Constraint = None, - gamma_constraint: tf.keras.constraints.Constraint = None, + beta_initializer: keras.initializers.Initializer = "zeros", + gamma_initializer: keras.initializers.Initializer = "ones", + beta_regularizer: keras.regularizers.Regularizer = None, + gamma_regularizer: keras.regularizers.Regularizer = None, + beta_constraint: keras.constraints.Constraint = None, + gamma_constraint: keras.constraints.Constraint = None, **kwargs, ): super().__init__(**kwargs) @@ -231,12 +232,12 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): self.epsilon = epsilon self.center = center self.scale = scale - self.beta_initializer = tf.keras.initializers.get(beta_initializer) - self.gamma_initializer = tf.keras.initializers.get(gamma_initializer) - self.beta_regularizer = tf.keras.regularizers.get(beta_regularizer) - self.gamma_regularizer = tf.keras.regularizers.get(gamma_regularizer) - self.beta_constraint = tf.keras.constraints.get(beta_constraint) - self.gamma_constraint = tf.keras.constraints.get(gamma_constraint) + self.beta_initializer = keras.initializers.get(beta_initializer) + self.gamma_initializer = keras.initializers.get(gamma_initializer) + self.beta_regularizer = keras.regularizers.get(beta_regularizer) + self.gamma_regularizer = keras.regularizers.get(gamma_regularizer) + self.beta_constraint = keras.constraints.get(beta_constraint) + self.gamma_constraint = keras.constraints.get(gamma_constraint) self._check_axis() def build(self, input_shape): @@ -251,7 +252,7 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) + input_shape = keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape) @@ -273,12 +274,12 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): "epsilon": self.epsilon, "center": self.center, "scale": self.scale, - "beta_initializer": tf.keras.initializers.serialize(self.beta_initializer), - "gamma_initializer": tf.keras.initializers.serialize(self.gamma_initializer), - "beta_regularizer": tf.keras.regularizers.serialize(self.beta_regularizer), - "gamma_regularizer": tf.keras.regularizers.serialize(self.gamma_regularizer), - "beta_constraint": tf.keras.constraints.serialize(self.beta_constraint), - "gamma_constraint": tf.keras.constraints.serialize(self.gamma_constraint), + "beta_initializer": keras.initializers.serialize(self.beta_initializer), + "gamma_initializer": keras.initializers.serialize(self.gamma_initializer), + "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer), + "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer), + "beta_constraint": keras.constraints.serialize(self.beta_constraint), + "gamma_constraint": keras.constraints.serialize(self.gamma_constraint), } base_config = super().get_config() return {**base_config, **config} @@ -299,7 +300,7 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) + group_shape = keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -377,7 +378,7 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): def _create_input_spec(self, input_shape): dim = input_shape[self.axis] - self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) + self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): dim = input_shape[self.axis] @@ -420,7 +421,7 @@ class TFWav2Vec2GroupNorm(tf.keras.layers.Layer): return broadcast_shape -class TFWav2Vec2WeightNormConv1D(tf.keras.layers.Conv1D): +class TFWav2Vec2WeightNormConv1D(keras.layers.Conv1D): """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm""" def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs): @@ -476,13 +477,13 @@ class TFWav2Vec2WeightNormConv1D(tf.keras.layers.Conv1D): return output -class TFWav2Vec2NoLayerNormConvLayer(tf.keras.layers.Layer): +class TFWav2Vec2NoLayerNormConvLayer(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: super().__init__(**kwargs) self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] - self.conv = tf.keras.layers.Conv1D( + self.conv = keras.layers.Conv1D( filters=self.out_conv_dim, kernel_size=config.conv_kernel[layer_id], strides=config.conv_stride[layer_id], @@ -505,20 +506,20 @@ class TFWav2Vec2NoLayerNormConvLayer(tf.keras.layers.Layer): self.conv.build([None, None, self.in_conv_dim]) -class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer): +class TFWav2Vec2LayerNormConvLayer(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: super().__init__(**kwargs) self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] - self.conv = tf.keras.layers.Conv1D( + self.conv = keras.layers.Conv1D( filters=self.out_conv_dim, kernel_size=config.conv_kernel[layer_id], strides=config.conv_stride[layer_id], use_bias=config.conv_bias, name="conv", ) - self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps) + self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps) self.activation = get_tf_activation(config.feat_extract_activation) def call(self, hidden_states: tf.Tensor) -> tf.Tensor: @@ -539,13 +540,13 @@ class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer): self.layer_norm.build([None, None, self.out_conv_dim]) -class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer): +class TFWav2Vec2GroupNormConvLayer(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: super().__init__(**kwargs) self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] - self.conv = tf.keras.layers.Conv1D( + self.conv = keras.layers.Conv1D( filters=self.out_conv_dim, kernel_size=config.conv_kernel[layer_id], strides=config.conv_stride[layer_id], @@ -575,7 +576,7 @@ class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer): self.layer_norm.build([None, None, self.out_conv_dim]) -class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer): +class TFWav2Vec2PositionalConvEmbedding(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: super().__init__(**kwargs) self.conv = TFWav2Vec2WeightNormConv1D( @@ -604,7 +605,7 @@ class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer): self.conv.build([None, None, self.config.hidden_size]) -class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer): +class TFWav2Vec2SamePadLayer(keras.layers.Layer): def __init__(self, num_conv_pos_embeddings, **kwargs): super().__init__(**kwargs) self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 @@ -615,7 +616,7 @@ class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer): return hidden_states -class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer): +class TFWav2Vec2FeatureEncoder(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: super().__init__(**kwargs) @@ -662,18 +663,18 @@ class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder): ) -class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer): +class TFWav2Vec2FeatureProjection(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.projection = tf.keras.layers.Dense( + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.projection = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="projection", ) - self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout) + self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout) self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -695,7 +696,7 @@ class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFWav2Vec2 -class TFWav2Vec2Attention(tf.keras.layers.Layer): +class TFWav2Vec2Attention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -711,7 +712,7 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -721,10 +722,10 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -865,13 +866,13 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFWav2Vec2FeedForward(tf.keras.layers.Layer): +class TFWav2Vec2FeedForward(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) - self.intermediate_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout) - self.intermediate_dense = tf.keras.layers.Dense( + self.intermediate_dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", @@ -879,13 +880,13 @@ class TFWav2Vec2FeedForward(tf.keras.layers.Layer): ) self.intermediate_act_fn = get_tf_activation(config.hidden_act) - self.output_dense = tf.keras.layers.Dense( + self.output_dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), bias_initializer="zeros", name="output_dense", ) - self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.output_dropout = keras.layers.Dropout(config.hidden_dropout) self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -909,7 +910,7 @@ class TFWav2Vec2FeedForward(tf.keras.layers.Layer): self.output_dense.build([None, None, self.config.intermediate_size]) -class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer): +class TFWav2Vec2EncoderLayer(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) self.attention = TFWav2Vec2Attention( @@ -919,12 +920,10 @@ class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer): is_decoder=False, name="attention", ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.feed_forward = TFWav2Vec2FeedForward(config, name="feed_forward") - self.final_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="final_layer_norm" - ) + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") self.config = config def call( @@ -970,7 +969,7 @@ class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.config.hidden_size]) -class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer): +class TFWav2Vec2EncoderLayerStableLayerNorm(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) self.attention = TFWav2Vec2Attention( @@ -980,12 +979,10 @@ class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer): is_decoder=False, name="attention", ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.feed_forward = TFWav2Vec2FeedForward(config, name="feed_forward") - self.final_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name="final_layer_norm" - ) + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") self.config = config def call( @@ -1029,13 +1026,13 @@ class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer): self.final_layer_norm.build([None, None, self.config.hidden_size]) -class TFWav2Vec2Encoder(tf.keras.layers.Layer): +class TFWav2Vec2Encoder(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) self.config = config self.pos_conv_embed = TFWav2Vec2PositionalConvEmbedding(config, name="pos_conv_embed") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) self.layer = [TFWav2Vec2EncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)] def call( @@ -1109,13 +1106,13 @@ class TFWav2Vec2Encoder(tf.keras.layers.Layer): layer.build(None) -class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer): +class TFWav2Vec2EncoderStableLayerNorm(keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) self.config = config self.pos_conv_embed = TFWav2Vec2PositionalConvEmbedding(config, name="pos_conv_embed") - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) self.layer = [ TFWav2Vec2EncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers) ] @@ -1192,7 +1189,7 @@ class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer): @keras_serializable -class TFWav2Vec2MainLayer(tf.keras.layers.Layer): +class TFWav2Vec2MainLayer(keras.layers.Layer): config_class = Wav2Vec2Config def __init__(self, config: Wav2Vec2Config, **kwargs): @@ -1414,7 +1411,7 @@ WAV_2_VEC_2_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1597,8 +1594,8 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel): super().__init__(config, *inputs, **kwargs) self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2") - self.dropout = tf.keras.layers.Dropout(config.final_dropout) - self.lm_head = tf.keras.layers.Dense(config.vocab_size, name="lm_head") + self.dropout = keras.layers.Dropout(config.final_dropout) + self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head") self.output_hidden_size = ( config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size ) @@ -1766,8 +1763,8 @@ class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel): shape=(self.num_layers,), initializer="ones", trainable=True, name="layer_weights" ) self.config = config - self.projector = tf.keras.layers.Dense(units=config.classifier_proj_size, name="projector") - self.classifier = tf.keras.layers.Dense(units=config.num_labels, activation=None, name="classifier") + self.projector = keras.layers.Dense(units=config.classifier_proj_size, name="projector") + self.classifier = keras.layers.Dense(units=config.num_labels, activation=None, name="classifier") def freeze_feature_extractor(self): """ @@ -1839,7 +1836,7 @@ class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel): logits = self.classifier(pooled_output) loss = None if labels is not None: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) loss = loss_fn(tf.reshape(labels, [-1]), tf.reshape(logits, [-1, self.config.num_labels])) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index b62dbcd178..e5d59c00d3 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -37,6 +37,7 @@ from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFModelInputType, TFPreTrainedModel, + keras, keras_serializable, unpack_inputs, ) @@ -129,7 +130,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TFWhisperPositionalEmbedding(tf.keras.layers.Layer): +class TFWhisperPositionalEmbedding(keras.layers.Layer): def __init__( self, num_positions: int, @@ -142,7 +143,7 @@ class TFWhisperPositionalEmbedding(tf.keras.layers.Layer): self.num_positions = num_positions self.embedding_dim = embedding_dim self.padding_idx = padding_idx - self.embedding_initializer = tf.keras.initializers.get(embedding_initializer) + self.embedding_initializer = keras.initializers.get(embedding_initializer) def build(self, input_shape): self.weight = self.add_weight( @@ -159,7 +160,7 @@ class TFWhisperPositionalEmbedding(tf.keras.layers.Layer): return tf.gather(self.weight, gather_indices) -class TFWhisperAttention(tf.keras.layers.Layer): +class TFWhisperAttention(keras.layers.Layer): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( @@ -174,7 +175,7 @@ class TFWhisperAttention(tf.keras.layers.Layer): super().__init__(**kwargs) self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: @@ -185,10 +186,10 @@ class TFWhisperAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=False, name="k_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=False, name="k_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention._shape with BART->whisper def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): @@ -332,20 +333,20 @@ class TFWhisperAttention(tf.keras.layers.Layer): # Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextEncoderLayer with Speech2Text->Whisper -class TFWhisperEncoderLayer(tf.keras.layers.Layer): +class TFWhisperEncoderLayer(keras.layers.Layer): def __init__(self, config: WhisperConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TFWhisperAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -409,7 +410,7 @@ class TFWhisperEncoderLayer(tf.keras.layers.Layer): # Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextDecoderLayer with Speech2Text->Whisper -class TFWhisperDecoderLayer(tf.keras.layers.Layer): +class TFWhisperDecoderLayer(keras.layers.Layer): def __init__(self, config: WhisperConfig, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -421,11 +422,11 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TFWhisperAttention( self.embed_dim, config.decoder_attention_heads, @@ -433,10 +434,10 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config def call( @@ -590,7 +591,7 @@ WHISPER_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -680,7 +681,7 @@ WHISPER_INPUTS_DOCSTRING = r""" @keras_serializable -class TFWhisperEncoder(tf.keras.layers.Layer): +class TFWhisperEncoder(keras.layers.Layer): config_class = WhisperConfig """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -703,8 +704,8 @@ class TFWhisperEncoder(tf.keras.layers.Layer): self.embed_scale = math.sqrt(self.embed_dim) if config.scale_embedding else 1.0 # Padding is added in call() to match the PyTorch implementation - self.conv1 = tf.keras.layers.Conv1D(self.embed_dim, kernel_size=3, strides=1, padding="valid", name="conv1") - self.conv2 = tf.keras.layers.Conv1D(self.embed_dim, kernel_size=3, strides=2, padding="valid", name="conv2") + self.conv1 = keras.layers.Conv1D(self.embed_dim, kernel_size=3, strides=1, padding="valid", name="conv1") + self.conv2 = keras.layers.Conv1D(self.embed_dim, kernel_size=3, strides=2, padding="valid", name="conv2") self.embed_positions = TFWhisperPositionalEmbedding( num_positions=self.max_source_positions, @@ -715,9 +716,9 @@ class TFWhisperEncoder(tf.keras.layers.Layer): self.embed_positions.trainable = False self.encoder_layers = [TFWhisperEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) @unpack_inputs def call( @@ -762,9 +763,9 @@ class TFWhisperEncoder(tf.keras.layers.Layer): # TF 2.0 layers can't use channels first format when running on CPU. input_features = tf.transpose(input_features, perm=(0, 2, 1)) input_features = tf.pad(input_features, [[0, 0], [1, 1], [0, 0]]) - inputs_embeds = tf.keras.activations.gelu(self.conv1(input_features)) + inputs_embeds = keras.activations.gelu(self.conv1(input_features)) inputs_embeds = tf.pad(inputs_embeds, [[0, 0], [1, 1], [0, 0]]) - inputs_embeds = tf.keras.activations.gelu(self.conv2(inputs_embeds)) + inputs_embeds = keras.activations.gelu(self.conv2(inputs_embeds)) inputs_embeds = tf.transpose(inputs_embeds, perm=(0, 1, 2)) embed_pos = self.embed_positions(input_ids=tf.zeros((1, self.max_source_positions), dtype=tf.int32)) @@ -837,7 +838,7 @@ class TFWhisperEncoder(tf.keras.layers.Layer): @keras_serializable -class TFWhisperDecoder(tf.keras.layers.Layer): +class TFWhisperDecoder(keras.layers.Layer): config_class = WhisperConfig """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFWhisperDecoderLayer`] @@ -849,17 +850,17 @@ class TFWhisperDecoder(tf.keras.layers.Layer): def __init__(self, config: WhisperConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.decoder_layerdrop self.padding_idx = config.pad_token_id self.max_target_positions = config.max_target_positions self.max_source_positions = config.max_source_positions self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - self.embed_tokens = tf.keras.layers.Embedding( + self.embed_tokens = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="embed_tokens", ) self.embed_positions = TFWhisperPositionalEmbedding( @@ -868,7 +869,7 @@ class TFWhisperDecoder(tf.keras.layers.Layer): self.decoder_layers = [TFWhisperDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") def get_input_embeddings(self): return self.embed_tokens @@ -1098,7 +1099,7 @@ class TFWhisperDecoder(tf.keras.layers.Layer): WHISPER_START_DOCSTRING, ) @keras_serializable -class TFWhisperMainLayer(tf.keras.layers.Layer): +class TFWhisperMainLayer(keras.layers.Layer): config_class = WhisperConfig def __init__(self, config: WhisperConfig, **kwargs): @@ -1374,7 +1375,7 @@ class TFWhisperForConditionalGeneration(TFWhisperPreTrainedModel, TFCausalLangua def set_output_embeddings(self, value): self.set_input_embeddings(value) - def resize_token_embeddings(self, new_num_tokens: int) -> tf.keras.layers.Embedding: + def resize_token_embeddings(self, new_num_tokens: int) -> keras.layers.Embedding: new_embeddings = super().resize_token_embeddings(new_num_tokens) return new_embeddings diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 9f5982c734..4157cc0616 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -40,6 +40,7 @@ from ...modeling_tf_utils import ( TFPreTrainedModel, TFSharedEmbeddings, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -149,7 +150,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->XGLM -class TFXGLMAttention(tf.keras.layers.Layer): +class TFXGLMAttention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -165,7 +166,7 @@ class TFXGLMAttention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -175,10 +176,10 @@ class TFXGLMAttention(tf.keras.layers.Layer): self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -319,7 +320,7 @@ class TFXGLMAttention(tf.keras.layers.Layer): self.out_proj.build([None, None, self.embed_dim]) -class TFXGLMDecoderLayer(tf.keras.layers.Layer): +class TFXGLMDecoderLayer(keras.layers.Layer): def __init__(self, config: XGLMConfig, **kwargs: Any) -> None: super().__init__(**kwargs) self.embed_dim = config.d_model @@ -330,9 +331,9 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer): is_decoder=True, name="self_attn", ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) if config.add_cross_attention: self.encoder_attn = TFXGLMAttention( @@ -342,14 +343,14 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer): is_decoder=True, name="encoder_attn", ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization( + self.encoder_attn_layer_norm = keras.layers.LayerNormalization( epsilon=1e-5, name="encoder_attn_layer_norm" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.config = config # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call @@ -461,7 +462,7 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer): @keras_serializable -class TFXGLMMainLayer(tf.keras.layers.Layer): +class TFXGLMMainLayer(keras.layers.Layer): config_class = XGLMConfig def __init__( @@ -488,10 +489,10 @@ class TFXGLMMainLayer(tf.keras.layers.Layer): padding_idx=config.pad_token_id, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layers = [TFXGLMDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_layers)] self.layerdrop = config.layerdrop - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") def get_input_embeddings(self) -> TFSharedEmbeddings: return self.embed_tokens @@ -679,7 +680,7 @@ XGLM_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -883,7 +884,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): super().__init__(config, *inputs, **kwargs) self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model") - self.lm_head = tf.keras.layers.Dense( + self.lm_head = keras.layers.Dense( config.vocab_size, use_bias=False, kernel_initializer=get_initializer(config.init_std), diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 8f5cc91dde..63d807317b 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -45,6 +45,7 @@ from ...modeling_tf_utils import ( TFSharedEmbeddings, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -115,7 +116,7 @@ def get_masks(slen, lengths, causal, padding_mask=None): return mask, attn_mask -class TFXLMMultiHeadAttention(tf.keras.layers.Layer): +class TFXLMMultiHeadAttention(keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config, **kwargs): @@ -126,11 +127,11 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer): self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") - self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") - self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") - self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") - self.dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") + self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") + self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") + self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") + self.dropout = keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() self.dim = dim @@ -225,14 +226,14 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer): self.out_lin.build([None, None, self.dim]) -class TFXLMTransformerFFN(tf.keras.layers.Layer): +class TFXLMTransformerFFN(keras.layers.Layer): def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super().__init__(**kwargs) - self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") - self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") + self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") + self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.in_dim = in_dim self.dim_hidden = dim_hidden @@ -257,7 +258,7 @@ class TFXLMTransformerFFN(tf.keras.layers.Layer): @keras_serializable -class TFXLMMainLayer(tf.keras.layers.Layer): +class TFXLMMainLayer(keras.layers.Layer): config_class = XLMConfig def __init__(self, config, **kwargs): @@ -301,8 +302,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): raise ValueError("transformer dim must be a multiple of n_heads") # embeddings - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.dropout = keras.layers.Dropout(config.dropout) + self.attention_dropout = keras.layers.Dropout(config.attention_dropout) if config.sinusoidal_embeddings: raise NotImplementedError @@ -311,7 +312,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): self.embeddings = TFSharedEmbeddings( self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" ) # padding_idx=self.pad_index) - self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") + self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") # transformer layers self.attentions = [] @@ -327,7 +328,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): TFXLMMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}") ) self.layer_norm1.append( - tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}") + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}") ) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) @@ -336,7 +337,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): TFXLMTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}") ) self.layer_norm2.append( - tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") ) if hasattr(config, "pruned_heads"): @@ -624,7 +625,7 @@ XLM_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -793,7 +794,7 @@ class TFXLMModel(TFXLMPreTrainedModel): self.transformer.build(None) -class TFXLMPredLayer(tf.keras.layers.Layer): +class TFXLMPredLayer(keras.layers.Layer): """ Prediction layer (cross_entropy or adaptive_softmax). """ @@ -1043,7 +1044,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): self.transformer = TFXLMMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") - self.logits_proj = tf.keras.layers.Dense( + self.logits_proj = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) self.config = config @@ -1177,8 +1178,8 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos self.num_labels = config.num_labels self.transformer = TFXLMMainLayer(config, name="transformer") - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" ) self.config = config @@ -1267,7 +1268,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py index b6003f4284..c33f12298a 100644 --- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py @@ -46,6 +46,7 @@ from ...modeling_tf_utils import ( TFSequenceClassificationLoss, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -80,7 +81,7 @@ XLM_ROBERTA_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -162,7 +163,7 @@ XLM_ROBERTA_INPUTS_DOCSTRING = r""" # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->XLMRoberta -class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): +class TFXLMRobertaEmbeddings(keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ @@ -175,8 +176,8 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape=None): with tf.name_scope("word_embeddings"): @@ -268,11 +269,11 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->XLMRoberta -class TFXLMRobertaPooler(tf.keras.layers.Layer): +class TFXLMRobertaPooler(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -298,7 +299,7 @@ class TFXLMRobertaPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta -class TFXLMRobertaSelfAttention(tf.keras.layers.Layer): +class TFXLMRobertaSelfAttention(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) @@ -313,16 +314,16 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer): self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder self.config = config @@ -431,15 +432,15 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta -class TFXLMRobertaSelfOutput(tf.keras.layers.Layer): +class TFXLMRobertaSelfOutput(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -462,7 +463,7 @@ class TFXLMRobertaSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta -class TFXLMRobertaAttention(tf.keras.layers.Layer): +class TFXLMRobertaAttention(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) @@ -514,11 +515,11 @@ class TFXLMRobertaAttention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta -class TFXLMRobertaIntermediate(tf.keras.layers.Layer): +class TFXLMRobertaIntermediate(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -544,15 +545,15 @@ class TFXLMRobertaIntermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta -class TFXLMRobertaOutput(tf.keras.layers.Layer): +class TFXLMRobertaOutput(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: @@ -575,7 +576,7 @@ class TFXLMRobertaOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta -class TFXLMRobertaLayer(tf.keras.layers.Layer): +class TFXLMRobertaLayer(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) @@ -679,7 +680,7 @@ class TFXLMRobertaLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->XLMRoberta -class TFXLMRobertaEncoder(tf.keras.layers.Layer): +class TFXLMRobertaEncoder(keras.layers.Layer): def __init__(self, config: XLMRobertaConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -759,7 +760,7 @@ class TFXLMRobertaEncoder(tf.keras.layers.Layer): @keras_serializable # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->XLMRoberta -class TFXLMRobertaMainLayer(tf.keras.layers.Layer): +class TFXLMRobertaMainLayer(keras.layers.Layer): config_class = XLMRobertaConfig def __init__(self, config, add_pooling_layer=True, **kwargs): @@ -779,7 +780,7 @@ class TFXLMRobertaMainLayer(tf.keras.layers.Layer): self.embeddings = TFXLMRobertaEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings @@ -1063,7 +1064,7 @@ class TFXLMRobertaModel(TFXLMRobertaPreTrainedModel): # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->XLMRoberta -class TFXLMRobertaLMHead(tf.keras.layers.Layer): +class TFXLMRobertaLMHead(keras.layers.Layer): """XLMRoberta Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): @@ -1071,10 +1072,10 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer): self.config = config self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = get_tf_activation("gelu") # The output weights are the same as the input embeddings, but there is @@ -1352,12 +1353,12 @@ class TFXLMRobertaForCausalLM(TFXLMRobertaPreTrainedModel, TFCausalLanguageModel # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->XLMRoberta -class TFXLMRobertaClassificationHead(tf.keras.layers.Layer): +class TFXLMRobertaClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", @@ -1366,8 +1367,8 @@ class TFXLMRobertaClassificationHead(tf.keras.layers.Layer): classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) self.config = config @@ -1497,8 +1498,8 @@ class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoic super().__init__(config, *inputs, **kwargs) self.roberta = TFXLMRobertaMainLayer(config, name="roberta") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1606,8 +1607,8 @@ class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenCla classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) - self.dropout = tf.keras.layers.Dropout(classifier_dropout) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1698,7 +1699,7 @@ class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAn self.num_labels = config.num_labels self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index 7c5155282b..9bf26872f8 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -39,6 +39,7 @@ from ...modeling_tf_utils import ( TFSharedEmbeddings, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -66,7 +67,7 @@ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFXLNetRelativeAttention(tf.keras.layers.Layer): +class TFXLNetRelativeAttention(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -83,8 +84,8 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): self.initializer_range = config.initializer_range self.output_attentions = config.output_attentions - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.config = config def build(self, input_shape=None): @@ -336,17 +337,17 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): return outputs -class TFXLNetFeedForward(tf.keras.layers.Layer): +class TFXLNetFeedForward(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.layer_1 = tf.keras.layers.Dense( + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_1 = keras.layers.Dense( config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1" ) - self.layer_2 = tf.keras.layers.Dense( + self.layer_2 = keras.layers.Dense( config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2" ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) if isinstance(config.ff_activation, str): self.activation_function = get_tf_activation(config.ff_activation) else: @@ -378,12 +379,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer): self.layer_2.build([None, None, self.config.d_inner]) -class TFXLNetLayer(tf.keras.layers.Layer): +class TFXLNetLayer(keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn") self.ff = TFXLNetFeedForward(config, name="ff") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def call( self, @@ -433,7 +434,7 @@ class TFXLNetLayer(tf.keras.layers.Layer): self.ff.build(None) -class TFXLNetLMHead(tf.keras.layers.Layer): +class TFXLNetLMHead(keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.config = config @@ -466,7 +467,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer): @keras_serializable -class TFXLNetMainLayer(tf.keras.layers.Layer): +class TFXLNetMainLayer(keras.layers.Layer): config_class = XLNetConfig def __init__(self, config, **kwargs): @@ -492,7 +493,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding" ) self.layer = [TFXLNetLayer(config, name=f"layer_._{i}") for i in range(config.n_layer)] - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.use_mems_eval = config.use_mems_eval self.use_mems_train = config.use_mems_train @@ -1059,7 +1060,7 @@ XLNET_START_DOCSTRING = r""" library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -1415,7 +1416,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif self.sequence_summary = TFSequenceSummary( config, initializer_range=config.initializer_range, name="sequence_summary" ) - self.logits_proj = tf.keras.layers.Dense( + self.logits_proj = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) self.config = config @@ -1516,7 +1517,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): self.sequence_summary = TFSequenceSummary( config, initializer_range=config.initializer_range, name="sequence_summary" ) - self.logits_proj = tf.keras.layers.Dense( + self.logits_proj = keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) self.config = config @@ -1630,7 +1631,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.config = config @@ -1720,7 +1721,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) self.config = config diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index a4a84b06f8..4c8d3bba82 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -22,12 +22,14 @@ import tensorflow as tf try: + from tf_keras.optimizers.legacy import Adam +except (ImportError, ModuleNotFoundError): from tensorflow.keras.optimizers.legacy import Adam -except ImportError: - from tensorflow.keras.optimizers import Adam + +from .modeling_tf_utils import keras -class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): +class WarmUp(keras.optimizers.schedules.LearningRateSchedule): """ Applies a warmup schedule on a given learning rate decay schedule. @@ -131,7 +133,7 @@ def create_optimizer( applied to all parameters except bias and layer norm parameters. """ # Implements linear decay of the learning rate. - lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( + lr_schedule = keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=init_lr, decay_steps=num_train_steps - num_warmup_steps, end_learning_rate=init_lr * min_lr_ratio, @@ -156,7 +158,7 @@ def create_optimizer( include_in_weight_decay=include_in_weight_decay, ) else: - optimizer = tf.keras.optimizers.Adam( + optimizer = keras.optimizers.Adam( learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, @@ -180,7 +182,7 @@ class AdamWeightDecay(Adam): to adding the square of the weights to the loss with plain (non-momentum) SGD. Args: - learning_rate (`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 0.001): + learning_rate (`Union[float, keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 0.001): The learning rate to use or a schedule. beta_1 (`float`, *optional*, defaults to 0.9): The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates. @@ -210,7 +212,7 @@ class AdamWeightDecay(Adam): def __init__( self, - learning_rate: Union[float, tf.keras.optimizers.schedules.LearningRateSchedule] = 0.001, + learning_rate: Union[float, keras.optimizers.schedules.LearningRateSchedule] = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-7, diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 5a13cc551b..4498f4cb79 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -25,6 +25,8 @@ logger = logging.get_logger(__name__) if is_tf_available(): import tensorflow as tf + from .modeling_tf_utils import keras + @dataclass class TFTrainingArguments(TrainingArguments): @@ -195,7 +197,7 @@ class TFTrainingArguments(TrainingArguments): # Set to float16 at first if self.fp16: - tf.keras.mixed_precision.set_global_policy("mixed_float16") + keras.mixed_precision.set_global_policy("mixed_float16") if self.no_cuda: strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") @@ -216,7 +218,7 @@ class TFTrainingArguments(TrainingArguments): if tpu: # Set to bfloat16 in case of TPU if self.fp16: - tf.keras.mixed_precision.set_global_policy("mixed_bfloat16") + keras.mixed_precision.set_global_policy("mixed_bfloat16") tf.config.experimental_connect_to_cluster(tpu) tf.tpu.experimental.initialize_tpu_system(tpu) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 8bcbef24f8..fdfa32726c 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -50,6 +50,7 @@ from ...modeling_tf_utils import ( TFSequenceSummary, TFTokenClassificationLoss, get_initializer, + keras, keras_serializable, unpack_inputs, ) @@ -70,7 +71,7 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Embeddings(keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): @@ -81,8 +82,8 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): @@ -149,7 +150,7 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}SelfAttention(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) @@ -164,16 +165,16 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) self.all_head_size = self.num_attention_heads * self.attention_head_size self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.Dense( + self.query = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.Dense( + self.key = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.Dense( + self.value = keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) - self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder @@ -267,15 +268,15 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}SelfOutput(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -286,7 +287,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Attention(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) @@ -327,11 +328,11 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Intermediate(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -348,15 +349,15 @@ class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Output(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -367,7 +368,7 @@ class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Layer(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) @@ -454,7 +455,7 @@ class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Encoder(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.config = config @@ -524,11 +525,11 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense", @@ -539,7 +540,7 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -550,8 +551,8 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(keras.layers.Layer): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size @@ -568,7 +569,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay super().build(input_shape) - def get_output_embeddings(self) -> tf.keras.layers.Layer: + def get_output_embeddings(self) -> keras.layers.Layer: return self.input_embeddings def set_output_embeddings(self, value: tf.Variable): @@ -594,8 +595,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}} -class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): - def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): +class TF{{cookiecutter.camelcase_modelname}}MLMHead(keras.layers.Layer): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions") @@ -607,7 +608,7 @@ class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): @keras_serializable -class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}MainLayer(keras.layers.Layer): config_class = {{cookiecutter.camelcase_modelname}}Config def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, add_pooling_layer: bool = True, **kwargs): @@ -620,7 +621,7 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self) -> tf.keras.layers.Layer: + def get_input_embeddings(self) -> keras.layers.Layer: return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings @@ -811,7 +812,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -991,7 +992,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions @unpack_inputs @@ -1064,7 +1065,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") - def get_lm_head(self) -> tf.keras.layers.Layer: + def get_lm_head(self) -> keras.layers.Layer: return self.mlm.predictions def prepare_inputs_for_generation(self, inputs, past_key_values=None, attention_mask=None, **model_kwargs): @@ -1166,17 +1167,17 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca -class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.dense = tf.keras.layers.Dense( + self.dense = keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.out_proj = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) @@ -1277,7 +1278,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c self.sequence_summary = TFSequenceSummary( config, config.initializer_range, name="sequence_summary" ) - self.classifier = tf.keras.layers.Dense( + self.classifier = keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @@ -1383,8 +1384,8 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut self.num_labels = config.num_labels self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @@ -1456,7 +1457,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte self.num_labels = config.num_labels self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.qa_outputs = tf.keras.layers.Dense( + self.qa_outputs = keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @@ -1623,7 +1624,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): return (one_cst - expanded_mask) * LARGE_NEGATIVE -class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(tf.keras.layers.Embedding): +class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(keras.layers.Embedding): """ This module learns positional embeddings up to a fixed maximum size. """ @@ -1639,7 +1640,7 @@ class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(tf.keras. return super().call(tf.cast(position_ids, dtype=tf.int32)) -class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Attention(keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" def __init__( @@ -1655,16 +1656,16 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): self.embed_dim = embed_dim self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) + self.dropout = keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.scaling = self.head_dim ** -0.5 self.is_decoder = is_decoder - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) @@ -1776,20 +1777,20 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): return attn_output, attn_weights, past_key_value -class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" ) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False): """ @@ -1826,7 +1827,7 @@ class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer): return hidden_states, self_attn_weights -class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.embed_dim = config.d_model @@ -1837,11 +1838,11 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): name="self_attn", is_decoder=True, ) - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.activation_fn = get_tf_activation(config.activation_function) - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.encoder_attn = TF{{cookiecutter.camelcase_modelname}}Attention( self.embed_dim, config.decoder_attention_heads, @@ -1849,10 +1850,10 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): name="encoder_attn", is_decoder=True, ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") def call( self, @@ -1944,7 +1945,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) - This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -2062,7 +2063,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): @keras_serializable -class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Encoder(keras.layers.Layer): config_class = {{cookiecutter.camelcase_modelname}}Config """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a @@ -2072,10 +2073,10 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): config: {{cookiecutter.camelcase_modelname}}Config """ - def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) self.layerdrop = config.encoder_layerdrop self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings @@ -2088,7 +2089,7 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): name="embed_positions", ) self.layers = [TF{{cookiecutter.camelcase_modelname}}EncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") def get_embed_tokens(self): return self.embed_tokens @@ -2215,7 +2216,7 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): @keras_serializable -class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}Decoder(keras.layers.Layer): config_class = {{cookiecutter.camelcase_modelname}}Config """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TF{{cookiecutter.camelcase_modelname}}DecoderLayer`] @@ -2225,7 +2226,7 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): embed_tokens: output embedding """ - def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): super().__init__(**kwargs) self.config = config self.padding_idx = config.pad_token_id @@ -2238,9 +2239,9 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): ) self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.layers = [TF{{cookiecutter.camelcase_modelname}}DecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = keras.layers.Dropout(config.dropout) def get_embed_tokens(self): return self.embed_tokens @@ -2458,17 +2459,17 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): @keras_serializable -class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}MainLayer(keras.layers.Layer): config_class = {{cookiecutter.camelcase_modelname}}Config def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.config = config - self.shared = tf.keras.layers.Embedding( + self.shared = keras.layers.Embedding( input_dim=config.vocab_size, output_dim=config.d_model, - embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), name="model.shared" ) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) @@ -2637,9 +2638,9 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer -class BiasLayer(tf.keras.layers.Layer): +class BiasLayer(keras.layers.Layer): """ - Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, so all weights have to be registered in a layer. """ @@ -2811,9 +2812,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec def hf_compute_loss(self, labels, logits): """CrossEntropyLoss that ignores pad tokens""" - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + loss_fn = keras.losses.SparseCategoricalCrossentropy( from_logits=True, - reduction=tf.keras.losses.Reduction.NONE, + reduction=keras.losses.Reduction.NONE, ) melted_labels = tf.reshape(labels, (-1,)) active_loss = tf.not_equal(melted_labels, self.config.pad_token_id) diff --git a/tests/generation/test_tf_utils.py b/tests/generation/test_tf_utils.py index 186e0c8d43..bcb7c63924 100644 --- a/tests/generation/test_tf_utils.py +++ b/tests/generation/test_tf_utils.py @@ -43,6 +43,7 @@ if is_tf_available(): TFMinLengthLogitsProcessor, tf_top_k_top_p_filtering, ) + from transformers.modeling_tf_utils import keras if is_tensorflow_text_available(): import tensorflow_text as text @@ -254,7 +255,7 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests # file needed to load the TF tokenizer hf_hub_download(repo_id="google/flan-t5-small", filename="spiece.model", local_dir=tmp_dir) - class CompleteSentenceTransformer(tf.keras.layers.Layer): + class CompleteSentenceTransformer(keras.layers.Layer): def __init__(self): super().__init__() self.tokenizer = text.SentencepieceTokenizer( @@ -271,9 +272,9 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests return self.tokenizer.detokenize(outputs) complete_model = CompleteSentenceTransformer() - inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name="inputs") + inputs = keras.layers.Input(shape=(1,), dtype=tf.string, name="inputs") outputs = complete_model(inputs) - keras_model = tf.keras.Model(inputs, outputs) + keras_model = keras.Model(inputs, outputs) keras_model.save(tmp_dir) def test_eos_token_id_int_and_list_top_k_top_sampling(self): diff --git a/tests/models/bert/test_tokenization_bert_tf.py b/tests/models/bert/test_tokenization_bert_tf.py index e5f736ede7..16ac1d4867 100644 --- a/tests/models/bert/test_tokenization_bert_tf.py +++ b/tests/models/bert/test_tokenization_bert_tf.py @@ -10,6 +10,8 @@ from transformers.testing_utils import require_tensorflow_text, require_tf, slow if is_tf_available(): import tensorflow as tf + from transformers.modeling_tf_utils import keras + if is_tensorflow_text_available(): from transformers.models.bert import TFBertTokenizer @@ -18,8 +20,9 @@ TOKENIZER_CHECKPOINTS = ["bert-base-uncased", "bert-base-cased"] TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only" if is_tf_available(): + from transformers.modeling_tf_utils import keras - class ModelToSave(tf.keras.Model): + class ModelToSave(keras.Model): def __init__(self, tokenizer): super().__init__() self.tokenizer = tokenizer diff --git a/tests/models/blip/test_modeling_tf_blip.py b/tests/models/blip/test_modeling_tf_blip.py index ac6f8e3a67..11e18403dc 100644 --- a/tests/models/blip/test_modeling_tf_blip.py +++ b/tests/models/blip/test_modeling_tf_blip.py @@ -44,6 +44,7 @@ if is_tf_available(): TFBlipTextModel, TFBlipVisionModel, ) + from transformers.modeling_tf_utils import keras from transformers.models.blip.modeling_tf_blip import TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST @@ -172,9 +173,9 @@ class TFBlipVisionModelTest(TFModelTesterMixin, unittest.TestCase): for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/clip/test_modeling_tf_clip.py b/tests/models/clip/test_modeling_tf_clip.py index 897b89d5c3..8feeeebd0d 100644 --- a/tests/models/clip/test_modeling_tf_clip.py +++ b/tests/models/clip/test_modeling_tf_clip.py @@ -38,6 +38,7 @@ if is_tf_available(): import tensorflow as tf from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings + from transformers.modeling_tf_utils import keras from transformers.models.clip.modeling_tf_clip import TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST @@ -151,9 +152,9 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -283,7 +284,7 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") - model = tf.keras.models.load_model(saved_model_dir) + model = keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) output_hidden_states = outputs["hidden_states"] output_attentions = outputs["attentions"] @@ -443,7 +444,7 @@ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") - model = tf.keras.models.load_model(saved_model_dir) + model = keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) output_hidden_states = outputs["hidden_states"] output_attentions = outputs["attentions"] @@ -565,7 +566,7 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) - and tf.keras.layers.Layer in module_member.__bases__ + and keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) } for main_layer_class in tf_main_layer_classes: @@ -579,17 +580,17 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase main_layer = main_layer_class(config) symbolic_inputs = { - name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } - model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) if "T5" in main_layer_class.__name__: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={ main_layer_class.__name__: main_layer_class, @@ -597,10 +598,10 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase }, ) else: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) - assert isinstance(model, tf.keras.Model) + assert isinstance(model, keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) diff --git a/tests/models/convbert/test_modeling_tf_convbert.py b/tests/models/convbert/test_modeling_tf_convbert.py index 5c5d83de30..a4e458e7d2 100644 --- a/tests/models/convbert/test_modeling_tf_convbert.py +++ b/tests/models/convbert/test_modeling_tf_convbert.py @@ -37,6 +37,7 @@ if is_tf_available(): TFConvBertForTokenClassification, TFConvBertModel, ) + from transformers.modeling_tf_utils import keras class TFConvBertModelTester: @@ -306,7 +307,7 @@ class TFConvBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") - model = tf.keras.models.load_model(saved_model_dir) + model = keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) if self.is_encoder_decoder: diff --git a/tests/models/ctrl/test_modeling_tf_ctrl.py b/tests/models/ctrl/test_modeling_tf_ctrl.py index be080573a9..29a8b6fb6a 100644 --- a/tests/models/ctrl/test_modeling_tf_ctrl.py +++ b/tests/models/ctrl/test_modeling_tf_ctrl.py @@ -29,6 +29,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_tf_available(): import tensorflow as tf + from transformers.modeling_tf_utils import keras from transformers.models.ctrl.modeling_tf_ctrl import ( TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, TFCTRLForSequenceClassification, @@ -226,18 +227,18 @@ class TFCTRLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase for model_class in self.all_model_classes: model = model_class(config) model.build_in_name_scope() # may be needed for the get_bias() call below - assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + assert isinstance(model.get_input_embeddings(), keras.layers.Layer) if model_class in list_lm_models: x = model.get_output_embeddings() - assert isinstance(x, tf.keras.layers.Layer) + assert isinstance(x, keras.layers.Layer) name = model.get_bias() assert isinstance(name, dict) for k, v in name.items(): assert isinstance(v, tf.Variable) elif model_class in list_other_models_with_output_ebd: x = model.get_output_embeddings() - assert isinstance(x, tf.keras.layers.Layer) + assert isinstance(x, keras.layers.Layer) name = model.get_bias() assert name is None else: diff --git a/tests/models/cvt/test_modeling_tf_cvt.py b/tests/models/cvt/test_modeling_tf_cvt.py index ecb672d422..4ec245ad49 100644 --- a/tests/models/cvt/test_modeling_tf_cvt.py +++ b/tests/models/cvt/test_modeling_tf_cvt.py @@ -22,6 +22,7 @@ if is_tf_available(): import tensorflow as tf from transformers import TFCvtForImageClassification, TFCvtModel + from transformers.modeling_tf_utils import keras from transformers.models.cvt.modeling_tf_cvt import TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -191,10 +192,10 @@ class TFCvtModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) @unittest.skip(reason="Get `Failed to determine best cudnn convolution algo.` error after using TF 2.12+cuda 11.8") def test_keras_fit_mixed_precision(self): - policy = tf.keras.mixed_precision.Policy("mixed_float16") - tf.keras.mixed_precision.set_global_policy(policy) + policy = keras.mixed_precision.Policy("mixed_float16") + keras.mixed_precision.set_global_policy(policy) super().test_keras_fit() - tf.keras.mixed_precision.set_global_policy("float32") + keras.mixed_precision.set_global_policy("float32") def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/data2vec/test_modeling_tf_data2vec_vision.py b/tests/models/data2vec/test_modeling_tf_data2vec_vision.py index fa67643440..685a9e4680 100644 --- a/tests/models/data2vec/test_modeling_tf_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_tf_data2vec_vision.py @@ -39,6 +39,7 @@ if is_tf_available(): TFData2VecVisionForSemanticSegmentation, TFData2VecVisionModel, ) + from transformers.modeling_tf_utils import keras from transformers.models.data2vec.modeling_tf_data2vec_vision import ( TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST, ) @@ -216,9 +217,9 @@ class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittes for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -365,7 +366,7 @@ class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittes key: val for key, val in prepared_for_class.items() if key not in label_names } self.assertGreater(len(inputs_minus_labels), 0) - model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) + model.compile(optimizer=keras.optimizers.SGD(0.0), run_eagerly=True) # Make sure the model fits without crashing regardless of where we pass the labels history1 = model.fit( diff --git a/tests/models/deit/test_modeling_tf_deit.py b/tests/models/deit/test_modeling_tf_deit.py index 0e34f35b60..848370a113 100644 --- a/tests/models/deit/test_modeling_tf_deit.py +++ b/tests/models/deit/test_modeling_tf_deit.py @@ -40,6 +40,7 @@ if is_tf_available(): TFDeiTForMaskedImageModeling, TFDeiTModel, ) + from transformers.modeling_tf_utils import keras from transformers.models.deit.modeling_tf_deit import TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -211,9 +212,9 @@ class TFDeiTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Dense)) + self.assertTrue(x is None or isinstance(x, keras.layers.Dense)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/efficientformer/test_modeling_tf_efficientformer.py b/tests/models/efficientformer/test_modeling_tf_efficientformer.py index 059ff1ac12..35cbeb75ae 100644 --- a/tests/models/efficientformer/test_modeling_tf_efficientformer.py +++ b/tests/models/efficientformer/test_modeling_tf_efficientformer.py @@ -37,6 +37,7 @@ if is_tf_available(): TFEfficientFormerForImageClassificationWithTeacher, TFEfficientFormerModel, ) + from transformers.modeling_tf_utils import keras from transformers.models.efficientformer.modeling_tf_efficientformer import ( TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) @@ -355,7 +356,7 @@ class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unitte # These are maximally general inputs for the model, with multiple None dimensions # Hopefully this will catch any conditionals that fail for flexible shapes functional_inputs = { - key: tf.keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key) + key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key) for key, val in model.input_signature.items() if key in model.dummy_inputs } diff --git a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py index c056e16c50..a9d32474c3 100644 --- a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py @@ -509,7 +509,7 @@ class TFEncoderDecoderMixin: tf_outputs = tf_model(tf_inputs_dict) # tf models returned loss is usually a tensor rather than a scalar. - # (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`) + # (see `hf_compute_loss`: it uses `keras.losses.Reduction.NONE`) # Change it here to a scalar to match PyTorch models' loss tf_loss = getattr(tf_outputs, "loss", None) if tf_loss is not None: diff --git a/tests/models/esm/test_modeling_tf_esm.py b/tests/models/esm/test_modeling_tf_esm.py index b687da355a..0e92e352fe 100644 --- a/tests/models/esm/test_modeling_tf_esm.py +++ b/tests/models/esm/test_modeling_tf_esm.py @@ -30,6 +30,7 @@ if is_tf_available(): import numpy import tensorflow as tf + from transformers.modeling_tf_utils import keras from transformers.models.esm.modeling_tf_esm import ( TF_ESM_PRETRAINED_MODEL_ARCHIVE_LIST, TFEsmForMaskedLM, @@ -269,7 +270,7 @@ class TFEsmModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) for model_class in self.all_model_classes: model = model_class(config) - assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + assert isinstance(model.get_input_embeddings(), keras.layers.Layer) if model_class is TFEsmForMaskedLM: # Output embedding test differs from the main test because they're a matrix, not a layer name = model.get_bias() diff --git a/tests/models/gpt2/test_tokenization_gpt2_tf.py b/tests/models/gpt2/test_tokenization_gpt2_tf.py index e92c9e65df..a3eac86fa6 100644 --- a/tests/models/gpt2/test_tokenization_gpt2_tf.py +++ b/tests/models/gpt2/test_tokenization_gpt2_tf.py @@ -10,6 +10,7 @@ from transformers.testing_utils import require_keras_nlp, require_tf, slow if is_tf_available(): import tensorflow as tf + if is_keras_nlp_available(): from transformers.models.gpt2 import TFGPT2Tokenizer diff --git a/tests/models/groupvit/test_modeling_tf_groupvit.py b/tests/models/groupvit/test_modeling_tf_groupvit.py index 1a1a14e301..968d955846 100644 --- a/tests/models/groupvit/test_modeling_tf_groupvit.py +++ b/tests/models/groupvit/test_modeling_tf_groupvit.py @@ -46,6 +46,7 @@ if is_tf_available(): import tensorflow as tf from transformers import TFGroupViTModel, TFGroupViTTextModel, TFGroupViTVisionModel, TFSharedEmbeddings + from transformers.modeling_tf_utils import keras from transformers.models.groupvit.modeling_tf_groupvit import TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -186,9 +187,9 @@ class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase): for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -340,7 +341,7 @@ class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") - model = tf.keras.models.load_model(saved_model_dir) + model = keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) output_hidden_states = outputs["hidden_states"] output_attentions = outputs["attentions"] @@ -505,7 +506,7 @@ class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") - model = tf.keras.models.load_model(saved_model_dir) + model = keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) output_hidden_states = outputs["hidden_states"] output_attentions = outputs["attentions"] @@ -655,7 +656,7 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) - and tf.keras.layers.Layer in module_member.__bases__ + and keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) } for main_layer_class in tf_main_layer_classes: @@ -669,17 +670,17 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test main_layer = main_layer_class(config) symbolic_inputs = { - name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } - model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) if "T5" in main_layer_class.__name__: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={ main_layer_class.__name__: main_layer_class, @@ -687,10 +688,10 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test }, ) else: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) - assert isinstance(model, tf.keras.Model) + assert isinstance(model, keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) diff --git a/tests/models/sam/test_modeling_tf_sam.py b/tests/models/sam/test_modeling_tf_sam.py index 4478815e7c..d742a9b085 100644 --- a/tests/models/sam/test_modeling_tf_sam.py +++ b/tests/models/sam/test_modeling_tf_sam.py @@ -36,6 +36,7 @@ if is_tf_available(): import tensorflow as tf from transformers import SamProcessor, TFSamModel + from transformers.modeling_tf_utils import keras if is_vision_available(): from PIL import Image @@ -322,9 +323,9 @@ class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Dense)) + self.assertTrue(x is None or isinstance(x, keras.layers.Dense)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/swin/test_modeling_tf_swin.py b/tests/models/swin/test_modeling_tf_swin.py index 597643936f..e15ecbc41d 100644 --- a/tests/models/swin/test_modeling_tf_swin.py +++ b/tests/models/swin/test_modeling_tf_swin.py @@ -34,6 +34,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_tf_available(): import tensorflow as tf + from transformers.modeling_tf_utils import keras from transformers.models.swin.modeling_tf_swin import ( TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, TFSwinForImageClassification, @@ -237,9 +238,9 @@ class TFSwinModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), tf.keras.layers.Layer) + self.assertIsInstance(model.get_input_embeddings(), keras.layers.Layer) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Dense)) + self.assertTrue(x is None or isinstance(x, keras.layers.Dense)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py index 4a1e0bfdd0..057df26d30 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py @@ -442,7 +442,7 @@ class TFVisionEncoderDecoderMixin: tf_outputs = tf_model(tf_inputs_dict) # tf models returned loss is usually a tensor rather than a scalar. - # (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`) + # (see `hf_compute_loss`: it uses `keras.losses.Reduction.NONE`) # Change it here to a scalar to match PyTorch models' loss tf_loss = getattr(tf_outputs, "loss", None) if tf_loss is not None: diff --git a/tests/models/vit/test_modeling_tf_vit.py b/tests/models/vit/test_modeling_tf_vit.py index 0db27dfb2e..dee2c8f18c 100644 --- a/tests/models/vit/test_modeling_tf_vit.py +++ b/tests/models/vit/test_modeling_tf_vit.py @@ -33,6 +33,7 @@ if is_tf_available(): import tensorflow as tf from transformers import TFViTForImageClassification, TFViTModel + from transformers.modeling_tf_utils import keras if is_vision_available(): @@ -188,9 +189,9 @@ class TFViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py index 8f6064e016..6a77e95102 100644 --- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_tf_vit_mae.py @@ -41,6 +41,7 @@ if is_tf_available(): import tensorflow as tf from transformers import TFViTMAEForPreTraining, TFViTMAEModel + from transformers.modeling_tf_utils import keras if is_vision_available(): @@ -188,9 +189,9 @@ class TFViTMAEModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCa for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -301,7 +302,7 @@ class TFViTMAEModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCa and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) - and tf.keras.layers.Layer in module_member.__bases__ + and keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) } @@ -314,19 +315,17 @@ class TFViTMAEModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCa main_layer = main_layer_class(config) symbolic_inputs = { - name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } - model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) - model = tf.keras.models.load_model( - filepath, custom_objects={main_layer_class.__name__: main_layer_class} - ) - assert isinstance(model, tf.keras.Model) + model = keras.models.load_model(filepath, custom_objects={main_layer_class.__name__: main_layer_class}) + assert isinstance(model, keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) diff --git a/tests/sagemaker/scripts/tensorflow/run_tf.py b/tests/sagemaker/scripts/tensorflow/run_tf.py index 03f631d266..315fcca898 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf.py @@ -5,10 +5,24 @@ import time import tensorflow as tf from datasets import load_dataset +from packaging.version import parse from transformers import AutoTokenizer, TFAutoModelForSequenceClassification +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -75,9 +89,9 @@ if __name__ == "__main__": ) # fine optimizer and loss - optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate) - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [tf.keras.metrics.SparseCategoricalAccuracy()] + optimizer = keras.optimizers.Adam(learning_rate=args.learning_rate) + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] model.compile(optimizer=optimizer, loss=loss, metrics=metrics) start_train_time = time.time() diff --git a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py index f8f2e4bcf2..324715e12f 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py @@ -9,6 +9,7 @@ from datasets import load_dataset from tqdm import tqdm from transformers import AutoTokenizer, TFAutoModelForSequenceClassification +from transformers.modeling_tf_utils import keras from transformers.utils import is_sagemaker_dp_enabled @@ -135,9 +136,9 @@ if __name__ == "__main__": ) # fine optimizer and loss - optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate) - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [tf.keras.metrics.SparseCategoricalAccuracy()] + optimizer = keras.optimizers.Adam(learning_rate=args.learning_rate) + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] model.compile(optimizer=optimizer, loss=loss, metrics=metrics) # Training diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index e9b63cd1d9..f396875570 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -80,6 +80,7 @@ if is_tf_available(): TFSampleDecoderOnlyOutput, TFSampleEncoderDecoderOutput, ) + from transformers.modeling_tf_utils import keras tf.config.experimental.enable_tensor_float_32_execution(False) @@ -365,7 +366,7 @@ class TFModelTesterMixin: and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) - and tf.keras.layers.Layer in module_member.__bases__ + and keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) } for main_layer_class in tf_main_layer_classes: @@ -379,17 +380,17 @@ class TFModelTesterMixin: main_layer = main_layer_class(config) symbolic_inputs = { - name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } - model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) if "T5" in main_layer_class.__name__: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={ main_layer_class.__name__: main_layer_class, @@ -397,10 +398,10 @@ class TFModelTesterMixin: }, ) else: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) - assert isinstance(model, tf.keras.Model) + assert isinstance(model, keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) @@ -610,7 +611,7 @@ class TFModelTesterMixin: tf_outputs = tf_model(tf_inputs_dict) # tf models returned loss is usually a tensor rather than a scalar. - # (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`) + # (see `hf_compute_loss`: it uses `keras.losses.Reduction.NONE`) # Change it here to a scalar to match PyTorch models' loss tf_loss = getattr(tf_outputs, "loss", None) if tf_loss is not None: @@ -697,7 +698,7 @@ class TFModelTesterMixin: # These are maximally general inputs for the model, with multiple None dimensions # Hopefully this will catch any conditionals that fail for flexible shapes functional_inputs = { - key: tf.keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key) + key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key) for key, val in model.input_signature.items() if key in model.dummy_inputs } @@ -706,7 +707,7 @@ class TFModelTesterMixin: hidden_states = outputs_dict[0] # Compile extended model - functional_model = tf.keras.Model(inputs=functional_inputs, outputs=hidden_states) + functional_model = keras.Model(inputs=functional_inputs, outputs=hidden_states) model_out = functional_model.predict(model.dummy_inputs) # Check we can pass inputs with the Keras API self.assertTrue(model_out is not None) with tempfile.TemporaryDirectory() as tmpdirname: @@ -918,12 +919,12 @@ class TFModelTesterMixin: for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), tf.keras.layers.Layer) + self.assertIsInstance(model.get_input_embeddings(), keras.layers.Layer) legacy_text_in_text_out = model.get_lm_head() is not None if model_class in text_in_text_out_models or legacy_text_in_text_out: out_embeddings = model.get_output_embeddings() - self.assertIsInstance(out_embeddings, tf.keras.layers.Layer) + self.assertIsInstance(out_embeddings, keras.layers.Layer) bias = model.get_bias() if bias is not None: self.assertIsInstance(bias, dict) @@ -931,7 +932,7 @@ class TFModelTesterMixin: self.assertIsInstance(v, tf.Variable) elif model_class in speech_in_text_out_models: out_embeddings = model.get_output_embeddings() - self.assertIsInstance(out_embeddings, tf.keras.layers.Layer) + self.assertIsInstance(out_embeddings, keras.layers.Layer) bias = model.get_bias() self.assertIsNone(bias) else: @@ -1079,14 +1080,14 @@ class TFModelTesterMixin: def test_resize_token_embeddings(self): # TODO (joao): after the embeddings refactor is complete, rework this test so as to rely exclusively on - # tf.keras.layers.Embedding + # keras.layers.Embedding if not self.test_resize_embeddings: return config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): - if isinstance(embedding_layer, tf.keras.layers.Embedding): + if isinstance(embedding_layer, keras.layers.Embedding): # builds the embeddings layer model.build_in_name_scope() return embedding_layer.embeddings @@ -1456,7 +1457,7 @@ class TFModelTesterMixin: ] for accuracy_class in accuracy_classes: if model.__class__.__name__.endswith(accuracy_class): - metrics = [tf.keras.metrics.SparseCategoricalAccuracy()] + metrics = [keras.metrics.SparseCategoricalAccuracy()] break else: metrics = [] @@ -1472,7 +1473,7 @@ class TFModelTesterMixin: model_weights = model.get_weights() # Run eagerly to save some expensive compilation times - model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True, metrics=metrics) + model.compile(optimizer=keras.optimizers.SGD(0.0), run_eagerly=True, metrics=metrics) # Make sure the model fits without crashing regardless of where we pass the labels history1 = model.fit( prepared_for_class, @@ -1557,7 +1558,7 @@ class TFModelTesterMixin: # After testing that the model accepts all int inputs, confirm that its dummies are int32 for key, tensor in model.dummy_inputs.items(): self.assertTrue( - isinstance(tensor, tf.Tensor) or tf.keras.backend.is_keras_tensor(tensor), + isinstance(tensor, tf.Tensor) or keras.backend.is_keras_tensor(tensor), "Dummy inputs should be tf.Tensor!", ) if tensor.dtype.is_integer: diff --git a/tests/test_modeling_tf_utils.py b/tests/test_modeling_tf_utils.py index 293d242f3e..9ab60db781 100644 --- a/tests/test_modeling_tf_utils.py +++ b/tests/test_modeling_tf_utils.py @@ -64,7 +64,7 @@ if is_tf_available(): TFPreTrainedModel, TFRagModel, ) - from transformers.modeling_tf_utils import tf_shard_checkpoint, unpack_inputs + from transformers.modeling_tf_utils import keras, tf_shard_checkpoint, unpack_inputs from transformers.tf_utils import stable_softmax tf.config.experimental.enable_tensor_float_32_execution(False) @@ -282,12 +282,12 @@ class TFModelUtilsTest(unittest.TestCase): def test_shard_checkpoint(self): # This is the model we will use, total size 340,000 bytes. - model = tf.keras.Sequential( + model = keras.Sequential( [ - tf.keras.layers.Dense(200, use_bias=False), # size 80,000 - tf.keras.layers.Dense(200, use_bias=False), # size 160,000 - tf.keras.layers.Dense(100, use_bias=False), # size 80,000 - tf.keras.layers.Dense(50, use_bias=False), # size 20,000 + keras.layers.Dense(200, use_bias=False), # size 80,000 + keras.layers.Dense(200, use_bias=False), # size 160,000 + keras.layers.Dense(100, use_bias=False), # size 80,000 + keras.layers.Dense(50, use_bias=False), # size 20,000 ] ) inputs = tf.zeros((1, 100), dtype=tf.float32) @@ -429,13 +429,13 @@ class TFModelUtilsTest(unittest.TestCase): # Using default signature (default behavior) overrides 'serving_default' with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir, saved_model=True, signatures=None) - model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1") + model_loaded = keras.models.load_model(f"{tmp_dir}/saved_model/1") self.assertTrue("serving_default" in list(model_loaded.signatures.keys())) # Providing custom signature function with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir, saved_model=True, signatures={"custom_signature": serving_fn}) - model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1") + model_loaded = keras.models.load_model(f"{tmp_dir}/saved_model/1") self.assertTrue("custom_signature" in list(model_loaded.signatures.keys())) # Providing multiple custom signature function @@ -445,7 +445,7 @@ class TFModelUtilsTest(unittest.TestCase): saved_model=True, signatures={"custom_signature_1": serving_fn, "custom_signature_2": serving_fn}, ) - model_loaded = tf.keras.models.load_model(f"{tmp_dir}/saved_model/1") + model_loaded = keras.models.load_model(f"{tmp_dir}/saved_model/1") self.assertTrue("custom_signature_1" in list(model_loaded.signatures.keys())) self.assertTrue("custom_signature_2" in list(model_loaded.signatures.keys())) diff --git a/tests/utils/test_modeling_tf_core.py b/tests/utils/test_modeling_tf_core.py index 7ec2198dd1..53f3edede7 100644 --- a/tests/utils/test_modeling_tf_core.py +++ b/tests/utils/test_modeling_tf_core.py @@ -46,6 +46,7 @@ if is_tf_available(): TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TFSharedEmbeddings, ) + from transformers.modeling_tf_utils import keras if _tf_gpu_memory_limit is not None: gpus = tf.config.list_physical_devices("GPU") @@ -169,7 +170,7 @@ class TFCoreModelTesterMixin: self.assertGreater(len(inputs_minus_labels), 0) # Make sure it works with XLA! - model.compile(optimizer=tf.keras.optimizers.SGD(0.0), jit_compile=True) + model.compile(optimizer=keras.optimizers.SGD(0.0), jit_compile=True) # Make sure the model fits without crashing regardless of where we pass the labels history = model.fit( prepared_for_class, @@ -186,7 +187,7 @@ class TFCoreModelTesterMixin: # Now test it with separate labels, to make sure that path works in XLA too. model = model_class(config) - model.compile(optimizer=tf.keras.optimizers.SGD(0.0), jit_compile=True) + model.compile(optimizer=keras.optimizers.SGD(0.0), jit_compile=True) history = model.fit( inputs_minus_labels, labels, @@ -234,7 +235,7 @@ class TFCoreModelTesterMixin: with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") - model = tf.keras.models.load_model(saved_model_dir) + model = keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) if self.is_encoder_decoder: @@ -264,7 +265,7 @@ class TFCoreModelTesterMixin: @slow def test_mixed_precision(self): - tf.keras.mixed_precision.set_global_policy("mixed_float16") + keras.mixed_precision.set_global_policy("mixed_float16") # try/finally block to ensure subsequent tests run in float32 try: @@ -276,7 +277,7 @@ class TFCoreModelTesterMixin: self.assertIsNotNone(outputs) finally: - tf.keras.mixed_precision.set_global_policy("float32") + keras.mixed_precision.set_global_policy("float32") @slow def test_train_pipeline_custom_model(self): @@ -296,7 +297,7 @@ class TFCoreModelTesterMixin: if module_member_name.endswith("MainLayer") for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) - and tf.keras.layers.Layer in module_member.__bases__ + and keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) } @@ -311,7 +312,7 @@ class TFCoreModelTesterMixin: main_layer = main_layer_class(config) symbolic_inputs = { - name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } if hasattr(self.model_tester, "num_labels"): @@ -324,8 +325,8 @@ class TFCoreModelTesterMixin: ).batch(1) hidden_states = main_layer(symbolic_inputs)[0] - outputs = tf.keras.layers.Dense(num_labels, activation="softmax", name="outputs")(hidden_states) - model = tf.keras.models.Model(inputs=symbolic_inputs, outputs=[outputs]) + outputs = keras.layers.Dense(num_labels, activation="softmax", name="outputs")(hidden_states) + model = keras.models.Model(inputs=symbolic_inputs, outputs=[outputs]) model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"]) model.fit(X, epochs=1) @@ -334,7 +335,7 @@ class TFCoreModelTesterMixin: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) if "T5" in main_layer_class.__name__: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={ main_layer_class.__name__: main_layer_class, @@ -342,10 +343,10 @@ class TFCoreModelTesterMixin: }, ) else: - model = tf.keras.models.load_model( + model = keras.models.load_model( filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) - assert isinstance(model, tf.keras.Model) + assert isinstance(model, keras.Model) model(inputs_dict) @slow