From 14042d560ff85a0749610d81669faa0409e6e73b Mon Sep 17 00:00:00 2001 From: Julien Plu Date: Wed, 20 Jan 2021 12:08:12 +0100 Subject: [PATCH] New TF embeddings (cleaner and faster) (#9418) * Create new embeddings + add to BERT * Add Albert * Add DistilBert * Add Albert + Electra + Funnel * Add Longformer + Lxmert * Add last models * Apply style * Update the template * Remove unused imports * Rename attribute * Import embeddings in their own model file * Replace word_embeddings per weight * fix naming * Fix Albert * Fix Albert * Fix Longformer * Fix Lxmert Mobilebert and MPNet * Fix copy * Fix template * Update the get weights function * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/electra/modeling_tf_electra.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * address Sylvain's comments Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/modeling_tf_utils.py | 155 +++++++-- .../models/albert/modeling_tf_albert.py | 272 ++++++++++------ .../models/bert/modeling_tf_bert.py | 277 +++++++++------- .../distilbert/modeling_tf_distilbert.py | 201 +++++++----- .../models/electra/modeling_tf_electra.py | 274 +++++++++------- .../models/funnel/modeling_tf_funnel.py | 147 ++++----- .../longformer/modeling_tf_longformer.py | 293 ++++++++++------- .../models/lxmert/modeling_tf_lxmert.py | 267 +++++++++------ .../mobilebert/modeling_tf_mobilebert.py | 245 ++++++++------ .../models/mpnet/modeling_tf_mpnet.py | 267 ++++++++------- .../models/roberta/modeling_tf_roberta.py | 305 +++++++++++------- ...tf_{{cookiecutter.lowercase_modelname}}.py | 273 ++++++++++------ tests/test_modeling_tf_common.py | 69 ++-- 13 files changed, 1843 insertions(+), 1202 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 866630ed1b..6c8b698e87 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -809,25 +809,29 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): return model_embeds - def _get_word_embedding_weight(self, embedding_layer): - if hasattr(embedding_layer, "word_embeddings"): - return embedding_layer.word_embeddings - elif hasattr(embedding_layer, "weight"): - return embedding_layer.weight - elif hasattr(embedding_layer, "decoder"): - return embedding_layer.decoder - else: - # Here we build the word embeddings weights if not exists. - # And then we retry to get the attribute once built. - self(self.dummy_inputs) - if hasattr(embedding_layer, "word_embeddings"): - return embedding_layer.word_embeddings - elif hasattr(embedding_layer, "weight"): - return embedding_layer.weight - elif hasattr(embedding_layer, "decoder"): - return embedding_layer.decoder - else: - return None + def _get_word_embedding_weight(model, embedding_layer): + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + # The reason why the attributes don't exist might be + # because the model is not built, so retry getting + # the argument after building the model + model(model.dummy_inputs) + + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + return None def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self._get_word_embedding_weight(self.get_input_embeddings()) @@ -1319,6 +1323,119 @@ class TFConv1D(tf.keras.layers.Layer): return x +class WordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.word_embeddings = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.word_embeddings, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class TokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class PositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + class TFSharedEmbeddings(tf.keras.layers.Layer): r""" Construct shared token embeddings. diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 2cbca6e3e0..2c96a3f597 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -73,124 +73,178 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFAlbertWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings +class TFAlbertPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + class TFAlbertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size - self.embedding_size = config.embedding_size - self.initializer_range = config.initializer_range - self.max_position_embeddings = config.max_position_embeddings - self.type_vocab_size = config.type_vocab_size - self.layer_norm_eps = config.layer_norm_eps - self.hidden_dropout_prob = config.hidden_dropout_prob - - self.position_embeddings = tf.keras.layers.Embedding( - self.max_position_embeddings, - self.embedding_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFAlbertWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFAlbertPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - self.type_vocab_size, - self.embedding_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFAlbertTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) + self.embeddings_sum = tf.keras.layers.Add() + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(self.hidden_dropout_prob) - - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.embedding_size], - initializer=get_initializer(self.initializer_range), - ) - super().build(input_shape) - - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear" + Applies embedding based on inputs tensor. Returns: - outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size] - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] + inputs_embeds = self.word_embeddings(input_ids=input_ids) - seq_length = input_shape[1] - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) + if position_ids is None: + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) - return embeddings + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer - - Args: - inputs: A float32 tensor with shape [batch_size, length, embedding_size - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.embedding_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings class TFAlbertSelfOutput(tf.keras.layers.Layer): @@ -446,8 +500,9 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel): class TFAlbertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size + self.vocab_size = config.vocab_size + self.embedding_size = config.embedding_size self.dense = tf.keras.layers.Dense( config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -474,7 +529,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): return self.decoder def set_output_embeddings(self, value): - self.decoder.word_embeddings = value + self.decoder.weight = value self.decoder.vocab_size = shape_list(value)[0] def get_bias(self): @@ -486,10 +541,15 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): self.vocab_size = shape_list(value["bias"])[0] def call(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.activation(inputs=hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias) + return hidden_states @@ -516,11 +576,11 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ) def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -844,7 +904,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel): self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") - self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") + self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions") self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") def get_lm_head(self): @@ -964,7 +1024,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") - self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") + self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions") def get_lm_head(self): return self.predictions diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 366c266948..d1314b17b4 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -121,124 +121,174 @@ class TFBertPreTrainingLoss: return masked_lm_loss + next_sentence_loss +class TFBertWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class TFBertTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class TFBertPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFBertWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFBertPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFBertTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + inputs_embeds = self.word_embeddings(input_ids=input_ids) if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) + if position_ids is None: + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) - position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) - token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size]. - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings class TFBertSelfAttention(tf.keras.layers.Layer): @@ -251,8 +301,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.query = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), @@ -318,9 +368,9 @@ class TFBertSelfOutput(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size + self.all_head_size = config.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( equation="abcd,cde->abe", output_shape=(None, self.all_head_size), @@ -516,6 +566,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.transform = TFBertPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is @@ -531,7 +583,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): return self.input_embeddings def set_output_embeddings(self, value): - self.input_embeddings.word_embeddings = value + self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] def get_bias(self): @@ -542,9 +594,12 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): self.vocab_size = shape_list(value["bias"])[0] def call(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.input_embeddings(hidden_states, mode="linear") - hidden_states = hidden_states + self.bias + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) return hidden_states @@ -583,21 +638,17 @@ class TFBertMainLayer(tf.keras.layers.Layer): super().__init__(**kwargs) self.config = config - self.num_hidden_layers = config.num_hidden_layers - self.initializer_range = config.initializer_range - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.return_dict = config.use_return_dict + self.embeddings = TFBertEmbeddings(config, name="embeddings") self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -682,7 +733,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): if inputs["head_mask"] is not None: raise NotImplementedError else: - inputs["head_mask"] = [None] * self.num_hidden_layers + inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( embedding_output, @@ -931,7 +982,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") def get_lm_head(self): return self.mlm.predictions @@ -1055,7 +1106,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ) self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") def get_lm_head(self): return self.mlm.predictions @@ -1158,7 +1209,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") def get_lm_head(self): return self.mlm.predictions diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 5defba021f..64786f3ed9 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -67,104 +67,128 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFDistilBertWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings +class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + class TFEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.dim, - embeddings_initializer=get_initializer(config.initializer_range), + + self.word_embeddings = TFDistilBertWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.dim, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFDistilBertPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.dim, + initializer_range=config.initializer_range, name="position_embeddings", ) - + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.dropout) + self.dropout = tf.keras.layers.Dropout(rate=config.dropout) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range) - ) - super().build(input_shape) - - def call(self, input_ids=None, position_ids=None, inputs_embeds=None, mode="embedding", training=False): + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 - """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, inputs_embeds, training=False): - """ - Parameters: - input_ids: tf.Tensor(bs, max_seq_length) The token ids to embed. - - Returns: - tf.Tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - seq_length = shape_list(input_ids)[1] - else: - seq_length = shape_list(inputs_embeds)[1] + inputs_embeds = self.word_embeddings(input_ids=input_ids) if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - position_embeddings = tf.cast( - self.position_embeddings(position_ids), inputs_embeds.dtype - ) # (bs, max_seq_length, dim) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) - embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) - embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - - x = tf.reshape(inputs, [-1, self.dim]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings class TFMultiHeadSelfAttention(tf.keras.layers.Layer): @@ -397,11 +421,11 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): self.transformer = TFTransformer(config, name="transformer") # Encoder def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = value.shape[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = value.shape[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError @@ -636,7 +660,9 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): class TFDistilBertLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.dim = config.dim # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. @@ -644,13 +670,14 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) def get_output_embeddings(self): return self.input_embeddings def set_output_embeddings(self, value): - self.input_embeddings.word_embeddings = value + self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] def get_bias(self): @@ -661,8 +688,12 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): self.vocab_size = shape_list(value["bias"])[0] def call(self, hidden_states): - hidden_states = self.input_embeddings(hidden_states, mode="linear") - hidden_states = hidden_states + self.bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + return hidden_states @@ -681,7 +712,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel ) self.act = get_tf_activation("gelu") self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") - self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") + self.vocab_projector = TFDistilBertLMHead( + config, self.distilbert.embeddings.word_embeddings, name="vocab_projector" + ) def get_lm_head(self): return self.vocab_projector diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 8923594e0e..1fc725162c 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -70,6 +70,122 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFElectraWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings +class TFElectraPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra class TFElectraSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -81,8 +197,8 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.query = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), @@ -138,7 +254,7 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput class TFElectraSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -149,9 +265,9 @@ class TFElectraSelfOutput(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size + self.all_head_size = config.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( equation="abcd,cde->abe", output_shape=(None, self.all_head_size), @@ -331,120 +447,56 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size - self.embedding_size = config.embedding_size - self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.embedding_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFElectraWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFElectraPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.embedding_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFElectraTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.embedding_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings.call with Albert->Electra + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings._embedding - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + inputs_embeds = self.word_embeddings(input_ids=input_ids) if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) + if position_ids is None: + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) - position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) - token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.embedding_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): @@ -508,11 +560,11 @@ class TFElectraMainLayer(tf.keras.layers.Layer): self.config = config def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -903,6 +955,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.embedding_size = config.embedding_size self.input_embeddings = input_embeddings def build(self, input_shape): @@ -914,7 +967,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): return self.input_embeddings def set_output_embeddings(self, value): - self.input_embeddings.word_embeddings = value + self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] def get_bias(self): @@ -924,9 +977,12 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states, training=False): - hidden_states = self.input_embeddings(hidden_states, mode="linear") - hidden_states = hidden_states + self.bias + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) return hidden_states @@ -953,7 +1009,9 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos else: self.activation = config.hidden_act - self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") + self.generator_lm_head = TFElectraMaskedLMHead( + config, self.electra.embeddings.word_embeddings, name="generator_lm_head" + ) def get_lm_head(self): return self.generator_lm_head diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index e3319fe36b..819b553d3f 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -74,89 +74,78 @@ TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [ INF = 1e6 -class TFFunnelEmbeddings(tf.keras.layers.Layer): - """Construct the embeddings from word embeddings.""" - - def __init__(self, config, **kwargs): +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFFunnelWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - super().build(input_shape) + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) - def call( - self, - input_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): - """ - Get token embeddings of inputs + super().build(input_shape=input_shape) - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear" + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() - Returns: - outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size] + return dict(list(base_config.items()) + list(config.items())) - Raises: - ValueError: if mode is not valid. + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 - """ - if mode == "embedding": - return self._embedding(input_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" - assert not (input_ids is None and inputs_embeds is None) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - - embeddings = self.layer_norm(inputs_embeds) - embeddings = self.dropout(embeddings, training=training) + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) return embeddings - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size +class TFFunnelEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.word_embeddings = TFFunnelWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout) + + def call(self, input_ids=None, inputs_embeds=None, training=False): + """ + Applies embedding based on inputs tensor. Returns: - float32 tensor with shape [batch_size, length, vocab_size]. + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + assert not (input_ids is None and inputs_embeds is None) + assert not (input_ids is not None and inputs_embeds is not None) - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids=input_ids) + + final_embeddings = self.LayerNorm(inputs=inputs_embeds) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings class TFFunnelAttentionStructure: @@ -784,11 +773,11 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer): self.encoder = TFFunnelEncoder(config, name="encoder") def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -870,11 +859,11 @@ class TFFunnelMainLayer(tf.keras.layers.Layer): self.decoder = TFFunnelDecoder(config, name="decoder") def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -987,17 +976,19 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) def get_output_embeddings(self): return self.input_embeddings def set_output_embeddings(self, value): - self.input_embeddings.word_embeddings = value + self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] def get_bias(self): @@ -1008,8 +999,12 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer): self.vocab_size = shape_list(value["bias"])[0] def call(self, hidden_states, training=False): - hidden_states = self.input_embeddings(hidden_states, mode="linear") - hidden_states = hidden_states + self.bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + return hidden_states @@ -1362,7 +1357,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss) super().__init__(config, *inputs, **kwargs) self.funnel = TFFunnelMainLayer(config, name="funnel") - self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head") + self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings.word_embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index eb38235fd3..71fdfc150d 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -415,14 +415,135 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se return attention_mask -# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFLongformerWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class TFLongformerPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) + embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Longformer class TFLongformerLMHead(tf.keras.layers.Layer): - """Roberta Head for masked language modeling.""" + """Longformer Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -442,7 +563,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): return self.decoder def set_output_embeddings(self, value): - self.decoder.word_embeddings = value + self.decoder.weight = value self.decoder.vocab_size = shape_list(value)[0] def get_bias(self): @@ -458,11 +579,16 @@ class TFLongformerLMHead(tf.keras.layers.Layer): hidden_states = self.layer_norm(hidden_states) # project back to size of vocabulary with bias - hidden_states = self.decoder(hidden_states, mode="linear") + self.bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) return hidden_states +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->Longformer class TFLongformerEmbeddings(tf.keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. @@ -472,39 +598,27 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.padding_idx = 1 - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFLongformerWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFLongformerPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFLongformerTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def create_position_ids_from_input_ids(self, input_ids): """ @@ -516,14 +630,16 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): Returns: tf.Tensor """ - input_ids_shape = shape_list(input_ids) + input_ids_shape = shape_list(tensor=input_ids) # multiple choice has 3 dimensions if len(input_ids_shape) == 3: - input_ids = tf.reshape(input_ids, (input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])) + input_ids = tf.reshape( + tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) + ) - mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=tf.int32) - incremental_indices = tf.math.cumsum(mask, axis=1) * mask + mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -536,96 +652,41 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): Returns: tf.Tensor """ - seq_length = shape_list(inputs_embeds)[1] - position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] + batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2] + position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :] - return position_ids + return tf.tile(input=position_ids, multiples=(batch_size, 1)) - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids=input_ids) + + if token_type_ids is None: + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) + if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. - position_ids = self.create_position_ids_from_input_ids(input_ids) + position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids) else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds) - if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] + position_embeds = self.position_embeddings(position_ids=position_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - seq_length = input_shape[1] - - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] - - if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) - - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - - position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) - token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) - - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate @@ -1613,11 +1674,11 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): self.pooler = TFLongformerPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -2053,7 +2114,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel super().__init__(config, *inputs, **kwargs) self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer") - self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings, name="lm_head") + self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings.word_embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index cc5d93aa95..16b72f2466 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -177,112 +177,173 @@ class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer): return output +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFLxmertWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings +class TFLxmertPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + class TFLxmertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFLxmertWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFLxmertPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFLxmertTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - super().build(input_shape) - - def call(self, inputs, mode="embedding", training=False): + def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(inputs, training=training) - elif mode == "linear": - return self._linear(inputs) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, inputs, training=False): - """Applies embedding based on inputs tensor.""" - input_ids, token_type_ids, inputs_embeds = inputs + assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] + inputs_embeds = self.word_embeddings(input_ids=input_ids) - seq_length = input_shape[1] - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings class TFLxmertAttention(tf.keras.layers.Layer): @@ -703,11 +764,11 @@ class TFLxmertMainLayer(tf.keras.layers.Layer): self.config = config def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError @@ -787,7 +848,7 @@ class TFLxmertMainLayer(tf.keras.layers.Layer): # Positional Word Embeddings embedding_output = self.embeddings( - [inputs["input_ids"], inputs["token_type_ids"], inputs["inputs_embeds"]], training=inputs["training"] + inputs["input_ids"], inputs["token_type_ids"], inputs["inputs_embeds"], training=inputs["training"] ) # Run Lxmert encoder @@ -1066,31 +1127,38 @@ class TFLxmertPooler(tf.keras.layers.Layer): return pooled_output +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( - config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) + if isinstance(config.hidden_act, str): self.transform_act_fn = get_tf_activation(config.hidden_act) else: self.transform_act_fn = config.hidden_act + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) + return hidden_states +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert class TFLxmertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.transform = TFLxmertPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is @@ -1099,13 +1167,14 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) def get_output_embeddings(self): return self.input_embeddings def set_output_embeddings(self, value): - self.input_embeddings.word_embeddings = value + self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] def get_bias(self): @@ -1116,12 +1185,17 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): self.vocab_size = shape_list(value["bias"])[0] def call(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.input_embeddings(hidden_states, mode="linear") - hidden_states = hidden_states + self.bias + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + return hidden_states +# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert class TFLxmertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) @@ -1130,6 +1204,7 @@ class TFLxmertMLMHead(tf.keras.layers.Layer): def call(self, sequence_output): prediction_scores = self.predictions(sequence_output) + return prediction_scores @@ -1229,7 +1304,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel): self.lxmert = TFLxmertMainLayer(config, name="lxmert") # Pre-training heads - self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings, name="cls") + self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings.word_embeddings, name="cls") if self.task_obj_predict: self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head") if self.task_qa: diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 5a97b331d3..4035151405 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -107,30 +107,150 @@ class TFNoNorm(tf.keras.layers.Layer): NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm} +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFMobileBertWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings +class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + class TFMobileBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) + self.trigram_input = config.trigram_input self.embedding_size = config.embedding_size - self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFMobileBertWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFMobileBertPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFMobileBertTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - + self.embeddings_sum = tf.keras.layers.Add() self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation") # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load @@ -138,71 +258,23 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): self.LayerNorm = NORM2FN[config.normalization_type]( config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.embedding_size], - initializer=get_initializer(self.initializer_range), - ) - super().build(input_shape) - - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] + inputs_embeds = self.word_embeddings(input_ids=input_ids) - seq_length = input_shape[1] - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) - - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) if self.trigram_input: # From the paper MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited @@ -224,32 +296,17 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): if self.trigram_input or self.embedding_size != self.hidden_size: inputs_embeds = self.embedding_transformation(inputs_embeds) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) + if position_ids is None: + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings class TFMobileBertSelfAttention(tf.keras.layers.Layer): @@ -715,11 +772,11 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): self.pooler = TFMobileBertPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 90cba086d5..e029acd2db 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -86,6 +86,86 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel): return self.serving_output(output) +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFMPNetWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings +class TFMPNetPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) + embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + class TFMPNetEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position embeddings.""" @@ -93,136 +173,84 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.padding_idx = 1 - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFMPNetWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFMPNetPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer""" - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def create_position_ids_from_input_ids(self, x): + def create_position_ids_from_input_ids(self, input_ids): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding - symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param tf.Tensor x: :return - tf.Tensor: - """ - mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) - incremental_indicies = tf.math.cumsum(mask, axis=1) * mask + symbols are ignored. This is modified from fairseq's `utils.make_positions`. - return incremental_indicies + self.padding_idx + Args: + input_ids: tf.Tensor + + Returns: tf.Tensor + """ + input_ids_shape = shape_list(tensor=input_ids) + + # multiple choice has 3 dimensions + if len(input_ids_shape) == 3: + input_ids = tf.reshape( + tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) + ) + + mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + + return incremental_indices + self.padding_idx def create_position_ids_from_inputs_embeds(self, inputs_embeds): """ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - :param tf.Tensor inputs_embeds: :return tf.Tensor: - """ - seq_length = shape_list(inputs_embeds)[1] - position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] - - return position_ids - - def call( - self, - input_ids=None, - position_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): - """ - Get token embeddings of inputs Args: - inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids) - mode: string, a valid value is one of "embedding" and "linear" + inputs_embeds: tf.Tensor + + Returns: tf.Tensor + """ + batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2] + position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :] + + return tf.tile(input=position_ids, multiples=(batch_size, 1)) + + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): + """ + Applies embedding based on inputs tensor. Returns: - outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size] - - Raises: - ValueError: if mode is not valid. Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids=input_ids) + if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. - position_ids = self.create_position_ids_from_input_ids(input_ids) + position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids) else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds) - assert position_ids is None or len(position_ids.shape) <= 2 + position_embeds = self.position_embeddings(position_ids=position_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] - - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - - position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) - embeddings = inputs_embeds + position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) - - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler @@ -536,12 +564,12 @@ class TFMPNetMainLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): @@ -808,6 +836,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -827,7 +856,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): return self.decoder def set_output_embeddings(self, value): - self.decoder.word_embeddings = value + self.decoder.weight = value self.decoder.vocab_size = shape_list(value)[0] def get_bias(self): @@ -837,15 +866,19 @@ class TFMPNetLMHead(tf.keras.layers.Layer): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, features): - x = self.dense(features) - x = self.act(x) - x = self.layer_norm(x) + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) # project back to size of vocabulary with bias - x = self.decoder(x, mode="linear") + self.bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) - return x + return hidden_states @add_start_docstrings("""MPNet Model with a `language modeling` head on top. """, MPNET_START_DOCSTRING) @@ -857,7 +890,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): super().__init__(config, *inputs, **kwargs) self.mpnet = TFMPNetMainLayer(config, name="mpnet") - self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head") + self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings.word_embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 80518a37d5..4df2cb2834 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -65,6 +65,127 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TFRobertaWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings +class TFRobertaPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) + embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + class TFRobertaEmbeddings(tf.keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. @@ -74,52 +195,48 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.padding_idx = 1 - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TFRobertaWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFRobertaPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TFRobertaTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def create_position_ids_from_input_ids(self, x): + def create_position_ids_from_input_ids(self, input_ids): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. Args: - x: tf.Tensor + input_ids: tf.Tensor Returns: tf.Tensor """ - mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) - incremental_indices = tf.math.cumsum(mask, axis=1) * mask + input_ids_shape = shape_list(tensor=input_ids) + + # multiple choice has 3 dimensions + if len(input_ids_shape) == 3: + input_ids = tf.reshape( + tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) + ) + + mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -132,96 +249,41 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): Returns: tf.Tensor """ - seq_length = shape_list(inputs_embeds)[1] - position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] + batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2] + position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :] - return position_ids + return tf.tile(input=position_ids, multiples=(batch_size, 1)) - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids=input_ids) + + if token_type_ids is None: + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) + if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. - position_ids = self.create_position_ids_from_input_ids(input_ids) + position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids) else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds) - if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] + position_embeds = self.position_embeddings(position_ids=position_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - seq_length = input_shape[1] - - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] - - if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) - - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) - - position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) - token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) - - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler @@ -245,7 +307,7 @@ class TFRobertaPooler(tf.keras.layers.Layer): return pooled_output -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention class TFRobertaSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -256,8 +318,8 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.query = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), @@ -293,7 +355,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. @@ -324,9 +386,9 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size + self.all_head_size = config.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( equation="abcd,cde->abe", output_shape=(None, self.all_head_size), @@ -499,12 +561,12 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): @@ -814,6 +876,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -833,7 +896,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): return self.decoder def set_output_embeddings(self, value): - self.decoder.word_embeddings = value + self.decoder.weight = value self.decoder.vocab_size = shape_list(value)[0] def get_bias(self): @@ -849,7 +912,11 @@ class TFRobertaLMHead(tf.keras.layers.Layer): hidden_states = self.layer_norm(hidden_states) # project back to size of vocabulary with bias - hidden_states = self.decoder(hidden_states, mode="linear") + self.bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) return hidden_states @@ -863,7 +930,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta") - self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") + self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings.word_embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 3526d1bfde..1ae28aaaef 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -66,6 +66,122 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] +# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings +class TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings +class TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings +class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -73,121 +189,59 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.vocab_size = config.vocab_size - self.hidden_size = config.hidden_size - self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding( - config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.word_embeddings = TF{{cookiecutter.camelcase_modelname}}WordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="position_embeddings", ) - self.token_type_embeddings = tf.keras.layers.Embedding( - config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), + self.token_type_embeddings = TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.hidden_size, + initializer_range=config.initializer_range, name="token_type_embeddings", ) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): - """Build shared word embedding layer """ - with tf.name_scope("word_embeddings"): - # Create and initialize weights. The random normal initializer was chosen - # arbitrarily, and works well. - self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def call( - self, - input_ids=None, - position_ids=None, - token_type_ids=None, - inputs_embeds=None, - mode="embedding", - training=False, - ): + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ - Get token embeddings of inputs. - - Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) - mode: string, a valid value is one of "embedding" and "linear". + Applies embedding based on inputs tensor. Returns: - outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, - embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, - vocab_size]. - - Raises: - ValueError: if mode is not valid. - - Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. """ - if mode == "embedding": - return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) - elif mode == "linear": - return self._linear(input_ids) - else: - raise ValueError("mode {} is not valid.".format(mode)) - - def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): - """Applies embedding based on inputs tensor.""" assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - input_shape = shape_list(input_ids) - else: - input_shape = shape_list(inputs_embeds)[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + inputs_embeds = self.word_embeddings(input_ids=input_ids) if token_type_ids is None: - token_type_ids = tf.fill(input_shape, 0) + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) - if inputs_embeds is None: - inputs_embeds = tf.gather(self.word_embeddings, input_ids) + if position_ids is None: + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) - position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) - token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings, training=training) + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) - return embeddings - - def _linear(self, inputs): - """ - Computes logits by running inputs through a linear layer. - - Args: - inputs: A float32 tensor with shape [batch_size, length, hidden_size]. - - Returns: - float32 tensor with shape [batch_size, length, vocab_size]. - """ - batch_size = shape_list(inputs)[0] - length = shape_list(inputs)[1] - x = tf.reshape(inputs, [-1, self.hidden_size]) - logits = tf.matmul(x, self.word_embeddings, transpose_b=True) - - return tf.reshape(logits, [batch_size, length, self.vocab_size]) + return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) @@ -198,8 +252,8 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.query = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), @@ -266,9 +320,9 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) - self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size + self.all_head_size = config.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( equation="abcd,cde->abe", output_shape=(None, self.all_head_size), @@ -450,6 +504,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay super().__init__(**kwargs) self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is @@ -465,7 +521,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay return self.input_embeddings def set_output_embeddings(self, value): - self.input_embeddings.word_embeddings = value + self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] def get_bias(self): @@ -476,9 +532,12 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay self.vocab_size = shape_list(value["bias"])[0] def call(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.input_embeddings(hidden_states, mode="linear") - hidden_states = hidden_states + self.bias + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) return hidden_states @@ -514,11 +573,11 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): self.config = config def get_input_embeddings(self): - return self.embeddings + return self.embeddings.word_embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - self.embeddings.vocab_size = shape_list(value)[0] + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """Prunes heads of the model. @@ -812,7 +871,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") def get_lm_head(self): return self.mlm.predictions @@ -909,7 +968,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") def get_lm_head(self): return self.mlm.predictions diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 794238faa1..c1e379d949 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -760,31 +760,6 @@ class TFModelTesterMixin: model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) - def _get_embeds(self, wte, input_ids): - # ^^ In our TF models, the input_embeddings can take slightly different forms, - # so we try a few of them. - # We used to fall back to just synthetically creating a dummy tensor of ones: - try: - x = wte(input_ids, mode="embedding") - except Exception: - try: - x = wte([input_ids], mode="embedding") - except Exception: - try: - x = wte([input_ids, None, None, None], mode="embedding") - except Exception: - if hasattr(self.model_tester, "embedding_size"): - x = tf.ones( - input_ids.shape + [self.model_tester.embedding_size], - dtype=tf.dtypes.float32, - ) - else: - x = tf.ones( - input_ids.shape + [self.model_tester.hidden_size], - dtype=tf.dtypes.float32, - ) - return x - def test_inputs_embeds(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -801,12 +776,11 @@ class TFModelTesterMixin: del inputs["input_ids"] inputs.pop("decoder_input_ids", None) - wte = model.get_input_embeddings() if not self.is_encoder_decoder: - inputs["inputs_embeds"] = self._get_embeds(wte, input_ids) + inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) else: - inputs["inputs_embeds"] = self._get_embeds(wte, encoder_input_ids) - inputs["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids) + inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) + inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) model(inputs) @@ -837,24 +811,25 @@ class TFModelTesterMixin: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def _get_word_embedding_weight(model, embedding_layer): - if hasattr(embedding_layer, "word_embeddings"): - return embedding_layer.word_embeddings - elif hasattr(embedding_layer, "weight"): - return embedding_layer.weight - elif hasattr(embedding_layer, "decoder"): - return embedding_layer.decoder - else: - # Here we build the word embeddings weights if not exists. - # And then we retry to get the attribute once built. - model(model.dummy_inputs) - if hasattr(embedding_layer, "word_embeddings"): - return embedding_layer.word_embeddings - elif hasattr(embedding_layer, "weight"): - return embedding_layer.weight - elif hasattr(embedding_layer, "decoder"): - return embedding_layer.decoder - else: - return None + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + model(model.dummy_inputs) + + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + return None for model_class in self.all_model_classes: for size in [config.vocab_size - 10, config.vocab_size + 10, None]: