diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index b889fb1a61..f92319b873 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -17,7 +17,7 @@ from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple import tensorflow as tf @@ -73,157 +73,52 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFAlbertWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings -class TFAlbertPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - class TFAlbertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TFAlbertWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFAlbertPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFAlbertTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( self, - input_ids: tf.Tensor, - position_ids: tf.Tensor, - token_type_ids: tf.Tensor, - inputs_embeds: tf.Tensor, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, training: bool = False, ) -> tf.Tensor: """ @@ -235,18 +130,19 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -301,6 +197,7 @@ class TFAlbertAttention(tf.keras.layers.Layer): self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def transpose_for_scores(self, x, batch_size): + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) @@ -326,7 +223,7 @@ class TFAlbertAttention(tf.keras.layers.Layer): attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + # Apply the attention mask is (precomputed for all layers in TFAlbertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. @@ -583,11 +480,11 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ) def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -914,7 +811,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel): self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") - self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions") + self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") def get_lm_head(self): @@ -1034,7 +931,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") - self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions") + self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") def get_lm_head(self): return self.predictions diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 8489cf3ed9..e1dbf0c23b 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -92,17 +92,17 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index da353e8ca4..5ebc70b434 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -15,9 +15,10 @@ # limitations under the License. """ TF 2.0 BERT model. """ +import math import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import numpy as np import tensorflow as tf @@ -127,153 +128,51 @@ class TFBertPreTrainingLoss: return masked_lm_loss + next_sentence_loss -class TFBertWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -class TFBertTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -class TFBertPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TFBertWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFBertPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFBertTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def call( self, - input_ids: tf.Tensor, - position_ids: tf.Tensor, - token_type_ids: tf.Tensor, - inputs_embeds: tf.Tensor, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, training: bool = False, ) -> tf.Tensor: """ @@ -285,18 +184,19 @@ class TFBertEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] - token_type_embeds = self.token_type_embeddings(token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -314,31 +214,29 @@ class TFBertSelfAttention(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) + self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="query", + self.query = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="key", + self.key = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="value", + self.value = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + def call( self, hidden_states: tf.Tensor, @@ -347,15 +245,20 @@ class TFBertSelfAttention(tf.keras.layers.Layer): output_attentions: bool, training: bool = False, ) -> Tuple[tf.Tensor]: - query_layer = self.query(inputs=hidden_states) - key_layer = self.key(inputs=hidden_states) - value_layer = self.value(inputs=hidden_states) + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + mixed_key_layer = self.key(inputs=hidden_states) + mixed_value_layer = self.value(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) - attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) @@ -372,7 +275,11 @@ class TFBertSelfAttention(tf.keras.layers.Layer): if head_mask is not None: attention_probs = tf.multiply(attention_probs, head_mask) - attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs @@ -382,21 +289,8 @@ class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number " - f"of attention heads ({config.num_attention_heads})" - ) - - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = config.num_attention_heads * self.attention_head_size - - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abcd,cde->abe", - output_shape=(None, self.all_head_size), - bias_axes="e", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -446,12 +340,8 @@ class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - output_shape=(None, config.intermediate_size), - bias_axes="d", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -470,12 +360,8 @@ class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - bias_axes="d", - output_shape=(None, config.hidden_size), - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -698,11 +584,11 @@ class TFBertMainLayer(tf.keras.layers.Layer): self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self) -> tf.keras.layers.Layer: - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value: tf.Variable): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -1041,7 +927,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") - self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @@ -1165,7 +1051,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ) self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @@ -1270,7 +1156,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 664689ecfa..c5463d1fc3 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -96,7 +96,8 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) # Copied from transformers.models.bart.modeling_tf_bart._expand_mask @@ -104,10 +105,9 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index af6b0fcc07..b6fb43081f 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -94,7 +94,8 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) # Copied from transformers.models.bart.modeling_tf_bart._expand_mask @@ -102,10 +103,9 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index 59e0c3362b..da3ef62f6a 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -62,148 +62,55 @@ TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -class TFConvBertWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), - ) - - super().build(input_shape=input_shape) - - def get_config(self): - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids): - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -class TFConvBertTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), - ) - - super().build(input_shape=input_shape) - - def get_config(self): - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids): - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -class TFConvBertPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self): - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - +# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings class TFConvBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TFConvBertWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFConvBertPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFConvBertTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -213,18 +120,19 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -296,6 +204,7 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) @@ -315,18 +224,27 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): conv_kernel_layer = tf.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1]) conv_kernel_layer = tf.nn.softmax(conv_kernel_layer, axis=1) + paddings = tf.constant( + [ + [ + 0, + 0, + ], + [int((self.conv_kernel_size - 1) / 2), int((self.conv_kernel_size - 1) / 2)], + [0, 0], + ] + ) + conv_out_layer = self.conv_out_layer(hidden_states) conv_out_layer = tf.reshape(conv_out_layer, [batch_size, -1, self.all_head_size]) + conv_out_layer = tf.pad(conv_out_layer, paddings, "CONSTANT") - conv_out_layer = tf.reshape( - conv_out_layer, [batch_size, shape_list(mixed_query_layer)[1], self.all_head_size, 1] - ) - unfold_conv_out_layer = tf.image.extract_patches( - images=conv_out_layer, - sizes=[1, self.conv_kernel_size, 1, 1], - strides=[1, 1, 1, 1], - rates=[1, 1, 1, 1], - padding="SAME", + unfold_conv_out_layer = tf.stack( + [ + tf.slice(conv_out_layer, [0, i, 0], [batch_size, shape_list(mixed_query_layer)[1], self.all_head_size]) + for i in range(self.conv_kernel_size) + ], + axis=-1, ) conv_out_layer = tf.reshape(unfold_conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size]) @@ -601,11 +519,11 @@ class TFConvBertMainLayer(tf.keras.layers.Layer): self.config = config def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = value.shape[0] + self.embeddings.weight = value + self.embeddings.vocab_size = value.shape[0] def _prune_heads(self, heads_to_prune): """ @@ -953,9 +871,7 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL else: self.activation = config.hidden_act - self.generator_lm_head = TFConvBertMaskedLMHead( - config, self.convbert.embeddings.word_embeddings, name="generator_lm_head" - ) + self.generator_lm_head = TFConvBertMaskedLMHead(config, self.convbert.embeddings, name="generator_lm_head") def get_lm_head(self): return self.generator_lm_head diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index d9942026d1..3d59dc6faa 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -17,7 +17,6 @@ """ import warnings -from typing import Any, Dict import tensorflow as tf @@ -68,81 +67,6 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFDistilBertWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings -class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - class TFEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -151,23 +75,29 @@ class TFEmbeddings(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range + self.max_position_embeddings = config.max_position_embeddings - self.word_embeddings = TFDistilBertWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.dim, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFDistilBertPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.dim, - initializer_range=config.initializer_range, - name="position_embeddings", - ) self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.dropout) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.dim], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.dim], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -178,13 +108,15 @@ class TFEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -422,11 +354,11 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): self.transformer = TFTransformer(config, name="transformer") # Encoder def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = value.shape[0] + self.embeddings.weight = value + self.embeddings.vocab_size = value.shape[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError @@ -716,9 +648,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel ) self.act = get_tf_activation("gelu") self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") - self.vocab_projector = TFDistilBertLMHead( - config, self.distilbert.embeddings.word_embeddings, name="vocab_projector" - ) + self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") def get_lm_head(self): return self.vocab_projector diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index a968dbf71f..b81385cc44 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -14,9 +14,10 @@ # limitations under the License. """ TF Electra model. """ +import math import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import tensorflow as tf @@ -70,122 +71,6 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFElectraWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings -class TFElectraPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra class TFElectraSelfAttention(tf.keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): @@ -197,31 +82,29 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) + self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="query", + self.query = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="key", + self.key = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="value", + self.value = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + def call( self, hidden_states: tf.Tensor, @@ -230,15 +113,20 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): output_attentions: bool, training: bool = False, ) -> Tuple[tf.Tensor]: - query_layer = self.query(inputs=hidden_states) - key_layer = self.key(inputs=hidden_states) - value_layer = self.value(inputs=hidden_states) + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + mixed_key_layer = self.key(inputs=hidden_states) + mixed_value_layer = self.value(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) - attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function) @@ -255,7 +143,11 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): if head_mask is not None: attention_probs = tf.multiply(attention_probs, head_mask) - attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs @@ -266,21 +158,8 @@ class TFElectraSelfOutput(tf.keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number " - f"of attention heads ({config.num_attention_heads})" - ) - - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = config.num_attention_heads * self.attention_head_size - - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abcd,cde->abe", - output_shape=(None, self.all_head_size), - bias_axes="e", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -332,12 +211,8 @@ class TFElectraIntermediate(tf.keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - output_shape=(None, config.intermediate_size), - bias_axes="d", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -357,12 +232,8 @@ class TFElectraOutput(tf.keras.layers.Layer): def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - bias_axes="d", - output_shape=(None, config.hidden_size), - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -485,35 +356,46 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TFElectraWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFElectraPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFElectraTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( self, - input_ids: tf.Tensor, - position_ids: tf.Tensor, - token_type_ids: tf.Tensor, - inputs_embeds: tf.Tensor, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, training: bool = False, ) -> tf.Tensor: """ @@ -525,18 +407,19 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -605,11 +488,11 @@ class TFElectraMainLayer(tf.keras.layers.Layer): self.config = config def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -1057,9 +940,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos else: self.activation = config.hidden_act - self.generator_lm_head = TFElectraMaskedLMHead( - config, self.electra.embeddings.word_embeddings, name="generator_lm_head" - ) + self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") def get_lm_head(self): return self.generator_lm_head diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index f0e6f0a7b7..b54bb876fa 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple import tensorflow as tf @@ -74,61 +74,29 @@ TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [ INF = 1e6 -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFFunnelWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - class TFFunnelEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TFFunnelWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.initializer_range = config.initializer_range + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout) + def build(self, input_shape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def call(self, input_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -140,7 +108,7 @@ class TFFunnelEmbeddings(tf.keras.layers.Layer): assert not (input_ids is not None and inputs_embeds is not None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(self.weight, input_ids) final_embeddings = self.LayerNorm(inputs=inputs_embeds) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -513,13 +481,15 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer): # Shape batch_size x n_head x seq_len x 2 token_type_bias = tf.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed) # Shape batch_size x n_head x seq_len x context_len - new_shape = [batch_size, shape_list(q_head)[2], seq_len, context_len] - token_type_mat = tf.broadcast_to(token_type_mat[:, None], new_shape) + token_type_mat = tf.tile(token_type_mat[:, None], [1, shape_list(q_head)[2], 1, 1]) + # token_type_mat = tf.broadcast_to(token_type_mat[:, None], new_shape) # Shapes batch_size x n_head x seq_len diff_token_type, same_token_type = tf.split(token_type_bias, 2, axis=-1) # Shape batch_size x n_head x seq_len x context_len token_type_attn = tf.where( - token_type_mat, tf.broadcast_to(same_token_type, new_shape), tf.broadcast_to(diff_token_type, new_shape) + token_type_mat, + tf.tile(same_token_type, [1, 1, 1, context_len]), + tf.tile(diff_token_type, [1, 1, 1, context_len]), ) if cls_mask is not None: @@ -773,11 +743,11 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer): self.encoder = TFFunnelEncoder(config, name="encoder") def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -859,11 +829,11 @@ class TFFunnelMainLayer(tf.keras.layers.Layer): self.decoder = TFFunnelDecoder(config, name="decoder") def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -1360,7 +1330,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss) super().__init__(config, *inputs, **kwargs) self.funnel = TFFunnelMainLayer(config, name="funnel") - self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings.word_embeddings, name="lm_head") + self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index ba068869eb..bce2fc5316 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -87,17 +87,17 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index e3c747c939..81f0eb3880 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple +from typing import Optional, Tuple import tensorflow as tf @@ -415,126 +415,6 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se return attention_mask -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFLongformerWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -class TFLongformerPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self): - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids): - flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) - embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Longformer class TFLongformerLMHead(tf.keras.layers.Layer): """Longformer Head for masked language modeling.""" @@ -598,28 +478,39 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.padding_idx = 1 - self.word_embeddings = TFLongformerWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFLongformerPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFLongformerTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def create_position_ids_from_input_ids(self, input_ids): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding @@ -627,36 +518,13 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): Args: input_ids: tf.Tensor - Returns: tf.Tensor """ - input_ids_shape = shape_list(tensor=input_ids) - - # multiple choice has 3 dimensions - if len(input_ids_shape) == 3: - input_ids = tf.reshape( - tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) - ) - mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - - Args: - inputs_embeds: tf.Tensor - - Returns: tf.Tensor - """ - batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2] - position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :] - - return tf.tile(input=position_ids, multiples=(batch_size, 1)) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -667,10 +535,11 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: @@ -678,10 +547,13 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids) else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds) + position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[ + tf.newaxis, : + ] + position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1)) - position_embeds = self.position_embeddings(position_ids=position_ids) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -694,12 +566,8 @@ class TFLongformerIntermediate(tf.keras.layers.Layer): def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - output_shape=(None, config.intermediate_size), - bias_axes="d", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -719,12 +587,8 @@ class TFLongformerOutput(tf.keras.layers.Layer): def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - bias_axes="d", - output_shape=(None, config.hidden_size), - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -758,20 +622,21 @@ class TFLongformerPooler(tf.keras.layers.Layer): return pooled_output +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Longformer class TFLongformerSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states @@ -1676,11 +1541,11 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): self.pooler = TFLongformerPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ @@ -2119,7 +1984,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel super().__init__(config, *inputs, **kwargs) self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer") - self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings.word_embeddings, name="lm_head") + self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index eddc82bd1a..70d8364547 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -18,7 +18,7 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple import tensorflow as tf @@ -177,150 +177,45 @@ class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer): return output -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFLxmertWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings -class TFLxmertPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - class TFLxmertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TFLxmertWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFLxmertPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFLxmertTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -331,14 +226,17 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) - position_embeds = self.position_embeddings(position_ids=inputs_embeds) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -379,6 +277,7 @@ class TFLxmertAttention(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) @@ -764,11 +663,11 @@ class TFLxmertMainLayer(tf.keras.layers.Layer): self.config = config def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): raise NotImplementedError @@ -1309,7 +1208,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel): self.lxmert = TFLxmertMainLayer(config, name="lxmert") # Pre-training heads - self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings.word_embeddings, name="cls") + self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings, name="cls") if self.task_obj_predict: self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head") if self.task_qa: diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 885df88b81..674bf1b52e 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -95,7 +95,8 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) # Copied from transformers.models.bart.modeling_tf_bart._expand_mask @@ -103,10 +104,9 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 478400ff84..94d4e814f5 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -95,7 +95,8 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) # Copied from transformers.models.bart.modeling_tf_bart._expand_mask @@ -103,10 +104,9 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index d38a4869f1..537bc20632 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -17,7 +17,7 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple import tensorflow as tf @@ -107,122 +107,6 @@ class TFNoNorm(tf.keras.layers.Layer): NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm} -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFMobileBertWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings -class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - class TFMobileBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -231,25 +115,11 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): self.trigram_input = config.trigram_input self.embedding_size = config.embedding_size + self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size - self.word_embeddings = TFMobileBertWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.embedding_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFMobileBertPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFMobileBertTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.type_vocab_size = config.type_vocab_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation") @@ -260,6 +130,30 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -270,10 +164,11 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if self.trigram_input: @@ -297,11 +192,11 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): inputs_embeds = self.embedding_transformation(inputs_embeds) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -337,6 +232,7 @@ class TFMobileBertSelfAttention(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) @@ -772,11 +668,11 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): self.pooler = TFMobileBertPooler(config, name="pooler") if add_pooling_layer else None def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.embeddings def set_input_embeddings(self, value): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] def _prune_heads(self, heads_to_prune): """ diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 67f1031d12..799b7b8982 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -18,7 +18,6 @@ import math import warnings -from typing import Any, Dict import tensorflow as tf @@ -87,86 +86,6 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel): return self.serving_output(output) -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFMPNetWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings -class TFMPNetPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self): - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids): - flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) - embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - class TFMPNetEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position embeddings.""" @@ -174,22 +93,31 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.padding_idx = 1 - self.word_embeddings = TFMPNetWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFMPNetPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def create_position_ids_from_input_ids(self, input_ids): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding @@ -197,36 +125,13 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): Args: input_ids: tf.Tensor - Returns: tf.Tensor """ - input_ids_shape = shape_list(tensor=input_ids) - - # multiple choice has 3 dimensions - if len(input_ids_shape) == 3: - input_ids = tf.reshape( - tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) - ) - mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - - Args: - inputs_embeds: tf.Tensor - - Returns: tf.Tensor - """ - batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2] - position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :] - - return tf.tile(input=position_ids, multiples=(batch_size, 1)) - def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -237,16 +142,21 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids) else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds) + position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[ + tf.newaxis, : + ] + position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1)) - position_embeds = self.position_embeddings(position_ids=position_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -281,58 +191,55 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number " - f"of attention heads ({config.num_attention_heads})" + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.num_attention_heads = config.num_attention_heads + assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.q = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="q", + + self.q = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="q" ) - self.k = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="k", + self.k = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="k" ) - self.v = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="v", + self.v = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="v" ) - self.o = tf.keras.layers.experimental.EinsumDense( - equation="abcd,cde->abe", - output_shape=(None, self.all_head_size), - bias_axes="e", - kernel_initializer=get_initializer(config.initializer_range), - name="o", + self.o = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + def transpose_for_scores(self, x, batch_size): + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + return tf.transpose(x, perm=[0, 2, 1, 3]) + def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False): + batch_size = shape_list(hidden_states)[0] + q = self.q(hidden_states) k = self.k(hidden_states) v = self.v(hidden_states) - dk = tf.cast(self.attention_head_size, dtype=q.dtype) - q = tf.multiply(q, y=tf.math.rsqrt(dk)) - attention_scores = tf.einsum("aecd,abcd->acbe", k, q) + q = self.transpose_for_scores(q, batch_size) + k = self.transpose_for_scores(k, batch_size) + v = self.transpose_for_scores(v, batch_size) + + attention_scores = tf.matmul(q, k, transpose_b=True) + dk = tf.cast(shape_list(k)[-1], attention_scores.dtype) + attention_scores = attention_scores / tf.math.sqrt(dk) # Apply relative position embedding (precomputed in MPNetEncoder) if provided. if position_bias is not None: attention_scores += position_bias if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in TFMPNetModel call() function) attention_scores = attention_scores + attention_mask attention_probs = tf.nn.softmax(attention_scores, axis=-1) @@ -342,7 +249,9 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): if head_mask is not None: attention_probs = attention_probs * head_mask - c = tf.einsum("acbe,aecd->abcd", attention_probs, v) + c = tf.matmul(attention_probs, v) + c = tf.transpose(c, perm=[0, 2, 1, 3]) + c = tf.reshape(c, (batch_size, -1, self.all_head_size)) o = self.o(c) outputs = (o, attention_probs) if output_attentions else (o,) @@ -374,12 +283,8 @@ class TFMPNetIntermediate(tf.keras.layers.Layer): def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - output_shape=(None, config.intermediate_size), - bias_axes="d", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -399,12 +304,8 @@ class TFMPNetOutput(tf.keras.layers.Layer): def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - bias_axes="d", - output_shape=(None, config.hidden_size), - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -565,12 +466,12 @@ class TFMPNetMainLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings def get_input_embeddings(self) -> tf.keras.layers.Layer: - return self.embeddings.word_embeddings + return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings def set_input_embeddings(self, value: tf.Variable): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): @@ -894,7 +795,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): super().__init__(config, *inputs, **kwargs) self.mpnet = TFMPNetMainLayer(config, name="mpnet") - self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings.word_embeddings, name="lm_head") + self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 3ddfb67137..396d2c71c0 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -95,7 +95,8 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) # Copied from transformers.models.bart.modeling_tf_bart._expand_mask @@ -103,10 +104,9 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index c7460ff776..c00e7a41fa 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -15,8 +15,9 @@ # limitations under the License. """ TF 2.0 RoBERTa model. """ +import math import warnings -from typing import Any, Dict, Optional, Tuple, Union +from typing import Optional, Tuple, Union import numpy as np import tensorflow as tf @@ -68,127 +69,6 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TFRobertaWordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings -class TFRobertaPositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self): - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids): - flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) - embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - class TFRobertaEmbeddings(tf.keras.layers.Layer): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. @@ -198,28 +78,39 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.padding_idx = 1 - self.word_embeddings = TFRobertaWordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TFRobertaPositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TFRobertaTokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def create_position_ids_from_input_ids(self, input_ids): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding @@ -227,36 +118,13 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): Args: input_ids: tf.Tensor - Returns: tf.Tensor """ - input_ids_shape = shape_list(tensor=input_ids) - - # multiple choice has 3 dimensions - if len(input_ids_shape) == 3: - input_ids = tf.reshape( - tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) - ) - mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - - Args: - inputs_embeds: tf.Tensor - - Returns: tf.Tensor - """ - batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2] - position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :] - - return tf.tile(input=position_ids, multiples=(batch_size, 1)) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ Applies embedding based on inputs tensor. @@ -267,10 +135,11 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: @@ -278,10 +147,13 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids) else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds) + position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[ + tf.newaxis, : + ] + position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1)) - position_embeds = self.position_embeddings(position_ids=position_ids) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -321,31 +193,29 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): f"of attention heads ({config.num_attention_heads})" ) + self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="query", + self.query = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="key", + self.key = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="value", + self.value = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + def call( self, hidden_states: tf.Tensor, @@ -354,15 +224,20 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): output_attentions: bool, training: bool = False, ) -> Tuple[tf.Tensor]: - query_layer = self.query(inputs=hidden_states) - key_layer = self.key(inputs=hidden_states) - value_layer = self.value(inputs=hidden_states) + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + mixed_key_layer = self.key(inputs=hidden_states) + mixed_value_layer = self.value(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) - attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) @@ -379,7 +254,11 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): if head_mask is not None: attention_probs = tf.multiply(attention_probs, head_mask) - attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs @@ -390,21 +269,8 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number " - f"of attention heads ({config.num_attention_heads})" - ) - - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = config.num_attention_heads * self.attention_head_size - - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abcd,cde->abe", - output_shape=(None, self.all_head_size), - bias_axes="e", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -456,12 +322,8 @@ class TFRobertaIntermediate(tf.keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - output_shape=(None, config.intermediate_size), - bias_axes="d", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -481,12 +343,8 @@ class TFRobertaOutput(tf.keras.layers.Layer): def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - bias_axes="d", - output_shape=(None, config.hidden_size), - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -601,12 +459,12 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings def get_input_embeddings(self) -> tf.keras.layers.Layer: - return self.embeddings.word_embeddings + return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings def set_input_embeddings(self, value: tf.Variable): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): @@ -972,7 +830,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta") - self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings.word_embeddings, name="lm_head") + self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") def get_lm_head(self): return self.lm_head diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index cc727ac0f6..feffe7a585 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -16,6 +16,7 @@ {% if cookiecutter.is_encoder_decoder_model == "False" %} +import math from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -68,122 +69,6 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings -class TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(tf.keras.layers.Layer): - def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.weight = self.add_weight( - name="weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "vocab_size": self.vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, input_ids: tf.Tensor) -> tf.Tensor: - flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) - embeddings = tf.gather(params=self.weight, indices=flat_input_ids) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings -class TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(tf.keras.layers.Layer): - def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.type_vocab_size = type_vocab_size - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.token_type_embeddings = self.add_weight( - name="embeddings", - shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "type_vocab_size": self.type_vocab_size, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: - flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) - one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) - embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) - embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) - ) - - embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) - - return embeddings - - -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings -class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.Layer): - def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): - super().__init__(**kwargs) - - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.initializer_range = initializer_range - - def build(self, input_shape: tf.TensorShape): - self.position_embeddings = self.add_weight( - name="embeddings", - shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(self.initializer_range), - ) - - super().build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - "max_position_embeddings": self.max_position_embeddings, - "hidden_size": self.hidden_size, - "initializer_range": self.initializer_range, - } - base_config = super().get_config() - - return dict(list(base_config.items()) + list(config.items())) - - def call(self, position_ids: tf.Tensor) -> tf.Tensor: - input_shape = shape_list(position_ids) - position_embeddings = self.position_embeddings[: input_shape[1], :] - - return tf.broadcast_to(input=position_embeddings, shape=input_shape) - - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -191,34 +76,45 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - self.word_embeddings = TF{{cookiecutter.camelcase_modelname}}WordEmbeddings( - vocab_size=config.vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="word_embeddings", - ) - self.position_embeddings = TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings( - max_position_embeddings=config.max_position_embeddings, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="position_embeddings", - ) - self.token_type_embeddings = TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings( - type_vocab_size=config.type_vocab_size, - hidden_size=config.hidden_size, - initializer_range=config.initializer_range, - name="token_type_embeddings", - ) + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + def call( self, - input_ids: tf.Tensor, - position_ids: tf.Tensor, - token_type_ids: tf.Tensor, - inputs_embeds: tf.Tensor, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, training: bool = False, ) -> tf.Tensor: """ @@ -230,18 +126,19 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] if token_type_ids is None: - input_shape = shape_list(inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(inputs_embeds) - else: - position_embeds = self.position_embeddings(position_ids) + position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :] - token_type_embeds = self.token_type_embeddings(token_type_ids) + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -261,31 +158,29 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) f"of attention heads ({config.num_attention_heads})" ) + self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.rsqrt_att_head_size = 1.0 / math.sqrt(self.attention_head_size) - self.query = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="query", + self.query = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) - self.key = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="key", + self.key = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) - self.value = tf.keras.layers.experimental.EinsumDense( - equation="abc,cde->abde", - output_shape=(None, config.num_attention_heads, self.attention_head_size), - bias_axes="de", - kernel_initializer=get_initializer(config.initializer_range), - name="value", + self.value = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + def call( self, hidden_states: tf.Tensor, @@ -294,15 +189,20 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) output_attentions: bool, training: bool = False, ) -> Tuple[tf.Tensor]: - query_layer = self.query(inputs=hidden_states) - key_layer = self.key(inputs=hidden_states) - value_layer = self.value(inputs=hidden_states) + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + mixed_key_layer = self.key(inputs=hidden_states) + mixed_value_layer = self.value(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) - attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.rsqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.multiply(attention_scores, dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function) @@ -319,7 +219,11 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) if head_mask is not None: attention_probs = tf.multiply(attention_probs, head_mask) - attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs @@ -330,21 +234,8 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number " - f"of attention heads ({config.num_attention_heads})" - ) - - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = config.num_attention_heads * self.attention_head_size - - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abcd,cde->abe", - output_shape=(None, self.all_head_size), - bias_axes="e", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -396,12 +287,8 @@ class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - output_shape=(None, config.intermediate_size), - bias_axes="d", - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): @@ -418,15 +305,11 @@ class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): - def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.experimental.EinsumDense( - equation="abc,cd->abd", - bias_axes="d", - output_shape=(None, config.hidden_size), - kernel_initializer=get_initializer(config.initializer_range), - name="dense", + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -614,12 +497,12 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings def get_input_embeddings(self) -> tf.keras.layers.Layer: - return self.embeddings.word_embeddings + return self.embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings def set_input_embeddings(self, value: tf.Variable): - self.embeddings.word_embeddings.weight = value - self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): @@ -917,7 +800,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @@ -1014,7 +897,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @@ -1662,17 +1545,17 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i if past_key_values_length > 0: mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = shape_list(mask) + src_len = shape_list(mask)[1] tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + expanded_mask = tf.cast(tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)), tf.float32) return (1.0 - expanded_mask) * LARGE_NEGATIVE diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index c685b7a56f..a2524bd98b 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -866,7 +866,8 @@ class TFModelTesterMixin: for model_class in self.all_model_classes: model = model_class(config) - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + inputs = copy.deepcopy(inputs_dict) + if not self.is_encoder_decoder: input_ids = inputs["input_ids"] del inputs["input_ids"] @@ -882,6 +883,8 @@ class TFModelTesterMixin: inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) + inputs = self._prepare_for_class(inputs, model_class) + model(inputs) def test_graph_mode_with_inputs_embeds(self): @@ -890,7 +893,8 @@ class TFModelTesterMixin: for model_class in self.all_model_classes: model = model_class(config) - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + inputs = copy.deepcopy(inputs_dict) + if not self.is_encoder_decoder: input_ids = inputs["input_ids"] del inputs["input_ids"] @@ -906,6 +910,8 @@ class TFModelTesterMixin: inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) + inputs = self._prepare_for_class(inputs, model_class) + @tf.function def run_in_graph_mode(): return model(inputs)