Restore TF embeddings and attention layers to their previous version (#9890)

* Refacto BERT

* Restore all the concerned models

* Remove print

* Update template

* Apply Sylvain's and Morgan's comments

* Fix cast

* Put the cast inside call

* Remove cond in ebds

* Fix funnel

* Restore previous dot product (attention_scores) computation

* Add ConvBERT and BART

* Make all the S2S models ONNX compliant

* Fix test

* Fix check copies
This commit is contained in:
Julien Plu
2021-02-08 12:36:30 +01:00
committed by GitHub
parent 8bb52bd240
commit 31563e056d
20 changed files with 754 additions and 1966 deletions

View File

@@ -62,148 +62,55 @@ TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
class TFConvBertWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
class TFConvBertTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
class TFConvBertPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)
# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings
class TFConvBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TFConvBertWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFConvBertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFConvBertTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.vocab_size = config.vocab_size
self.type_vocab_size = config.type_vocab_size
self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
def build(self, input_shape: tf.TensorShape):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.embedding_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
with tf.name_scope("token_type_embeddings"):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.embedding_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.embedding_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
token_type_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
training: bool = False,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
@@ -213,18 +120,19 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
position_embeds = self.position_embeddings(position_ids=position_ids)
position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
@@ -296,6 +204,7 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x, batch_size):
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
@@ -315,18 +224,27 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
conv_kernel_layer = tf.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1])
conv_kernel_layer = tf.nn.softmax(conv_kernel_layer, axis=1)
paddings = tf.constant(
[
[
0,
0,
],
[int((self.conv_kernel_size - 1) / 2), int((self.conv_kernel_size - 1) / 2)],
[0, 0],
]
)
conv_out_layer = self.conv_out_layer(hidden_states)
conv_out_layer = tf.reshape(conv_out_layer, [batch_size, -1, self.all_head_size])
conv_out_layer = tf.pad(conv_out_layer, paddings, "CONSTANT")
conv_out_layer = tf.reshape(
conv_out_layer, [batch_size, shape_list(mixed_query_layer)[1], self.all_head_size, 1]
)
unfold_conv_out_layer = tf.image.extract_patches(
images=conv_out_layer,
sizes=[1, self.conv_kernel_size, 1, 1],
strides=[1, 1, 1, 1],
rates=[1, 1, 1, 1],
padding="SAME",
unfold_conv_out_layer = tf.stack(
[
tf.slice(conv_out_layer, [0, i, 0], [batch_size, shape_list(mixed_query_layer)[1], self.all_head_size])
for i in range(self.conv_kernel_size)
],
axis=-1,
)
conv_out_layer = tf.reshape(unfold_conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size])
@@ -601,11 +519,11 @@ class TFConvBertMainLayer(tf.keras.layers.Layer):
self.config = config
def get_input_embeddings(self):
return self.embeddings.word_embeddings
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = value.shape[0]
self.embeddings.weight = value
self.embeddings.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune):
"""
@@ -953,9 +871,7 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL
else:
self.activation = config.hidden_act
self.generator_lm_head = TFConvBertMaskedLMHead(
config, self.convbert.embeddings.word_embeddings, name="generator_lm_head"
)
self.generator_lm_head = TFConvBertMaskedLMHead(config, self.convbert.embeddings, name="generator_lm_head")
def get_lm_head(self):
return self.generator_lm_head