Clean TF Bert (#9788)

* Start cleaning BERT

* Clean BERT and all the models that depend on it

* Fix attribute name

* Apply style

* Apply Sylvain's comments

* Apply Lysandre's comments

* remove unused import
This commit is contained in:
Julien Plu
2021-01-27 11:28:11 +01:00
committed by GitHub
parent f0329ea516
commit 4adbdce5ee
15 changed files with 1295 additions and 1059 deletions

View File

@@ -46,6 +46,10 @@ from .utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
tf_logger = tf.get_logger() tf_logger = tf.get_logger()
TFModelInputType = Union[
List[tf.Tensor], List[np.ndarray], Dict[str, tf.Tensor], Dict[str, np.ndarray], np.ndarray, tf.Tensor
]
class TFModelUtilsMixin: class TFModelUtilsMixin:
""" """

View File

@@ -17,7 +17,7 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Any, Dict, Optional, Tuple
import tensorflow as tf import tensorflow as tf
@@ -82,16 +82,16 @@ class TFAlbertWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -101,14 +101,14 @@ class TFAlbertWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -122,16 +122,16 @@ class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"type_vocab_size": self.type_vocab_size, "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -141,15 +141,15 @@ class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids): def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -163,16 +163,16 @@ class TFAlbertPositionEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"max_position_embeddings": self.max_position_embeddings, "max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -182,8 +182,8 @@ class TFAlbertPositionEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids): def call(self, position_ids: tf.Tensor) -> tf.Tensor:
input_shape = shape_list(tensor=position_ids) input_shape = shape_list(position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :] position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape) return tf.broadcast_to(input=position_embeddings, shape=input_shape)
@@ -218,7 +218,14 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): def call(
self,
input_ids: tf.Tensor,
position_ids: tf.Tensor,
token_type_ids: tf.Tensor,
inputs_embeds: tf.Tensor,
training: bool = False,
) -> tf.Tensor:
""" """
Applies embedding based on inputs tensor. Applies embedding based on inputs tensor.
@@ -879,7 +886,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output
def serving_output(self, output): def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1105,7 +1112,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1208,7 +1215,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1310,7 +1317,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1425,7 +1432,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1572,13 +1579,14 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
} }
] ]
) )
def serving(self, inputs): # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving
output = self.call(inputs) def serving(self, inputs: Dict[str, tf.Tensor]):
output = self.call(input_ids=inputs)
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

File diff suppressed because it is too large Load Diff

View File

@@ -919,7 +919,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -17,6 +17,7 @@
""" """
import warnings import warnings
from typing import Any, Dict
import tensorflow as tf import tensorflow as tf
@@ -76,16 +77,16 @@ class TFDistilBertWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -95,14 +96,14 @@ class TFDistilBertWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -116,16 +117,16 @@ class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"max_position_embeddings": self.max_position_embeddings, "max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -135,8 +136,8 @@ class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids): def call(self, position_ids: tf.Tensor) -> tf.Tensor:
input_shape = shape_list(tensor=position_ids) input_shape = shape_list(position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :] position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape) return tf.broadcast_to(input=position_embeddings, shape=input_shape)
@@ -796,7 +797,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -897,7 +898,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -988,7 +989,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1131,7 +1132,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1238,7 +1239,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -16,7 +16,7 @@
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Any, Dict, Optional, Tuple, Union
import tensorflow as tf import tensorflow as tf
@@ -79,16 +79,16 @@ class TFElectraWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -98,14 +98,14 @@ class TFElectraWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -119,16 +119,16 @@ class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"type_vocab_size": self.type_vocab_size, "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -138,15 +138,15 @@ class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids): def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -160,16 +160,16 @@ class TFElectraPositionEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"max_position_embeddings": self.max_position_embeddings, "max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -179,8 +179,8 @@ class TFElectraPositionEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids): def call(self, position_ids: tf.Tensor) -> tf.Tensor:
input_shape = shape_list(tensor=position_ids) input_shape = shape_list(position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :] position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape) return tf.broadcast_to(input=position_embeddings, shape=input_shape)
@@ -188,7 +188,7 @@ class TFElectraPositionEmbeddings(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra
class TFElectraSelfAttention(tf.keras.layers.Layer): class TFElectraSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0: if config.hidden_size % config.num_attention_heads != 0:
@@ -203,50 +203,57 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="query", name="query",
) )
self.key = tf.keras.layers.experimental.EinsumDense( self.key = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="key", name="key",
) )
self.value = tf.keras.layers.experimental.EinsumDense( self.value = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="value", name="value",
) )
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
query_layer = self.query(inputs=hidden_states) query_layer = self.query(inputs=hidden_states)
key_layer = self.key(inputs=hidden_states) key_layer = self.key(inputs=hidden_states)
value_layer = self.value(inputs=hidden_states) value_layer = self.value(inputs=hidden_states)
# Take the dot product between "query" and "key" to get the raw # Take the dot product between "query" and "key" to get the raw
# attention scores. # attention scores.
dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype)
query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk))
attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)
if attention_mask is not None: if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in the TFElectraModel call() function) # Apply the attention mask (precomputed for all layers in the TFElectraModel call() function)
attention_scores = attention_scores + attention_mask attention_scores = tf.add(attention_scores, attention_mask)
# Normalize the attention scores to probabilities. # Normalize the attention scores to probabilities.
attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
# This is actually dropping out entire tokens to attend to, which might # This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper. # seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs, training=training) attention_probs = self.dropout(inputs=attention_probs, training=training)
# Mask heads if we want to # Mask heads if we want to
if head_mask is not None: if head_mask is not None:
attention_scores = attention_scores * head_mask attention_scores = tf.multiply(attention_scores, head_mask)
attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
@@ -254,9 +261,9 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra
class TFElectraSelfOutput(tf.keras.layers.Layer): class TFElectraSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0: if config.hidden_size % config.num_attention_heads != 0:
@@ -272,13 +279,13 @@ class TFElectraSelfOutput(tf.keras.layers.Layer):
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
bias_axes="e", bias_axes="e",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False): def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
@@ -288,7 +295,7 @@ class TFElectraSelfOutput(tf.keras.layers.Layer):
# Copied from from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra # Copied from from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
class TFElectraAttention(tf.keras.layers.Layer): class TFElectraAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.self_attention = TFElectraSelfAttention(config, name="self") self.self_attention = TFElectraSelfAttention(config, name="self")
@@ -297,44 +304,57 @@ class TFElectraAttention(tf.keras.layers.Layer):
def prune_heads(self, heads): def prune_heads(self, heads):
raise NotImplementedError raise NotImplementedError
def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self_attention( self_outputs = self.self_attention(
input_tensor, attention_mask, head_mask, output_attentions, training=training hidden_states=input_tensor,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
)
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
) )
attention_output = self.dense_output(self_outputs[0], input_tensor, training=training)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra
class TFElectraIntermediate(tf.keras.layers.Layer): class TFElectraIntermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abc,cd->abd", equation="abc,cd->abd",
output_shape=(None, config.intermediate_size), output_shape=(None, config.intermediate_size),
bias_axes="d", bias_axes="d",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
) )
if isinstance(config.hidden_act, str): if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else: else:
self.intermediate_act_fn = config.hidden_act self.intermediate_act_fn = config.hidden_act
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra
class TFElectraOutput(tf.keras.layers.Layer): class TFElectraOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
@@ -347,7 +367,7 @@ class TFElectraOutput(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False): def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
@@ -357,20 +377,33 @@ class TFElectraOutput(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra
class TFElectraLayer(tf.keras.layers.Layer): class TFElectraLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.attention = TFElectraAttention(config, name="attention") self.attention = TFElectraAttention(config, name="attention")
self.intermediate = TFElectraIntermediate(config, name="intermediate") self.intermediate = TFElectraIntermediate(config, name="intermediate")
self.bert_output = TFElectraOutput(config, name="output") self.bert_output = TFElectraOutput(config, name="output")
def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
attention_outputs = self.attention( attention_outputs = self.attention(
hidden_states, attention_mask, head_mask, output_attentions, training=training input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
) )
attention_output = attention_outputs[0] attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output) intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(intermediate_output, attention_output, training=training) layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
return outputs return outputs
@@ -378,21 +411,21 @@ class TFElectraLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra
class TFElectraEncoder(tf.keras.layers.Layer): class TFElectraEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.layer = [TFElectraLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] self.layer = [TFElectraLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call( def call(
self, self,
hidden_states, hidden_states: tf.Tensor,
attention_mask, attention_mask: tf.Tensor,
head_mask, head_mask: tf.Tensor,
output_attentions, output_attentions: bool,
output_hidden_states, output_hidden_states: bool,
return_dict, return_dict: bool,
training=False, training: bool = False,
): ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None all_attentions = () if output_attentions else None
@@ -401,7 +434,11 @@ class TFElectraEncoder(tf.keras.layers.Layer):
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module( layer_outputs = layer_module(
hidden_states, attention_mask, head_mask[i], output_attentions, training=training hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
output_attentions=output_attentions,
training=training,
) )
hidden_states = layer_outputs[0] hidden_states = layer_outputs[0]
@@ -420,27 +457,28 @@ class TFElectraEncoder(tf.keras.layers.Layer):
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra
class TFElectraPooler(tf.keras.layers.Layer): class TFElectraPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: ElectraConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
name="dense", name="dense",
) )
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding # We "pool" the model by simply taking the hidden state corresponding
# to the first token. # to the first token.
first_token_tensor = hidden_states[:, 0] first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor) pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output return pooled_output
# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra
class TFElectraEmbeddings(tf.keras.layers.Layer): class TFElectraEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" """Construct the embeddings from word, position and token_type embeddings."""
@@ -469,8 +507,15 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings.call with Albert->Electra # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): def call(
self,
input_ids: tf.Tensor,
position_ids: tf.Tensor,
token_type_ids: tf.Tensor,
inputs_embeds: tf.Tensor,
training: bool = False,
) -> tf.Tensor:
""" """
Applies embedding based on inputs tensor. Applies embedding based on inputs tensor.
@@ -1097,7 +1142,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1215,7 +1260,7 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1356,13 +1401,14 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
} }
] ]
) )
def serving(self, inputs): # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving
output = self.call(inputs) def serving(self, inputs: Dict[str, tf.Tensor]):
output = self.call(input_ids=inputs)
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1460,7 +1506,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1575,7 +1621,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -16,7 +16,7 @@
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Any, Dict, Optional, Tuple
import tensorflow as tf import tensorflow as tf
@@ -83,16 +83,16 @@ class TFFunnelWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -102,14 +102,14 @@ class TFFunnelWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -1436,7 +1436,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss)
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1526,7 +1526,7 @@ class TFFunnelForSequenceClassification(TFFunnelPreTrainedModel, TFSequenceClass
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1656,13 +1656,13 @@ class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss):
} }
] ]
) )
def serving(self, inputs): def serving(self, inputs: Dict[str, tf.Tensor]):
output = self.call(inputs) output = self.call(input_ids=inputs)
return self.serving_output(output) return self.serving_output(output=output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1755,7 +1755,7 @@ class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificat
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1860,7 +1860,7 @@ class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringL
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -16,7 +16,7 @@
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Any, Dict, Optional, Tuple
import tensorflow as tf import tensorflow as tf
@@ -424,16 +424,16 @@ class TFLongformerWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -443,14 +443,14 @@ class TFLongformerWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -464,16 +464,16 @@ class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"type_vocab_size": self.type_vocab_size, "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -483,15 +483,15 @@ class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids): def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -508,7 +508,7 @@ class TFLongformerPositionEmbeddings(tf.keras.layers.Layer):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
@@ -527,10 +527,10 @@ class TFLongformerPositionEmbeddings(tf.keras.layers.Layer):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -638,8 +638,8 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
) )
mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask incremental_indices = tf.math.cumsum(mask, axis=1) * mask
return incremental_indices + self.padding_idx return incremental_indices + self.padding_idx
@@ -689,34 +689,34 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Longformer
class TFLongformerIntermediate(tf.keras.layers.Layer): class TFLongformerIntermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: LongformerConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abc,cd->abd", equation="abc,cd->abd",
output_shape=(None, config.intermediate_size), output_shape=(None, config.intermediate_size),
bias_axes="d", bias_axes="d",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
) )
if isinstance(config.hidden_act, str): if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else: else:
self.intermediate_act_fn = config.hidden_act self.intermediate_act_fn = config.hidden_act
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Longformer
class TFLongformerOutput(tf.keras.layers.Layer): class TFLongformerOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: LongformerConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
@@ -729,7 +729,7 @@ class TFLongformerOutput(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False): def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
@@ -737,23 +737,23 @@ class TFLongformerOutput(tf.keras.layers.Layer):
return hidden_states return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Longformer
class TFLongformerPooler(tf.keras.layers.Layer): class TFLongformerPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: LongformerConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
name="dense", name="dense",
) )
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding # We "pool" the model by simply taking the hidden state corresponding
# to the first token. # to the first token.
first_token_tensor = hidden_states[:, 0] first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor) pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output return pooled_output

View File

@@ -18,7 +18,7 @@
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Dict, Optional, Tuple from typing import Any, Dict, Optional, Tuple
import tensorflow as tf import tensorflow as tf
@@ -186,16 +186,16 @@ class TFLxmertWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -205,14 +205,14 @@ class TFLxmertWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -226,16 +226,16 @@ class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"type_vocab_size": self.type_vocab_size, "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -245,15 +245,15 @@ class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids): def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -267,16 +267,16 @@ class TFLxmertPositionEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"max_position_embeddings": self.max_position_embeddings, "max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -286,8 +286,8 @@ class TFLxmertPositionEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids): def call(self, position_ids: tf.Tensor) -> tf.Tensor:
input_shape = shape_list(tensor=position_ids) input_shape = shape_list(position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :] position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape) return tf.broadcast_to(input=position_embeddings, shape=input_shape)
@@ -1132,11 +1132,13 @@ class TFLxmertPooler(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert
class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: LxmertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
) )
if isinstance(config.hidden_act, str): if isinstance(config.hidden_act, str):
@@ -1146,17 +1148,17 @@ class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states) hidden_states = self.LayerNorm(inputs=hidden_states)
return hidden_states return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert
class TFLxmertLMPredictionHead(tf.keras.layers.Layer): class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
@@ -1168,28 +1170,28 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token. # an output-only bias for each token.
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
def get_output_embeddings(self): def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value): def set_output_embeddings(self, value: tf.Variable):
self.input_embeddings.weight = value self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0] self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self) -> Dict[str, tf.Variable]:
return {"bias": self.bias} return {"bias": self.bias}
def set_bias(self, value): def set_bias(self, value: tf.Variable):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.transform(hidden_states=hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(tensor=hidden_states)[1] seq_length = shape_list(hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
@@ -1200,13 +1202,13 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert
class TFLxmertMLMHead(tf.keras.layers.Layer): class TFLxmertMLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions") self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output): def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
prediction_scores = self.predictions(sequence_output) prediction_scores = self.predictions(hidden_states=sequence_output)
return prediction_scores return prediction_scores

View File

@@ -17,7 +17,7 @@
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Any, Dict, Optional, Tuple
import tensorflow as tf import tensorflow as tf
@@ -116,16 +116,16 @@ class TFMobileBertWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -135,14 +135,14 @@ class TFMobileBertWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -156,16 +156,16 @@ class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.token_type_embeddings = self.add_weight( self.token_type_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.type_vocab_size, self.hidden_size], shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"type_vocab_size": self.type_vocab_size, "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -175,15 +175,15 @@ class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids): def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -197,16 +197,16 @@ class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"max_position_embeddings": self.max_position_embeddings, "max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -216,8 +216,8 @@ class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids): def call(self, position_ids: tf.Tensor) -> tf.Tensor:
input_shape = shape_list(tensor=position_ids) input_shape = shape_list(position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :] position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape) return tf.broadcast_to(input=position_embeddings, shape=input_shape)
@@ -1085,7 +1085,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output
def serving_output(self, output): def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1299,7 +1299,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1413,7 +1413,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel, TFNextS
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForNextSentencePrediction.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForNextSentencePrediction.serving_output
def serving_output(self, output): def serving_output(self, output: TFNextSentencePredictorOutput) -> TFNextSentencePredictorOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1522,7 +1522,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1643,7 +1643,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1796,13 +1796,14 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
} }
] ]
) )
def serving(self, inputs): # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving
output = self.call(inputs) def serving(self, inputs: Dict[str, tf.Tensor]):
output = self.call(input_ids=inputs)
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1911,7 +1912,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -18,6 +18,7 @@
import math import math
import warnings import warnings
from typing import Any, Dict
import tensorflow as tf import tensorflow as tf
@@ -95,16 +96,16 @@ class TFMPNetWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape: tf.TensorShape):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=[self.vocab_size, self.hidden_size], shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape=input_shape) super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -114,14 +115,14 @@ class TFMPNetWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids): def call(self, input_ids: tf.Tensor) -> tf.Tensor:
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -139,7 +140,7 @@ class TFMPNetPositionEmbeddings(tf.keras.layers.Layer):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
@@ -158,10 +159,10 @@ class TFMPNetPositionEmbeddings(tf.keras.layers.Layer):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -207,8 +208,8 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
) )
mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask incremental_indices = tf.math.cumsum(mask, axis=1) * mask
return incremental_indices + self.padding_idx return incremental_indices + self.padding_idx
@@ -253,23 +254,23 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->MPNet
class TFMPNetPooler(tf.keras.layers.Layer): class TFMPNetPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: MPNetConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
name="dense", name="dense",
) )
def call(self, hidden_states): def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding # We "pool" the model by simply taking the hidden state corresponding
# to the first token. # to the first token.
first_token_tensor = hidden_states[:, 0] first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor) pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output return pooled_output
@@ -291,28 +292,28 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="q", name="q",
) )
self.k = tf.keras.layers.experimental.EinsumDense( self.k = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="k", name="k",
) )
self.v = tf.keras.layers.experimental.EinsumDense( self.v = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="v", name="v",
) )
self.o = tf.keras.layers.experimental.EinsumDense( self.o = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
bias_axes="e", bias_axes="e",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="o", name="o",
) )
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
@@ -322,8 +323,8 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
k = self.k(hidden_states) k = self.k(hidden_states)
v = self.v(hidden_states) v = self.v(hidden_states)
dk = tf.cast(x=self.attention_head_size, dtype=q.dtype) dk = tf.cast(self.attention_head_size, dtype=q.dtype)
q = tf.multiply(x=q, y=tf.math.rsqrt(x=dk)) q = tf.multiply(q, y=tf.math.rsqrt(dk))
attention_scores = tf.einsum("aecd,abcd->acbe", k, q) attention_scores = tf.einsum("aecd,abcd->acbe", k, q)
# Apply relative position embedding (precomputed in MPNetEncoder) if provided. # Apply relative position embedding (precomputed in MPNetEncoder) if provided.
@@ -368,34 +369,34 @@ class TFMPNetAttention(tf.keras.layers.Layer):
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet
class TFMPNetIntermediate(tf.keras.layers.Layer):
    """Einsum dense projection to ``config.intermediate_size`` followed by the configured activation."""

    def __init__(self, config: MPNetConfig, **kwargs):
        super().__init__(**kwargs)

        # "abc,cd->abd": the last input axis is projected to ``intermediate_size``.
        kernel_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.experimental.EinsumDense(
            equation="abc,cd->abd",
            output_shape=(None, config.intermediate_size),
            bias_axes="d",
            kernel_initializer=kernel_init,
            name="dense",
        )

        # ``hidden_act`` may be an activation name (resolved here) or an object used as-is.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the dense projection, then the activation, to ``hidden_states``."""
        projected = self.dense(inputs=hidden_states)

        return self.intermediate_act_fn(projected)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet
class TFMPNetOutput(tf.keras.layers.Layer): class TFMPNetOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: MPNetConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
@@ -408,7 +409,7 @@ class TFMPNetOutput(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False): def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
@@ -563,11 +564,11 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
self.embeddings = TFMPNetEmbeddings(config, name="embeddings") self.embeddings = TFMPNetEmbeddings(config, name="embeddings")
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self) -> tf.keras.layers.Layer:
    """Return the ``word_embeddings`` sub-layer held by ``self.embeddings``."""
    word_embeddings = self.embeddings.word_embeddings

    return word_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value: tf.Variable):
    """Install ``value`` as the word-embedding weight and record its first dimension as ``vocab_size``."""
    word_embeddings = self.embeddings.word_embeddings
    word_embeddings.weight = value
    # Keep the recorded vocabulary size in sync with the new weight.
    word_embeddings.vocab_size = shape_list(value)[0]
@@ -820,7 +821,7 @@ class TFMPNetModel(TFMPNetPreTrainedModel):
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output
def serving_output(self, output): def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -973,7 +974,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1095,7 +1096,7 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1233,7 +1234,7 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1333,7 +1334,7 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1446,7 +1447,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -663,7 +663,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.serving_output
def serving_output(self, output): def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -965,7 +965,7 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -16,7 +16,9 @@
""" TF 2.0 RoBERTa model. """ """ TF 2.0 RoBERTa model. """
import warnings import warnings
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf import tensorflow as tf
from ...activations_tf import get_tf_activation from ...activations_tf import get_tf_activation
@@ -37,6 +39,7 @@ from ...modeling_tf_outputs import (
) )
from ...modeling_tf_utils import ( from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
TFQuestionAnsweringLoss, TFQuestionAnsweringLoss,
@@ -74,16 +77,16 @@ class TFRobertaWordEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape: tf.TensorShape):
    """Create the ``[vocab_size, hidden_size]`` weight named "weight", then defer to the parent ``build``."""
    weight_shape = [self.vocab_size, self.hidden_size]
    weight_init = get_initializer(self.initializer_range)
    self.weight = self.add_weight(name="weight", shape=weight_shape, initializer=weight_init)

    super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"vocab_size": self.vocab_size, "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -93,14 +96,14 @@ class TFRobertaWordEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids: tf.Tensor) -> tf.Tensor:
    """Gather one row of ``self.weight`` per id; the result has the shape of ``input_ids`` plus ``hidden_size``."""
    # Look the ids up on a flattened view, then restore the leading dimensions.
    flat_ids = tf.reshape(tensor=input_ids, shape=[-1])
    gathered = tf.gather(params=self.weight, indices=flat_ids)
    target_shape = tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0)
    embeddings = tf.reshape(tensor=gathered, shape=target_shape)

    # Pin the static shape from the input's static dimensions.
    embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size])

    return embeddings
@@ -114,16 +117,16 @@ class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
def build(self, input_shape: tf.TensorShape):
    """Create the ``[type_vocab_size, hidden_size]`` weight named "embeddings", then defer to the parent ``build``."""
    table_shape = [self.type_vocab_size, self.hidden_size]
    table_init = get_initializer(self.initializer_range)
    self.token_type_embeddings = self.add_weight(name="embeddings", shape=table_shape, initializer=table_init)

    super().build(input_shape)
def get_config(self): def get_config(self) -> Dict[str, Any]:
config = { config = {
"type_vocab_size": self.type_vocab_size, "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size, "hidden_size": self.hidden_size,
@@ -133,15 +136,15 @@ class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer):
return dict(list(base_config.items()) + list(config.items())) return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
    """Embed ``token_type_ids``; the result has the shape of the ids plus a trailing ``hidden_size`` axis."""
    flat_ids = tf.reshape(tensor=token_type_ids, shape=[-1])

    # The lookup is written as a one-hot matrix multiplied by the embedding table.
    one_hot = tf.one_hot(indices=flat_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
    flat_embeddings = tf.matmul(a=one_hot, b=self.token_type_embeddings)

    target_shape = tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0)
    embeddings = tf.reshape(tensor=flat_embeddings, shape=target_shape)

    # Pin the static shape from the input's static dimensions.
    embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size])

    return embeddings
@@ -159,7 +162,7 @@ class TFRobertaPositionEmbeddings(tf.keras.layers.Layer):
self.position_embeddings = self.add_weight( self.position_embeddings = self.add_weight(
name="embeddings", name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size], shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
@@ -178,10 +181,10 @@ class TFRobertaPositionEmbeddings(tf.keras.layers.Layer):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape( embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0)
) )
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size])
return embeddings return embeddings
@@ -235,8 +238,8 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
) )
mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask incremental_indices = tf.math.cumsum(mask, axis=1) * mask
return incremental_indices + self.padding_idx return incremental_indices + self.padding_idx
@@ -286,30 +289,30 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Roberta
class TFRobertaPooler(tf.keras.layers.Layer):
    """Pool a sequence by passing the hidden state of its first token through a tanh dense layer."""

    def __init__(self, config: RobertaConfig, **kwargs):
        super().__init__(**kwargs)

        kernel_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=kernel_init,
            activation="tanh",
            name="dense",
        )

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return the dense projection of ``hidden_states[:, 0]``."""
        # "Pooling" here just means keeping the hidden state of the first token.
        first_token_state = hidden_states[:, 0]

        return self.dense(inputs=first_token_state)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta
class TFRobertaSelfAttention(tf.keras.layers.Layer): class TFRobertaSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0: if config.hidden_size % config.num_attention_heads != 0:
@@ -324,50 +327,57 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="query", name="query",
) )
self.key = tf.keras.layers.experimental.EinsumDense( self.key = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="key", name="key",
) )
self.value = tf.keras.layers.experimental.EinsumDense( self.value = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
bias_axes="de", bias_axes="de",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="value", name="value",
) )
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
def call(
    self,
    hidden_states: tf.Tensor,
    attention_mask: tf.Tensor,
    head_mask: tf.Tensor,
    output_attentions: bool,
    training: bool = False,
) -> Tuple[tf.Tensor]:
    """
    Compute scaled dot-product self-attention over ``hidden_states``.

    Args:
        hidden_states: input fed to the query, key and value projections.
        attention_mask: additive mask added to the raw scores before the softmax, or ``None`` to skip masking.
        head_mask: multiplicative mask applied to the attention probabilities, or ``None`` to leave them as is.
        output_attentions: whether the attention probabilities are returned as well.
        training: forwarded to the dropout layer.

    Returns:
        ``(attention_output,)``, or ``(attention_output, attention_probs)`` when ``output_attentions`` is true.
    """
    query_layer = self.query(inputs=hidden_states)
    key_layer = self.key(inputs=hidden_states)
    value_layer = self.value(inputs=hidden_states)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores. The 1/sqrt(head_size) scaling is folded into the query.
    dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype)
    query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk))
    attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)

    if attention_mask is not None:
        # Apply the attention mask (precomputed for all layers in TFRobertaModel call() function)
        attention_scores = tf.add(attention_scores, attention_mask)

    # Normalize the attention scores to probabilities.
    attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(inputs=attention_probs, training=training)

    # Mask heads if we want to. The mask has to be applied to the probabilities:
    # the scores are not read again after the softmax, so masking them had no effect.
    # NOTE(review): this class is marked as copied from TFBertSelfAttention, so the
    # BERT original presumably needs the same change -- confirm.
    if head_mask is not None:
        attention_probs = tf.multiply(attention_probs, head_mask)

    attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
    outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

    return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta
class TFRobertaSelfOutput(tf.keras.layers.Layer): class TFRobertaSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0: if config.hidden_size % config.num_attention_heads != 0:
@@ -393,13 +403,13 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
bias_axes="e", bias_axes="e",
kernel_initializer=get_initializer(initializer_range=config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False): def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
@@ -409,7 +419,7 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta
class TFRobertaAttention(tf.keras.layers.Layer): class TFRobertaAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.self_attention = TFRobertaSelfAttention(config, name="self") self.self_attention = TFRobertaSelfAttention(config, name="self")
@@ -418,44 +428,57 @@ class TFRobertaAttention(tf.keras.layers.Layer):
def prune_heads(self, heads): def prune_heads(self, heads):
raise NotImplementedError raise NotImplementedError
def call(
    self,
    input_tensor: tf.Tensor,
    attention_mask: tf.Tensor,
    head_mask: tf.Tensor,
    output_attentions: bool,
    training: bool = False,
) -> Tuple[tf.Tensor]:
    """Run ``self_attention`` on ``input_tensor`` and pass its first output, with the input, to ``dense_output``."""
    self_outputs = self.self_attention(
        hidden_states=input_tensor,
        attention_mask=attention_mask,
        head_mask=head_mask,
        output_attentions=output_attentions,
        training=training,
    )
    attention_output = self.dense_output(
        hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
    )

    # Anything after the first element (the attentions, if we output them) is passed through unchanged.
    return (attention_output, *self_outputs[1:])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta
class TFRobertaIntermediate(tf.keras.layers.Layer):
    """Einsum dense projection to ``config.intermediate_size`` followed by the configured activation."""

    def __init__(self, config: RobertaConfig, **kwargs):
        super().__init__(**kwargs)

        # "abc,cd->abd": the last input axis is projected to ``intermediate_size``.
        kernel_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.experimental.EinsumDense(
            equation="abc,cd->abd",
            output_shape=(None, config.intermediate_size),
            bias_axes="d",
            kernel_initializer=kernel_init,
            name="dense",
        )

        # ``hidden_act`` may be an activation name (resolved here) or an object used as-is.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the dense projection, then the activation, to ``hidden_states``."""
        projected = self.dense(inputs=hidden_states)

        return self.intermediate_act_fn(projected)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta
class TFRobertaOutput(tf.keras.layers.Layer): class TFRobertaOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
@@ -468,7 +491,7 @@ class TFRobertaOutput(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False): def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
@@ -478,20 +501,33 @@ class TFRobertaOutput(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta
class TFRobertaLayer(tf.keras.layers.Layer):
    """One encoder block: the attention sub-layer, then the intermediate sub-layer, then the output sub-layer."""

    def __init__(self, config: RobertaConfig, **kwargs):
        super().__init__(**kwargs)

        self.attention = TFRobertaAttention(config, name="attention")
        self.intermediate = TFRobertaIntermediate(config, name="intermediate")
        # The attribute is called ``bert_output`` while the Keras layer name is "output".
        self.bert_output = TFRobertaOutput(config, name="output")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Return ``(layer_output,)`` followed by whatever the attention sub-layer returned after its first item."""
        attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            training=training,
        )
        attention_output, extra_outputs = attention_outputs[0], attention_outputs[1:]

        intermediate_output = self.intermediate(hidden_states=attention_output)
        layer_output = self.bert_output(
            hidden_states=intermediate_output, input_tensor=attention_output, training=training
        )

        # add attentions if we output them
        return (layer_output,) + extra_outputs
@@ -499,21 +535,21 @@ class TFRobertaLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta
class TFRobertaEncoder(tf.keras.layers.Layer): class TFRobertaEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config: RobertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.layer = [TFRobertaLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] self.layer = [TFRobertaLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call( def call(
self, self,
hidden_states, hidden_states: tf.Tensor,
attention_mask, attention_mask: tf.Tensor,
head_mask, head_mask: tf.Tensor,
output_attentions, output_attentions: bool,
output_hidden_states, output_hidden_states: bool,
return_dict, return_dict: bool,
training=False, training: bool = False,
): ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None all_attentions = () if output_attentions else None
@@ -522,7 +558,11 @@ class TFRobertaEncoder(tf.keras.layers.Layer):
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module( layer_outputs = layer_module(
hidden_states, attention_mask, head_mask[i], output_attentions, training=training hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
output_attentions=output_attentions,
training=training,
) )
hidden_states = layer_outputs[0] hidden_states = layer_outputs[0]
@@ -560,11 +600,11 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
self.embeddings = TFRobertaEmbeddings(config, name="embeddings") self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self): def get_input_embeddings(self) -> tf.keras.layers.Layer:
return self.embeddings.word_embeddings return self.embeddings.word_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value: tf.Variable):
self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
@@ -579,18 +619,18 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call
def call( def call(
self, self,
input_ids=None, input_ids: Optional[TFModelInputType] = None,
attention_mask=None, attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
token_type_ids=None, token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
position_ids=None, position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
head_mask=None, head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
inputs_embeds=None, inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
output_attentions=None, output_attentions: Optional[bool] = None,
output_hidden_states=None, output_hidden_states: Optional[bool] = None,
return_dict=None, return_dict: Optional[bool] = None,
training=False, training: bool = False,
**kwargs, **kwargs,
): ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
inputs = input_processing( inputs = input_processing(
func=self.call, func=self.call,
config=self.config, config=self.config,
@@ -610,23 +650,23 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif inputs["input_ids"] is not None: elif inputs["input_ids"] is not None:
input_shape = shape_list(inputs["input_ids"]) input_shape = shape_list(tensor=inputs["input_ids"])
elif inputs["inputs_embeds"] is not None: elif inputs["inputs_embeds"] is not None:
input_shape = shape_list(inputs["inputs_embeds"])[:-1] input_shape = shape_list(tensor=inputs["inputs_embeds"])[:-1]
else: else:
raise ValueError("You have to specify either input_ids or inputs_embeds") raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs["attention_mask"] is None: if inputs["attention_mask"] is None:
inputs["attention_mask"] = tf.fill(input_shape, 1) inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
if inputs["token_type_ids"] is None: if inputs["token_type_ids"] is None:
inputs["token_type_ids"] = tf.fill(input_shape, 0) inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0)
embedding_output = self.embeddings( embedding_output = self.embeddings(
inputs["input_ids"], input_ids=inputs["input_ids"],
inputs["position_ids"], position_ids=inputs["position_ids"],
inputs["token_type_ids"], token_type_ids=inputs["token_type_ids"],
inputs["inputs_embeds"], inputs_embeds=inputs["inputs_embeds"],
training=inputs["training"], training=inputs["training"],
) )
@@ -642,8 +682,8 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
# positions we want to attend and -10000.0 for masked positions. # positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is # Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely. # effectively the same as removing these entirely.
extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0)
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
@@ -653,21 +693,20 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
if inputs["head_mask"] is not None: if inputs["head_mask"] is not None:
raise NotImplementedError raise NotImplementedError
else: else:
inputs["head_mask"] = [None] * self.num_hidden_layers inputs["head_mask"] = [None] * self.config.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
encoder_outputs = self.encoder( encoder_outputs = self.encoder(
embedding_output, hidden_states=embedding_output,
extended_attention_mask, attention_mask=extended_attention_mask,
inputs["head_mask"], head_mask=inputs["head_mask"],
inputs["output_attentions"], output_attentions=inputs["output_attentions"],
inputs["output_hidden_states"], output_hidden_states=inputs["output_hidden_states"],
inputs["return_dict"], return_dict=inputs["return_dict"],
training=inputs["training"], training=inputs["training"],
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
if not inputs["return_dict"]: if not inputs["return_dict"]:
return ( return (
@@ -860,7 +899,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
return outputs return outputs
# Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output
def serving_output(self, output): def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1016,7 +1055,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output
def serving_output(self, output): def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1139,7 +1178,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1283,7 +1322,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1386,7 +1425,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1501,7 +1540,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

View File

@@ -19,7 +19,7 @@
import itertools import itertools
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Dict, Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@@ -1019,7 +1019,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1180,13 +1180,14 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
} }
] ]
) )
def serving(self, inputs): # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving
output = self.call(inputs) def serving(self, inputs: Dict[str, tf.Tensor]):
output = self.call(input_ids=inputs)
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output
def serving_output(self, output): def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1294,7 +1295,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output
def serving_output(self, output): def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1413,7 +1414,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
) )
# Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output
def serving_output(self, output): def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None