From 4adbdce5ee4f784a56b9bf9252c3f6a007f9daca Mon Sep 17 00:00:00 2001 From: Julien Plu Date: Wed, 27 Jan 2021 11:28:11 +0100 Subject: [PATCH] Clean TF Bert (#9788) * Start cleaning BERT * Clean BERT and all those depends of it * Fix attribute name * Apply style * Apply Sylvain's comments * Apply Lysandre's comments * remove unused import --- src/transformers/modeling_tf_utils.py | 4 + .../models/albert/modeling_tf_albert.py | 66 +- .../models/bert/modeling_tf_bert.py | 739 ++++++++++-------- .../models/ctrl/modeling_tf_ctrl.py | 2 +- .../distilbert/modeling_tf_distilbert.py | 35 +- .../models/electra/modeling_tf_electra.py | 196 +++-- .../models/funnel/modeling_tf_funnel.py | 32 +- .../longformer/modeling_tf_longformer.py | 66 +- .../models/lxmert/modeling_tf_lxmert.py | 74 +- .../mobilebert/modeling_tf_mobilebert.py | 59 +- .../models/mpnet/modeling_tf_mpnet.py | 79 +- .../models/openai/modeling_tf_openai.py | 4 +- .../models/roberta/modeling_tf_roberta.py | 245 +++--- .../models/xlm/modeling_tf_xlm.py | 15 +- ...tf_{{cookiecutter.lowercase_modelname}}.py | 738 +++++++++-------- 15 files changed, 1295 insertions(+), 1059 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index be463c0fb7..beb18cba08 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -46,6 +46,10 @@ from .utils import logging logger = logging.get_logger(__name__) tf_logger = tf.get_logger() +TFModelInputType = Union[ + List[tf.Tensor], List[np.ndarray], Dict[str, tf.Tensor], Dict[str, np.ndarray], np.ndarray, tf.Tensor +] + class TFModelUtilsMixin: """ diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 54bb69eba4..b889fb1a61 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -17,7 +17,7 @@ from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -82,16 +82,16 @@ class TFAlbertWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -101,14 +101,14 @@ class TFAlbertWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -122,16 +122,16 @@ class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -141,15 +141,15 @@ class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -163,16 +163,16 @@ class TFAlbertPositionEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -182,8 +182,8 @@ class TFAlbertPositionEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -218,7 +218,14 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -879,7 +886,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel): return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1105,7 +1112,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1208,7 +1215,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1310,7 +1317,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1425,7 +1432,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1572,13 +1579,14 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 890b362ac4..3f41efedb3 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -17,8 +17,9 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union +import numpy as np import tensorflow as tf from ...activations_tf import get_tf_activation @@ -44,6 +45,7 @@ from ...modeling_tf_outputs import ( from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFMaskedLanguageModelingLoss, + TFModelInputType, TFMultipleChoiceLoss, TFNextSentencePredictionLoss, TFPreTrainedModel, @@ -96,27 +98,31 @@ class TFBertPreTrainingLoss: computation. """ - def compute_loss(self, labels, logits): + def compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=tf.keras.losses.Reduction.NONE ) # make sure only labels that are not equal to -100 # are taken into account as loss - masked_lm_active_loss = tf.not_equal(tf.reshape(labels["labels"], (-1,)), -100) + masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100) masked_lm_reduced_logits = tf.boolean_mask( - tf.reshape(logits[0], (-1, shape_list(logits[0])[2])), - masked_lm_active_loss, + tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])), + mask=masked_lm_active_loss, + ) + masked_lm_labels = tf.boolean_mask( + tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss + ) + next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100) + next_sentence_reduced_logits = tf.boolean_mask( + tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss ) - masked_lm_labels = tf.boolean_mask(tf.reshape(labels["labels"], (-1,)), masked_lm_active_loss) - next_sentence_active_loss = tf.not_equal(tf.reshape(labels["next_sentence_label"], (-1,)), -100) - next_sentence_reduced_logits = tf.boolean_mask(tf.reshape(logits[1], (-1, 2)), next_sentence_active_loss) next_sentence_label = tf.boolean_mask( - tf.reshape(labels["next_sentence_label"], (-1,)), mask=next_sentence_active_loss + tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss ) - masked_lm_loss = loss_fn(masked_lm_labels, masked_lm_reduced_logits) - next_sentence_loss = loss_fn(next_sentence_label, next_sentence_reduced_logits) - masked_lm_loss = tf.reshape(masked_lm_loss, (-1, shape_list(next_sentence_loss)[0])) - masked_lm_loss = tf.reduce_mean(masked_lm_loss, 0) + masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits) + next_sentence_loss = loss_fn(y_true=next_sentence_label, y_pred=next_sentence_reduced_logits) + masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(next_sentence_loss)[0])) + masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0) return masked_lm_loss + next_sentence_loss @@ -129,16 +135,16 @@ class TFBertWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -148,14 +154,14 @@ class TFBertWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -168,16 +174,16 @@ class TFBertTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -187,15 +193,15 @@ class TFBertTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -208,16 +214,16 @@ class TFBertPositionEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -227,8 +233,8 @@ class TFBertPositionEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -237,7 +243,7 @@ class TFBertPositionEmbeddings(tf.keras.layers.Layer): class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.word_embeddings = TFBertWordEmbeddings( @@ -262,7 +268,14 @@ class TFBertEmbeddings(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -272,18 +285,18 @@ class TFBertEmbeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = self.word_embeddings(input_ids) if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] + input_shape = shape_list(inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) + position_embeds = self.position_embeddings(inputs_embeds) else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_embeds = self.position_embeddings(position_ids) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -292,7 +305,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): class TFBertSelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -307,50 +320,57 @@ class TFBertSelfAttention(tf.keras.layers.Layer): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -359,7 +379,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): class TFBertSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -375,13 +395,13 @@ class TFBertSelfOutput(tf.keras.layers.Layer): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -390,7 +410,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer): class TFBertAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFBertSelfAttention(config, name="self") @@ -399,34 +419,47 @@ class TFBertAttention(tf.keras.layers.Layer): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class TFBertIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -434,7 +467,7 @@ class TFBertIntermediate(tf.keras.layers.Layer): class TFBertOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -447,7 +480,7 @@ class TFBertOutput(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -456,41 +489,54 @@ class TFBertOutput(tf.keras.layers.Layer): class TFBertLayer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.attention = TFBertAttention(config, name="attention") self.intermediate = TFBertIntermediate(config, name="intermediate") self.bert_output = TFBertOutput(config, name="output") - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.bert_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs class TFBertEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -499,7 +545,11 @@ class TFBertEncoder(tf.keras.layers.Layer): all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -519,31 +569,33 @@ class TFBertEncoder(tf.keras.layers.Layer): class TFBertPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output class TFBertPredictionHeadTransform(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): @@ -553,16 +605,16 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def call(self, hidden_states): - hidden_states = self.dense(hidden_states) + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states class TFBertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size @@ -574,28 +626,28 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) - def get_output_embeddings(self): + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings - def set_output_embeddings(self, value): + def set_output_embeddings(self, value: tf.Variable): self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] - def get_bias(self): + def get_bias(self) -> Dict[str, tf.Variable]: return {"bias": self.bias} - def set_bias(self, value): + def set_bias(self, value: tf.Variable): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.transform(hidden_states=hidden_states) - seq_length = shape_list(tensor=hidden_states)[1] + seq_length = shape_list(hidden_states)[1] hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) @@ -605,27 +657,29 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): class TFBertMLMHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") - def call(self, sequence_output): - prediction_scores = self.predictions(sequence_output) + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores class TFBertNSPHead(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense( - 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + units=2, + kernel_initializer=get_initializer(config.initializer_range), + name="seq_relationship", ) - def call(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) + def call(self, pooled_output: tf.Tensor) -> tf.Tensor: + seq_relationship_score = self.seq_relationship(inputs=pooled_output) return seq_relationship_score @@ -634,7 +688,7 @@ class TFBertNSPHead(tf.keras.layers.Layer): class TFBertMainLayer(tf.keras.layers.Layer): config_class = BertConfig - def __init__(self, config, add_pooling_layer=True, **kwargs): + def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs): super().__init__(**kwargs) self.config = config @@ -643,10 +697,10 @@ class TFBertMainLayer(tf.keras.layers.Layer): self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self): + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings - def set_input_embeddings(self, value): + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] @@ -659,18 +713,18 @@ class TFBertMainLayer(tf.keras.layers.Layer): def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -697,16 +751,16 @@ class TFBertMainLayer(tf.keras.layers.Layer): raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs["attention_mask"] is None: - inputs["attention_mask"] = tf.fill(input_shape, 1) + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) if inputs["token_type_ids"] is None: - inputs["token_type_ids"] = tf.fill(input_shape, 0) + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) embedding_output = self.embeddings( - inputs["input_ids"], - inputs["position_ids"], - inputs["token_type_ids"], - inputs["inputs_embeds"], + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], training=inputs["training"], ) @@ -722,8 +776,8 @@ class TFBertMainLayer(tf.keras.layers.Layer): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -736,17 +790,17 @@ class TFBertMainLayer(tf.keras.layers.Layer): inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - inputs["head_mask"], - inputs["output_attentions"], - inputs["output_hidden_states"], - inputs["return_dict"], + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], training=inputs["training"], ) sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None if not inputs["return_dict"]: return ( @@ -799,8 +853,8 @@ class TFBertForPreTrainingOutput(ModelOutput): loss: Optional[tf.Tensor] = None prediction_logits: tf.Tensor = None seq_relationship_logits: tf.Tensor = None - hidden_states: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[tf.Tensor]] = None + hidden_states: Optional[Union[Tuple[tf.Tensor], tf.Tensor]] = None + attentions: Optional[Union[Tuple[tf.Tensor], tf.Tensor]] = None BERT_START_DOCSTRING = r""" @@ -841,7 +895,7 @@ BERT_START_DOCSTRING = r""" BERT_INPUTS_DOCSTRING = r""" Args: - input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`): + input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.BertTokenizer`. See @@ -849,14 +903,14 @@ BERT_INPUTS_DOCSTRING = r""" details. `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: @@ -864,18 +918,18 @@ BERT_INPUTS_DOCSTRING = r""" - 1 corresponds to a `sentence B` token. `What are token type IDs? <../glossary.html#token-type-ids>`__ - position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ - head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. @@ -901,7 +955,7 @@ BERT_INPUTS_DOCSTRING = r""" BERT_START_DOCSTRING, ) class TFBertModel(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @@ -915,18 +969,18 @@ class TFBertModel(TFBertPreTrainedModel): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -957,7 +1011,7 @@ class TFBertModel(TFBertPreTrainedModel): return outputs - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -980,17 +1034,17 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model _keys_to_ignore_on_load_unexpected = [r"cls.predictions.decoder.weight"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") - self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions - def get_prefix_bias_name(self): + def get_prefix_bias_name(self) -> str: warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name @@ -998,20 +1052,20 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - next_sentence_label=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + next_sentence_label: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFBertForPreTrainingOutput, Tuple[tf.Tensor]]: r""" Return: @@ -1045,7 +1099,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1057,8 +1111,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): training=inputs["training"], ) sequence_output, pooled_output = outputs[:2] - prediction_scores = self.mlm(sequence_output, training=inputs["training"]) - seq_relationship_score = self.nsp(pooled_output) + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + seq_relationship_score = self.nsp(pooled_output=pooled_output) total_loss = None if inputs["labels"] is not None and inputs["next_sentence_label"] is not None: @@ -1077,7 +1131,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFBertForPreTrainingOutput) -> TFBertForPreTrainingOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1099,7 +1153,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): r"nsp___cls", ] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if config.is_decoder: @@ -1109,12 +1163,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ) self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions - def get_prefix_bias_name(self): + def get_prefix_bias_name(self) -> str: warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name @@ -1127,21 +1181,21 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` @@ -1163,7 +1217,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1175,8 +1229,10 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): training=inputs["training"], ) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=inputs["training"]) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], prediction_scores) + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + loss = ( + None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores) + ) if not inputs["return_dict"]: output = (prediction_scores,) + outputs[2:] @@ -1189,7 +1245,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1205,19 +1261,19 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): r"nsp___cls", ] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if not config.is_decoder: logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions - def get_prefix_bias_name(self): + def get_prefix_bias_name(self) -> str: warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name @@ -1229,21 +1285,21 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. """ @@ -1264,7 +1320,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1276,14 +1332,14 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): training=inputs["training"], ) sequence_output = outputs[0] - logits = self.mlm(sequence_output, training=inputs["training"]) + logits = self.mlm(sequence_output=sequence_output, training=inputs["training"]) loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.compute_loss(labels, logits) + loss = self.compute_loss(labels=labels, logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] @@ -1296,7 +1352,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1311,7 +1367,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"cls.predictions"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @@ -1321,19 +1377,19 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - next_sentence_label=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + next_sentence_label: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFNextSentencePredictorOutput, Tuple[tf.Tensor]]: r""" Return: @@ -1369,7 +1425,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1381,7 +1437,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi training=inputs["training"], ) pooled_output = outputs[1] - seq_relationship_scores = self.nsp(pooled_output) + seq_relationship_scores = self.nsp(pooled_output=pooled_output) next_sentence_loss = ( None if inputs["next_sentence_label"] is None @@ -1399,7 +1455,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFNextSentencePredictorOutput) -> TFNextSentencePredictorOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1418,14 +1474,17 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"] _keys_to_ignore_on_load_missing = [r"dropout"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.bert = TFBertMainLayer(config, name="bert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", ) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1437,21 +1496,21 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -1473,7 +1532,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1485,9 +1544,9 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific training=inputs["training"], ) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=inputs["training"]) - logits = self.classifier(pooled_output) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"]) + logits = self.classifier(inputs=pooled_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] @@ -1500,7 +1559,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1519,17 +1578,17 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"] _keys_to_ignore_on_load_missing = [r"dropout"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property - def dummy_inputs(self): + def dummy_inputs(self) -> Dict[str, tf.Tensor]: """ Dummy inputs to build the network. @@ -1547,21 +1606,21 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See :obj:`input_ids` above) @@ -1590,38 +1649,46 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): num_choices = shape_list(inputs["inputs_embeds"])[1] seq_length = shape_list(inputs["inputs_embeds"])[2] - flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None + flat_input_ids = ( + tf.reshape(tensor=inputs["input_ids"], shape=(-1, seq_length)) if inputs["input_ids"] is not None else None + ) flat_attention_mask = ( - tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None + tf.reshape(tensor=inputs["attention_mask"], shape=(-1, seq_length)) + if inputs["attention_mask"] is not None + else None ) flat_token_type_ids = ( - tf.reshape(inputs["token_type_ids"], (-1, seq_length)) if inputs["token_type_ids"] is not None else None + tf.reshape(tensor=inputs["token_type_ids"], shape=(-1, seq_length)) + if inputs["token_type_ids"] is not None + else None ) flat_position_ids = ( - tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None + tf.reshape(tensor=inputs["position_ids"], shape=(-1, seq_length)) + if inputs["position_ids"] is not None + else None ) flat_inputs_embeds = ( - tf.reshape(inputs["inputs_embeds"], (-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) + tf.reshape(tensor=inputs["inputs_embeds"], shape=(-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) if inputs["inputs_embeds"] is not None else None ) outputs = self.bert( - flat_input_ids, - flat_attention_mask, - flat_token_type_ids, - flat_position_ids, - inputs["head_mask"], - flat_inputs_embeds, - inputs["output_attentions"], - inputs["output_hidden_states"], + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=inputs["head_mask"], + inputs_embeds=flat_inputs_embeds, + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], return_dict=inputs["return_dict"], training=inputs["training"], ) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=inputs["training"]) - logits = self.classifier(pooled_output) - reshaped_logits = tf.reshape(logits, (-1, num_choices)) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], reshaped_logits) + pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"]) + logits = self.classifier(inputs=pooled_output) + reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices)) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=reshaped_logits) if not inputs["return_dict"]: output = (reshaped_logits,) + outputs[2:] @@ -1643,12 +1710,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): } ] ) - def serving(self, inputs): - output = self.call(inputs) + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1673,14 +1740,17 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ] _keys_to_ignore_on_load_missing = [r"dropout"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", ) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1692,21 +1762,21 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ @@ -1727,7 +1797,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1739,9 +1809,9 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL training=inputs["training"], ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=inputs["training"]) - logits = self.classifier(sequence_output) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + sequence_output = self.dropout(inputs=sequence_output, training=inputs["training"]) + logits = self.classifier(inputs=sequence_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] @@ -1754,7 +1824,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1778,13 +1848,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) r"cls.seq_relationship", ] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.qa_outputs = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="qa_outputs", ) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1796,26 +1869,26 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - start_positions=None, - end_positions=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + end_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: r""" - start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. @@ -1838,7 +1911,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1850,16 +1923,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) training=inputs["training"], ) sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = tf.split(logits, 2, axis=-1) - start_logits = tf.squeeze(start_logits, axis=-1) - end_logits = tf.squeeze(end_logits, axis=-1) + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) loss = None if inputs["start_positions"] is not None and inputs["end_positions"] is not None: labels = {"start_position": inputs["start_positions"]} labels["end_position"] = inputs["end_positions"] - loss = self.compute_loss(labels, (start_logits, end_logits)) + loss = self.compute_loss(labels=labels, logits=(start_logits, end_logits)) if not inputs["return_dict"]: output = (start_logits, end_logits) + outputs[2:] @@ -1873,7 +1946,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 81930b83fb..ba2eb57e8f 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -919,7 +919,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 3a43976621..d9942026d1 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -17,6 +17,7 @@ """ import warnings +from typing import Any, Dict import tensorflow as tf @@ -76,16 +77,16 @@ class TFDistilBertWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -95,14 +96,14 @@ class TFDistilBertWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -116,16 +117,16 @@ class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -135,8 +136,8 @@ class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -796,7 +797,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -897,7 +898,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -988,7 +989,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1131,7 +1132,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1238,7 +1239,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 5198943f9b..476b2b6172 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union import tensorflow as tf @@ -79,16 +79,16 @@ class TFElectraWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -98,14 +98,14 @@ class TFElectraWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -119,16 +119,16 @@ class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -138,15 +138,15 @@ class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -160,16 +160,16 @@ class TFElectraPositionEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -179,8 +179,8 @@ class TFElectraPositionEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -188,7 +188,7 @@ class TFElectraPositionEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra class TFElectraSelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -203,50 +203,57 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -254,9 +261,9 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra class TFElectraSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -272,13 +279,13 @@ class TFElectraSelfOutput(tf.keras.layers.Layer): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -288,7 +295,7 @@ class TFElectraSelfOutput(tf.keras.layers.Layer): # Copied from from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra class TFElectraAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFElectraSelfAttention(config, name="self") @@ -297,44 +304,57 @@ class TFElectraAttention(tf.keras.layers.Layer): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra class TFElectraIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra class TFElectraOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -347,7 +367,7 @@ class TFElectraOutput(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -357,20 +377,33 @@ class TFElectraOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra class TFElectraLayer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.attention = TFElectraAttention(config, name="attention") self.intermediate = TFElectraIntermediate(config, name="intermediate") self.bert_output = TFElectraOutput(config, name="output") - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.bert_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs @@ -378,21 +411,21 @@ class TFElectraLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra class TFElectraEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.layer = [TFElectraLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -401,7 +434,11 @@ class TFElectraEncoder(tf.keras.layers.Layer): all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -420,27 +457,28 @@ class TFElectraEncoder(tf.keras.layers.Layer): ) -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra class TFElectraPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output +# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra class TFElectraEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -469,8 +507,15 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings.call with Albert->Electra - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -1097,7 +1142,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1215,7 +1260,7 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1356,13 +1401,14 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1460,7 +1506,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1575,7 +1621,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index 676b9b769a..f0e6f0a7b7 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -83,16 +83,16 @@ class TFFunnelWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -102,14 +102,14 @@ class TFFunnelWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -1436,7 +1436,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss) ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1526,7 +1526,7 @@ class TFFunnelForSequenceClassification(TFFunnelPreTrainedModel, TFSequenceClass ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1656,13 +1656,13 @@ class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss): } ] ) - def serving(self, inputs): - output = self.call(inputs) + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) - return self.serving_output(output) + return self.serving_output(output=output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1755,7 +1755,7 @@ class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificat ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1860,7 +1860,7 @@ class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringL ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 4f4e449c56..20480f083c 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -424,16 +424,16 @@ class TFLongformerWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -443,14 +443,14 @@ class TFLongformerWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -464,16 +464,16 @@ class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -483,15 +483,15 @@ class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -508,7 +508,7 @@ class TFLongformerPositionEmbeddings(tf.keras.layers.Layer): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -527,10 +527,10 @@ class TFLongformerPositionEmbeddings(tf.keras.layers.Layer): flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -638,8 +638,8 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) ) - mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) - incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -689,34 +689,34 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Longformer class TFLongformerIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Longformer class TFLongformerOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -729,7 +729,7 @@ class TFLongformerOutput(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -737,23 +737,23 @@ class TFLongformerOutput(tf.keras.layers.Layer): return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Longformer class TFLongformerPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 0493ef5cdd..eddc82bd1a 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -18,7 +18,7 @@ import warnings from dataclasses import dataclass -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -186,16 +186,16 @@ class TFLxmertWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -205,14 +205,14 @@ class TFLxmertWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -226,16 +226,16 @@ class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -245,15 +245,15 @@ class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -267,16 +267,16 @@ class TFLxmertPositionEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -286,8 +286,8 @@ class TFLxmertPositionEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -1132,11 +1132,13 @@ class TFLxmertPooler(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LxmertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): @@ -1146,17 +1148,17 @@ class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def call(self, hidden_states): - hidden_states = self.dense(hidden_states) + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert class TFLxmertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size @@ -1168,28 +1170,28 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) - def get_output_embeddings(self): + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings - def set_output_embeddings(self, value): + def set_output_embeddings(self, value: tf.Variable): self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] - def get_bias(self): + def get_bias(self) -> Dict[str, tf.Variable]: return {"bias": self.bias} - def set_bias(self, value): + def set_bias(self, value: tf.Variable): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.transform(hidden_states=hidden_states) - seq_length = shape_list(tensor=hidden_states)[1] + seq_length = shape_list(hidden_states)[1] hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) @@ -1200,13 +1202,13 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert class TFLxmertMLMHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions") - def call(self, sequence_output): - prediction_scores = self.predictions(sequence_output) + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 386dfbee6b..d38a4869f1 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -17,7 +17,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -116,16 +116,16 @@ class TFMobileBertWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -135,14 +135,14 @@ class TFMobileBertWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -156,16 +156,16 @@ class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -175,15 +175,15 @@ class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -197,16 +197,16 @@ class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -216,8 +216,8 @@ class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -1085,7 +1085,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel): return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1299,7 +1299,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1413,7 +1413,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel, TFNextS ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForNextSentencePrediction.serving_output - def serving_output(self, output): + def serving_output(self, output: TFNextSentencePredictorOutput) -> TFNextSentencePredictorOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1522,7 +1522,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1643,7 +1643,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1796,13 +1796,14 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1911,7 +1912,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 33f90cfcf9..67f1031d12 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -18,6 +18,7 @@ import math import warnings +from typing import Any, Dict import tensorflow as tf @@ -95,16 +96,16 @@ class TFMPNetWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -114,14 +115,14 @@ class TFMPNetWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -139,7 +140,7 @@ class TFMPNetPositionEmbeddings(tf.keras.layers.Layer): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -158,10 +159,10 @@ class TFMPNetPositionEmbeddings(tf.keras.layers.Layer): flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -207,8 +208,8 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) ) - mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) - incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -253,23 +254,23 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->MPNet class TFMPNetPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output @@ -291,28 +292,28 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="q", ) self.k = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="k", ) self.v = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="v", ) self.o = tf.keras.layers.experimental.EinsumDense( equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="o", ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) @@ -322,8 +323,8 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): k = self.k(hidden_states) v = self.v(hidden_states) - dk = tf.cast(x=self.attention_head_size, dtype=q.dtype) - q = tf.multiply(x=q, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=q.dtype) + q = tf.multiply(q, y=tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", k, q) # Apply relative position embedding (precomputed in MPNetEncoder) if provided. @@ -368,34 +369,34 @@ class TFMPNetAttention(tf.keras.layers.Layer): return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet class TFMPNetIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet class TFMPNetOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -408,7 +409,7 @@ class TFMPNetOutput(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -563,11 +564,11 @@ class TFMPNetMainLayer(tf.keras.layers.Layer): self.embeddings = TFMPNetEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self): + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings - def set_input_embeddings(self, value): + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] @@ -820,7 +821,7 @@ class TFMPNetModel(TFMPNetPreTrainedModel): return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -973,7 +974,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1095,7 +1096,7 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1233,7 +1234,7 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss): return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1333,7 +1334,7 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1446,7 +1447,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index 1c4729fc6c..419cbb4332 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -663,7 +663,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -965,7 +965,7 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 9580d6b02c..e34c984a37 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -16,7 +16,9 @@ """ TF 2.0 RoBERTa model. """ import warnings +from typing import Any, Dict, Optional, Tuple, Union +import numpy as np import tensorflow as tf from ...activations_tf import get_tf_activation @@ -37,6 +39,7 @@ from ...modeling_tf_outputs import ( ) from ...modeling_tf_utils import ( TFMaskedLanguageModelingLoss, + TFModelInputType, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -74,16 +77,16 @@ class TFRobertaWordEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -93,14 +96,14 @@ class TFRobertaWordEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -114,16 +117,16 @@ class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer): self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -133,15 +136,15 @@ class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -159,7 +162,7 @@ class TFRobertaPositionEmbeddings(tf.keras.layers.Layer): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -178,10 +181,10 @@ class TFRobertaPositionEmbeddings(tf.keras.layers.Layer): flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -235,8 +238,8 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) ) - mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) - incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -286,30 +289,30 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Roberta class TFRobertaPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta class TFRobertaSelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -324,50 +327,57 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -375,9 +385,9 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta class TFRobertaSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -393,13 +403,13 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -409,7 +419,7 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta class TFRobertaAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFRobertaSelfAttention(config, name="self") @@ -418,44 +428,57 @@ class TFRobertaAttention(tf.keras.layers.Layer): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta class TFRobertaIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta class TFRobertaOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -468,7 +491,7 @@ class TFRobertaOutput(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -478,20 +501,33 @@ class TFRobertaOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta class TFRobertaLayer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.attention = TFRobertaAttention(config, name="attention") self.intermediate = TFRobertaIntermediate(config, name="intermediate") self.bert_output = TFRobertaOutput(config, name="output") - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.bert_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs @@ -499,21 +535,21 @@ class TFRobertaLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta class TFRobertaEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.layer = [TFRobertaLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -522,7 +558,11 @@ class TFRobertaEncoder(tf.keras.layers.Layer): all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -560,11 +600,11 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): self.embeddings = TFRobertaEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self): + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings - def set_input_embeddings(self, value): + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] @@ -579,18 +619,18 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -610,23 +650,23 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif inputs["input_ids"] is not None: - input_shape = shape_list(inputs["input_ids"]) + input_shape = shape_list(tensor=inputs["input_ids"]) elif inputs["inputs_embeds"] is not None: - input_shape = shape_list(inputs["inputs_embeds"])[:-1] + input_shape = shape_list(tensor=inputs["inputs_embeds"])[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs["attention_mask"] is None: - inputs["attention_mask"] = tf.fill(input_shape, 1) + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) if inputs["token_type_ids"] is None: - inputs["token_type_ids"] = tf.fill(input_shape, 0) + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) embedding_output = self.embeddings( - inputs["input_ids"], - inputs["position_ids"], - inputs["token_type_ids"], - inputs["inputs_embeds"], + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], training=inputs["training"], ) @@ -642,8 +682,8 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -653,21 +693,20 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): if inputs["head_mask"] is not None: raise NotImplementedError else: - inputs["head_mask"] = [None] * self.num_hidden_layers - # head_mask = tf.constant([0] * self.num_hidden_layers) + inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - inputs["head_mask"], - inputs["output_attentions"], - inputs["output_hidden_states"], - inputs["return_dict"], + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], training=inputs["training"], ) sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None if not inputs["return_dict"]: return ( @@ -860,7 +899,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel): return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1016,7 +1055,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1139,7 +1178,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1283,7 +1322,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1386,7 +1425,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1501,7 +1540,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index e7812846bf..405b7c1fdc 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -19,7 +19,7 @@ import itertools import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple import numpy as np import tensorflow as tf @@ -1019,7 +1019,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1180,13 +1180,14 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1294,7 +1295,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1413,7 +1414,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 4aad02333f..1782123f4d 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -16,11 +16,10 @@ {% if cookiecutter.is_encoder_decoder_model == "False" %} +from typing import Any, Dict, Optional, Tuple, Union +import numpy as np import tensorflow as tf -from tensorflow.keras import layers - -from transformers.modeling_tf_outputs import TFCausalLMOutput from ...activations_tf import get_tf_activation from ...file_utils import ( @@ -32,6 +31,7 @@ from ...file_utils import ( from ...modeling_tf_outputs import ( TFBaseModelOutput, TFBaseModelOutputWithPooling, + TFCausalLMOutput, TFMaskedLMOutput, TFMultipleChoiceModelOutput, TFQuestionAnsweringModelOutput, @@ -41,6 +41,7 @@ from ...modeling_tf_outputs import ( from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFMaskedLanguageModelingLoss, + TFModelInputType, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -76,16 +77,16 @@ class TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(tf.keras.layers.Layer self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -95,14 +96,14 @@ class TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(tf.keras.layers.Layer return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -116,16 +117,16 @@ class TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(tf.keras.layers. self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -135,15 +136,15 @@ class TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(tf.keras.layers. return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -157,16 +158,16 @@ class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.L self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -176,8 +177,8 @@ class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.L return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -187,7 +188,7 @@ class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.L class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.word_embeddings = TF{{cookiecutter.camelcase_modelname}}WordEmbeddings( @@ -212,7 +213,14 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -222,18 +230,18 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = self.word_embeddings(input_ids) if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] + input_shape = shape_list(inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) + position_embeds = self.position_embeddings(inputs_embeds) else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_embeds = self.position_embeddings(position_ids) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -244,7 +252,7 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -259,50 +267,57 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -310,9 +325,9 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -328,13 +343,13 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -344,7 +359,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.self_attention = TF{{cookiecutter.camelcase_modelname}}SelfAttention(config, name="self") @@ -353,11 +368,24 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -365,23 +393,23 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -390,7 +418,7 @@ class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -403,7 +431,7 @@ class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -412,42 +440,52 @@ class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.attention = TF{{cookiecutter.camelcase_modelname}}Attention(config, name="attention") self.intermediate = TF{{cookiecutter.camelcase_modelname}}Intermediate(config, name="intermediate") - self.{{cookiecutter.lowercase_modelname}}_output = TF{{cookiecutter.camelcase_modelname}}Output(config, name="output") + self.bert_output = TF{{cookiecutter.camelcase_modelname}}Output(config, name="output") - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer.call with bert->{{cookiecutter.lowercase_modelname}} - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.{{cookiecutter.lowercase_modelname}}_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output(hidden_states=intermediate_output, input_tensor=attention_output, training=training) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.layer = [TF{{cookiecutter.camelcase_modelname}}Layer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -456,7 +494,11 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -477,11 +519,13 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): @@ -491,50 +535,50 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def call(self, hidden_states): - hidden_states = self.dense(hidden_states) + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size - + self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) - - def get_output_embeddings(self): + super().build(input_shape=input_shape) + + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings - def set_output_embeddings(self, value): + def set_output_embeddings(self, value: tf.Variable): self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] - def get_bias(self): + def get_bias(self) -> Dict[str, tf.Variable]: return {"bias": self.bias} - def set_bias(self, value): + def set_bias(self, value: tf.Variable): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.transform(hidden_states=hidden_states) - seq_length = shape_list(tensor=hidden_states)[1] + seq_length = shape_list(hidden_states)[1] hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) @@ -545,13 +589,13 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions") - def call(self, sequence_output): - prediction_scores = self.predictions(sequence_output) + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores @@ -560,47 +604,45 @@ class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): config_class = {{cookiecutter.camelcase_modelname}}Config - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, add_pooling_layer: bool = True, **kwargs): super().__init__(**kwargs) self.config = config - self.num_hidden_layers = config.num_hidden_layers - self.initializer_range = config.initializer_range - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.return_dict = config.use_return_dict + self.embeddings = TF{{cookiecutter.camelcase_modelname}}Embeddings(config, name="embeddings") self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder") - self.config = config - def get_input_embeddings(self): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings - def set_input_embeddings(self, value): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): - """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel """ raise NotImplementedError def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, **kwargs, - ): + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -616,7 +658,7 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): training=training, kwargs_call=kwargs, ) - + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif inputs["input_ids"] is not None: @@ -627,16 +669,16 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs["attention_mask"] is None: - inputs["attention_mask"] = tf.fill(input_shape, 1) + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) if inputs["token_type_ids"] is None: - inputs["token_type_ids"] = tf.fill(input_shape, 0) + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) embedding_output = self.embeddings( - inputs["input_ids"], - inputs["position_ids"], - inputs["token_type_ids"], - inputs["inputs_embeds"], + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], training=inputs["training"], ) @@ -652,8 +694,8 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -663,21 +705,21 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): if inputs["head_mask"] is not None: raise NotImplementedError else: - inputs["head_mask"] = [None] * self.num_hidden_layers + inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - inputs["head_mask"], - inputs["output_attentions"], - inputs["output_hidden_states"], - inputs["return_dict"], + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], training=inputs["training"], ) sequence_output = encoder_outputs[0] - if not return_dict: + if not inputs["return_dict"]: return ( sequence_output, ) + encoder_outputs[1:] @@ -736,43 +778,41 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): {{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r""" Args: - input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`): + input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. - See :func:`transformers.PreTrainedTokenizer.__call__` and - :func:`transformers.PreTrainedTokenizer.encode` for details. + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for + details. `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: + attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - - 0 for tokens that are **maked**. + - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: + token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: - 0 corresponds to a `sentence A` token, - 1 corresponds to a `sentence B` token. `What are token type IDs? <../glossary.html#token-type-ids>`__ - position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. + position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ - head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: + head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. @@ -798,7 +838,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") @@ -812,18 +852,18 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -855,17 +895,19 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod return outputs # Copied from transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutput) -> TFBaseModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None - return TFBaseModelOutput(last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns) + return TFBaseModelOutput( + last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns, + ) @add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING) class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMaskedLanguageModelingLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if config.is_decoder: @@ -875,9 +917,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, inputs_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -889,25 +931,24 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ inputs = input_processing( func=self.call, @@ -926,7 +967,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -937,13 +978,14 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca return_dict=inputs["return_dict"], training=inputs["training"], ) - sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=inputs["training"]) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], prediction_scores) + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + loss = ( + None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores) + ) if not inputs["return_dict"]: - output = (prediction_scores,) + outputs[1:] + output = (prediction_scores,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TFMaskedLMOutput( @@ -954,7 +996,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -965,16 +1007,16 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ) class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFCausalLanguageModelingLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if not config.is_decoder: logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, inputs_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @add_code_sample_docstrings( @@ -985,21 +1027,21 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. """ @@ -1020,7 +1062,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1032,17 +1074,17 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca training=inputs["training"], ) sequence_output = outputs[0] - logits = self.mlm(sequence_output, training=inputs["training"]) + logits = self.mlm(sequence_output=sequence_output, training=inputs["training"]) loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.compute_loss(labels, logits) + loss = self.compute_loss(labels=labels, logits=logits) if not inputs["return_dict"]: - output = (logits,) + outputs[1:] + output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TFCausalLMOutput( @@ -1053,37 +1095,41 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None return TFCausalLMOutput(logits=output.logits, hidden_states=hs, attentions=attns) + class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" - def __init__(self, config, **kwargs): - super().__init__(**kwargs) + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.out_proj = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config + if isinstance(config.hidden_act, str): + self.classifier_act_fn = get_tf_activation(config.hidden_act) + else: + self.classifier_act_fn = config.hidden_act - def call(self, inputs, **kwargs): - x = inputs[:, 0, :] # take token (equiv. to [CLS]) - x = self.dropout(x) - x = self.dense(x) - x = get_tf_activation(self.config.hidden_act)(x) - x = self.dropout(x) - x = self.out_proj(x) + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.classifier_act_fn(hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.out_proj(hidden_states) - return x + return hidden_states @add_start_docstrings( @@ -1092,9 +1138,11 @@ class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.L {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFSequenceClassificationLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.classifier = TF{{cookiecutter.camelcase_modelname}}ClassificationHead(config, name="classifier") @@ -1107,24 +1155,23 @@ class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookie ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ inputs = input_processing( @@ -1144,7 +1191,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookie kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1155,8 +1202,8 @@ class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookie return_dict=inputs["return_dict"], training=inputs["training"], ) - logits = self.classifier(outputs[0], training=inputs["training"]) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + logits = self.classifier(hidden_states=outputs[0], training=inputs["training"]) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[1:] @@ -1171,7 +1218,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookie ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1184,26 +1231,26 @@ class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookie {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMultipleChoiceLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.sequence_summary = TFSequenceSummary( - config, initializer_range=config.initializer_range, name="sequence_summary" + config, config.initializer_range, name="sequence_summary" ) self.classifier = tf.keras.layers.Dense( - 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property - def dummy_inputs(self): + def dummy_inputs(self) -> Dict[str, tf.Tensor]: """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ - return {"input_ids": tf.convert_to_tensor(MULTIPLE_CHOICE_DUMMY_INPUTS)} + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1214,24 +1261,24 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices]`` where :obj:`num_choices` is the size of the second dimension - of the input tensors. (See :obj:`input_ids` above) + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) """ inputs = input_processing( func=self.call, @@ -1249,7 +1296,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c training=training, kwargs_call=kwargs, ) - + if inputs["input_ids"] is not None: num_choices = shape_list(inputs["input_ids"])[1] seq_length = shape_list(inputs["input_ids"])[2] @@ -1257,37 +1304,47 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c num_choices = shape_list(inputs["inputs_embeds"])[1] seq_length = shape_list(inputs["inputs_embeds"])[2] - flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None + flat_input_ids = ( + tf.reshape(tensor=inputs["input_ids"], shape=(-1, seq_length)) if inputs["input_ids"] is not None else None + ) flat_attention_mask = ( - tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None + tf.reshape(tensor=inputs["attention_mask"], shape=(-1, seq_length)) + if inputs["attention_mask"] is not None + else None ) flat_token_type_ids = ( - tf.reshape(inputs["token_type_ids"], (-1, seq_length)) if inputs["token_type_ids"] is not None else None + tf.reshape(tensor=inputs["token_type_ids"], shape=(-1, seq_length)) + if inputs["token_type_ids"] is not None + else None ) flat_position_ids = ( - tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None + tf.reshape(tensor=inputs["position_ids"], shape=(-1, seq_length)) + if inputs["position_ids"] is not None + else None ) flat_inputs_embeds = ( - tf.reshape(inputs["inputs_embeds"], (-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) + tf.reshape( + tensor=inputs["inputs_embeds"], shape=(-1, seq_length, shape_list(inputs["inputs_embeds"])[3]) + ) if inputs["inputs_embeds"] is not None else None ) outputs = self.{{cookiecutter.lowercase_modelname}}( - flat_input_ids, - flat_attention_mask, - flat_token_type_ids, - flat_position_ids, - inputs["head_mask"], - flat_inputs_embeds, - inputs["output_attentions"], - inputs["output_hidden_states"], + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=inputs["head_mask"], + inputs_embeds=flat_inputs_embeds, + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], return_dict=inputs["return_dict"], training=inputs["training"], ) - logits = self.sequence_summary(outputs[0], training=inputs["training"]) - logits = self.classifier(logits) - reshaped_logits = tf.reshape(logits, (-1, num_choices)) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], reshaped_logits) + logits = self.sequence_summary(inputs=outputs[0], training=inputs["training"]) + logits = self.classifier(inputs=logits) + reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices)) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=reshaped_logits) if not inputs["return_dict"]: output = (reshaped_logits,) + outputs[1:] @@ -1306,13 +1363,14 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), }]) - def serving(self, inputs): - output = self.call(inputs) - + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) + return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1326,14 +1384,15 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c ) class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFTokenClassificationLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1345,23 +1404,23 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. """ inputs = input_processing( func=self.call, @@ -1380,7 +1439,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1392,9 +1451,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut training=inputs["training"], ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=inputs["training"]) - logits = self.classifier(sequence_output) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + sequence_output = self.dropout(inputs=sequence_output, training=inputs["training"]) + logits = self.classifier(inputs=sequence_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[1:] @@ -1408,7 +1467,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1422,13 +1481,14 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut ) class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFQuestionAnsweringLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.qa_outputs = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1440,29 +1500,29 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - start_positions=None, - end_positions=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + end_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: r""" - start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. """ inputs = input_processing( func=self.call, @@ -1482,7 +1542,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1494,19 +1554,19 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte training=inputs["training"], ) sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = tf.split(logits, 2, axis=-1) - start_logits = tf.squeeze(start_logits, axis=-1) - end_logits = tf.squeeze(end_logits, axis=-1) + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) loss = None if inputs["start_positions"] is not None and inputs["end_positions"] is not None: labels = {"start_position": inputs["start_positions"]} labels["end_position"] = inputs["end_positions"] - loss = self.compute_loss(labels, (start_logits, end_logits)) + loss = self.compute_loss(labels=labels, logits=(start_logits, end_logits)) if not inputs["return_dict"]: - output = (start_logits, end_logits) + outputs[1:] + output = (start_logits, end_logits) + outputs[2:] return ((loss,) + output) if loss is not None else output return TFQuestionAnsweringModelOutput( @@ -1518,10 +1578,10 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None - + return TFQuestionAnsweringModelOutput( start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns )