diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst index 6562631e0a..c78426d0c7 100644 --- a/docs/source/model_doc/albert.rst +++ b/docs/source/model_doc/albert.rst @@ -50,7 +50,10 @@ AlbertTokenizer Albert specific outputs ~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.modeling_albert.AlbertForPretrainingOutput +.. autoclass:: transformers.modeling_albert.AlbertForPreTrainingOutput + :members: + +.. autoclass:: transformers.modeling_tf_albert.TFAlbertForPreTrainingOutput :members: diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index 1666260f96..5e35b520d8 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -57,7 +57,10 @@ BertTokenizerFast Bert specific outputs ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.modeling_bert.BertForPretrainingOutput +.. autoclass:: transformers.modeling_bert.BertForPreTrainingOutput + :members: + +.. autoclass:: transformers.modeling_tf_bert.TFBertForPreTrainingOutput :members: diff --git a/docs/source/model_doc/electra.rst b/docs/source/model_doc/electra.rst index 895ca9dde8..993ed4d2b5 100644 --- a/docs/source/model_doc/electra.rst +++ b/docs/source/model_doc/electra.rst @@ -74,7 +74,10 @@ ElectraTokenizerFast Electra specific outputs ~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.modeling_electra.ElectraForPretrainingOutput +.. autoclass:: transformers.modeling_electra.ElectraForPreTrainingOutput + :members: + +.. autoclass:: transformers.modeling_tf_electra.TFElectraForPreTrainingOutput :members: @@ -106,6 +109,13 @@ ElectraForSequenceClassification :members: +ElectraForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraForMultipleChoice + :members: + + ElectraForTokenClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -141,6 +151,20 @@ TFElectraForMaskedLM :members: +TFElectraForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.TFElectraForSequenceClassification + :members: + + +TFElectraForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFElectraForMultipleChoice + :members: + + TFElectraForTokenClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst index 39c5fe269c..99772b30fd 100644 --- a/docs/source/model_doc/gpt.rst +++ b/docs/source/model_doc/gpt.rst @@ -77,6 +77,9 @@ OpenAI specific outputs .. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput :members: +.. autoclass:: transformers.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput + :members: + OpenAIGPTModel ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst index 3f1be1bb4c..b1ae24d98e 100644 --- a/docs/source/model_doc/gpt2.rst +++ b/docs/source/model_doc/gpt2.rst @@ -64,6 +64,9 @@ GPT2 specific outputs .. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput :members: +.. autoclass:: transformers.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput + :members: + GPT2Model ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/mobilebert.rst b/docs/source/model_doc/mobilebert.rst index ad3e0c206e..038adc6b66 100644 --- a/docs/source/model_doc/mobilebert.rst +++ b/docs/source/model_doc/mobilebert.rst @@ -59,7 +59,10 @@ MobileBertTokenizerFast MobileBert specific outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.modeling_mobilebert.MobileBertForPretrainingOutput +.. autoclass:: transformers.modeling_mobilebert.MobileBertForPreTrainingOutput + :members: + +.. 
autoclass:: transformers.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput :members: diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst index dc1a637836..c9c9807a0d 100644 --- a/docs/source/model_doc/transformerxl.rst +++ b/docs/source/model_doc/transformerxl.rst @@ -63,6 +63,12 @@ TransfoXL specific outputs .. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput :members: +.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLModelOutput + :members: + +.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput + :members: + TransfoXLModel ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst index bea5897591..d424aecc18 100644 --- a/docs/source/model_doc/xlnet.rst +++ b/docs/source/model_doc/xlnet.rst @@ -74,6 +74,24 @@ XLNet specific outputs .. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput :members: +.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetModelOutput + :members: + +.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetLMHeadModelOutput + :members: + +.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput + :members: + +.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput + :members: + +.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput + :members: + +.. 
autoclass:: transformers.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput + :members: + XLNetModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 5bdf1f792d..2f72469f1c 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -190,7 +190,7 @@ def add_end_docstrings(*docstr): return docstring_decorator -RETURN_INTRODUCTION = r""" +PT_RETURN_INTRODUCTION = r""" Returns: :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a @@ -200,6 +200,16 @@ RETURN_INTRODUCTION = r""" """ +TF_RETURN_INTRODUCTION = r""" + Returns: + :class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`: + A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a + tuple of :obj:`tf.Tensor` comprising various elements depending on the configuration + (:class:`~transformers.{config_class}`) and inputs. 
+ +""" + + def _get_indent(t): """Returns the indentation in the first line of t""" search = re.search(r"^(\s*)\S", t) @@ -249,7 +259,8 @@ def _prepare_output_docstrings(output_type, config_class): # Add the return introduction full_output_type = f"{output_type.__module__}.{output_type.__name__}" - intro = RETURN_INTRODUCTION.format(full_output_type=full_output_type, config_class=config_class) + intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION + intro = intro.format(full_output_type=full_output_type, config_class=config_class) return intro + docstrings diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index cbd94ce47f..2f3427fce3 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -407,9 +407,9 @@ class AlbertPreTrainedModel(PreTrainedModel): @dataclass -class AlbertForPretrainingOutput(ModelOutput): +class AlbertForPreTrainingOutput(ModelOutput): """ - Output type of :class:`~transformers.AlbertForPretrainingModel`. + Output type of :class:`~transformers.AlbertForPreTraining`. 
Args: loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): @@ -643,7 +643,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel): return self.predictions.decoder @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=AlbertForPretrainingOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -728,7 +728,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel): output = (prediction_scores, sop_scores) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output - return AlbertForPretrainingOutput( + return AlbertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, sop_logits=sop_scores, diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 74c5acafbe..fb2a2a510e 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -586,9 +586,9 @@ class BertPreTrainedModel(PreTrainedModel): @dataclass -class BertForPretrainingOutput(ModelOutput): +class BertForPreTrainingOutput(ModelOutput): """ - Output type of :class:`~transformers.BertForPretrainingModel`. + Output type of :class:`~transformers.BertForPreTraining`. 
Args: loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): @@ -837,7 +837,7 @@ class BertForPreTraining(BertPreTrainedModel): return self.cls.predictions.decoder @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @replace_return_docstrings(output_type=BertForPretrainingOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -918,7 +918,7 @@ class BertForPreTraining(BertPreTrainedModel): output = (prediction_scores, seq_relationship_score) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output - return BertForPretrainingOutput( + return BertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index 5e4e1286d2..1eb58c1486 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -188,9 +188,9 @@ class ElectraPreTrainedModel(BertPreTrainedModel): @dataclass -class ElectraForPretrainingOutput(ModelOutput): +class ElectraForPreTrainingOutput(ModelOutput): """ - Output type of :class:`~transformers.ElectraForPretrainingModel`. + Output type of :class:`~transformers.ElectraForPreTraining`. 
Args: loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): @@ -496,7 +496,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel): self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=ElectraForPretrainingOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -562,7 +562,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel): output = (logits,) + discriminator_hidden_states[1:] return ((loss,) + output) if loss is not None else output - return ElectraForPretrainingOutput( + return ElectraForPreTrainingOutput( loss=loss, logits=logits, hidden_states=discriminator_hidden_states.hidden_states, @@ -850,7 +850,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): @add_start_docstrings( """ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - ELECTRA_INPUTS_DOCSTRING, + ELECTRA_START_DOCSTRING, ) class ElectraForMultipleChoice(ElectraPreTrainedModel): def __init__(self, config): diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index f0b01cfa61..5fbda94bd9 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -685,9 +685,9 @@ class MobileBertPreTrainedModel(PreTrainedModel): @dataclass -class MobileBertForPretrainingOutput(ModelOutput): +class MobileBertForPreTrainingOutput(ModelOutput): """ - Output type of :class:`~transformers.MobileBertForPretrainingModel`. + Output type of :class:`~transformers.MobileBertForPreTraining`. 
Args: loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): @@ -948,7 +948,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=MobileBertForPretrainingOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=MobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -1018,7 +1018,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): output = (prediction_scores, seq_relationship_score) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output - return MobileBertForPretrainingOutput( + return MobileBertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index 03a0827e1d..3b718f242f 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -973,7 +973,7 @@ class T5Model(T5PreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, ) - elif not return_dict and not isinstance(encoder_outputs, BaseModelOutput): + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 4159a6cb08..73a1e82527 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -17,17 +17,30 @@ import logging +from dataclasses import dataclass +from typing import Optional, Tuple import tensorflow as tf from .configuration_albert import AlbertConfig from .file_utils import ( 
MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable, + replace_return_docstrings, ) from .modeling_tf_bert import ACT2FN, TFBertSelfAttention +from .modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, @@ -44,6 +57,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "AlbertConfig" _TOKENIZER_FOR_DOC = "AlbertTokenizer" TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -414,12 +428,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer): for i in range(config.num_hidden_groups) ] - def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False): + def call( + self, + hidden_states, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=False, + ): hidden_states = self.embedding_hidden_mapping_in(hidden_states) - all_attentions = () - - if output_hidden_states: - all_hidden_states = (hidden_states,) + all_attentions = () if output_attentions else None + all_hidden_states = (hidden_states,) if output_hidden_states else None for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group @@ -444,14 +465,11 @@ class TFAlbertTransformer(tf.keras.layers.Layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) - if output_attentions: - outputs = outputs + (all_attentions,) - - # last-layer hidden state, (all hidden states), (all attentions) - return outputs + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, 
all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) class TFAlbertPreTrainedModel(TFPreTrainedModel): @@ -506,6 +524,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): self.num_hidden_layers = config.num_hidden_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.encoder = TFAlbertTransformer(config, name="encoder") @@ -543,6 +562,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -554,7 +574,8 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -564,12 +585,14 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 9, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -619,16 +642,52 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): head_mask, output_attentions, output_hidden_states, + return_dict, training=training, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output[:, 0]) - # add hidden_states and attentions if they are here - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] - # sequence_output, pooled_output, (hidden_states), (attentions) - return outputs + if not return_dict: + return (sequence_output, pooled_output,) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@dataclass +class TFAlbertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFAlbertForPreTraining`. + + Args: + prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the sentence order prediction (classification) head (scores of True/False + continuation before SoftMax). 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: tf.Tensor = None + sop_logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None ALBERT_START_DOCSTRING = r""" @@ -707,6 +766,11 @@ ALBERT_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -720,32 +784,13 @@ class TFAlbertModel(TFAlbertPreTrainedModel): self.albert = TFAlbertMainLayer(config, name="albert") @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="albert-base-v2", + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Albert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.albert(inputs, **kwargs) return outputs @@ -768,25 +813,10 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel): return self.albert.embeddings @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def call(self, inputs, **kwargs): r""" Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): - Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. Examples:: import tensorflow as tf @@ -797,13 +827,22 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel): outputs = model(input_ids) prediction_scores, sop_scores = outputs[:2] """ - + return_dict = kwargs.get("return_dict") + return_dict = return_dict if return_dict is not None else self.albert.return_dict outputs = self.albert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False)) - outputs = (prediction_scores, sop_scores) + outputs[2:] - return outputs + + if not return_dict: + return (prediction_scores, sop_scores) + outputs[2:] + + return TFAlbertForPreTrainingOutput( + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) class TFAlbertSOPHead(tf.keras.layers.Layer): @@ -833,7 +872,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) return self.albert.embeddings @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="albert-base-v2", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -844,6 +888,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) inputs_embeds=None, output_attentions=None, 
output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -853,27 +898,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.albert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -886,20 +916,22 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) sequence_output = outputs[0] prediction_scores = self.predictions(sequence_output, training=training) - # Add hidden states and attention if they are here - outputs = (prediction_scores,) + outputs[2:] + loss = None if labels is None else self.compute_loss(labels, prediction_scores) - if labels is not None: - loss = self.compute_loss(labels, prediction_scores) - outputs = (loss,) + outputs + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # prediction_scores, (hidden_states), (attentions) + return TFMaskedLMOutput( + loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -919,7 +951,12 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="albert-base-v2", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -930,6 +967,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass inputs_embeds=None, 
output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -939,27 +977,12 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.albert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -972,6 +995,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -980,13 +1004,15 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1006,7 +1032,12 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="albert-base-v2", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1017,6 +1048,7 @@ class 
TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1024,27 +1056,12 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.albert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1057,6 +1074,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1065,13 +1083,15 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1089,7 +1109,12 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="albert-base-v2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1100,6 +1125,7 @@ class 
TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -1113,30 +1139,13 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.albert.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[8] if len(inputs) > 8 else start_positions - end_positions = inputs[9] if len(inputs) > 9 else end_positions - if len(inputs) > 8: - inputs = inputs[:8] + start_positions = inputs[9] if len(inputs) > 9 else start_positions + end_positions = inputs[10] if len(inputs) > 10 else end_positions + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -1150,6 +1159,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1160,15 +1170,23 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + outputs[2:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1196,7 +1214,12 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, 
TFMultipleChoiceLoss): return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="albert-base-v2", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -1207,6 +1230,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1215,24 +1239,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -1243,8 +1249,9 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - labels = inputs[8] if len(inputs) > 8 else labels - assert len(inputs) <= 9, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + labels = inputs[9] if len(inputs) > 9 else labels + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -1254,10 +1261,12 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_attentions) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 9, "Too many inputs." + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.albert.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -1280,6 +1289,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1289,10 +1299,12 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + return TFMultipleChoiceModelOutput( + loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 4c76120f54..6768c6765c 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -17,6 +17,8 @@ import logging +from dataclasses import dataclass +from typing import Optional, Tuple import numpy as np import tensorflow as tf @@ -24,9 +26,22 @@ import tensorflow as tf from .configuration_bert import BertConfig from .file_utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable, + replace_return_docstrings, +) +from .modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFCausalLMOutput, + TFMaskedLMOutput, + 
TFMultipleChoiceModelOutput, + TFNextSentencePredictorOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, ) from .modeling_tf_utils import ( TFCausalLanguageModelingLoss, @@ -45,6 +60,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "BertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -389,9 +405,18 @@ class TFBertEncoder(tf.keras.layers.Layer): super().__init__(**kwargs) self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] - def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False): - all_hidden_states = () - all_attentions = () + def call( + self, + hidden_states, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None for i, layer_module in enumerate(self.layer): if output_hidden_states: @@ -409,15 +434,11 @@ class TFBertEncoder(tf.keras.layers.Layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) - - if output_hidden_states: - outputs = outputs + (all_hidden_states,) - - if output_attentions: - outputs = outputs + (all_attentions,) - - return outputs # outputs, (hidden states), (attentions) + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) class TFBertPooler(tf.keras.layers.Layer): @@ -517,6 +538,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): self.initializer_range = config.initializer_range self.output_attentions = config.output_attentions self.output_hidden_states = 
config.output_hidden_states + self.return_dict = config.use_return_dict self.embeddings = TFBertEmbeddings(config, name="embeddings") self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") @@ -545,6 +567,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -556,7 +579,8 @@ class TFBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -566,12 +590,14 @@ class TFBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 9, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -621,16 +647,22 @@ class TFBertMainLayer(tf.keras.layers.Layer): head_mask, output_attentions, output_hidden_states, + return_dict, training=training, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ - 1: - ] # add hidden_states and attentions if they are here - return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + if not return_dict: + return (sequence_output, pooled_output,) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) class TFBertPreTrainedModel(TFPreTrainedModel): @@ -642,6 +674,36 @@ class TFBertPreTrainedModel(TFPreTrainedModel): base_model_prefix = "bert" +@dataclass +class TFBertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFBertForPreTraining`. + + Args: + prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False + continuation before SoftMax). 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: tf.Tensor = None + seq_relationship_logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + BERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and @@ -712,6 +774,11 @@ BERT_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -725,32 +792,13 @@ class TFBertModel(TFBertPreTrainedModel): self.bert = TFBertMainLayer(config, name="bert") @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.bert(inputs, **kwargs) return outputs @@ -772,25 +820,10 @@ class TFBertForPreTraining(TFBertPreTrainedModel): return self.bert.embeddings @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def call(self, inputs, **kwargs): r""" Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. Examples:: @@ -804,17 +837,23 @@ class TFBertForPreTraining(TFBertPreTrainedModel): prediction_scores, seq_relationship_scores = outputs[:2] """ + return_dict = kwargs.get("return_dict") + return_dict = return_dict if return_dict is not None else self.bert.return_dict outputs = self.bert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) seq_relationship_score = self.nsp(pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[ - 2: - ] # add hidden states and attention if they are here + if not return_dict: + return (prediction_scores, seq_relationship_score) + outputs[2:] - return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) + return TFBertForPreTrainingOutput( + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) @add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING) @@ -832,7 +871,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): return self.bert.embeddings @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -843,6 +887,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -852,27 +897,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.bert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -885,19 +915,22 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) sequence_output = outputs[0] prediction_scores = self.mlm(sequence_output, training=training) - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, prediction_scores) - if labels is not None: - loss = self.compute_loss(labels, prediction_scores) - outputs = (loss,) + outputs + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), prediction_scores, (hidden_states), (attentions) + return TFMaskedLMOutput( + loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): @@ -911,7 +944,12 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): def get_output_embeddings(self): return 
self.bert.embeddings - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFCausalLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -922,6 +960,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -929,27 +968,12 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.bert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -962,21 +986,27 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) sequence_output = outputs[0] logits = self.mlm(sequence_output, training=training) - outputs = (logits,) + outputs[2:] # Add hidden states and attention if they are here + loss = None if labels is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = labels[:, 1:] loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs - return outputs # prediction_scores, (hidden_states), (attentions) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -990,23 +1020,10 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): self.nsp = TFBertNSPHead(config, name="nsp___cls") @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + 
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def call(self, inputs, **kwargs): r""" Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
Examples:: @@ -1023,14 +1040,19 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] assert logits[0][0] < logits[0][1] # the next sentence was random """ + return_dict = kwargs.get("return_dict") + return_dict = return_dict if return_dict is not None else self.bert.return_dict outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] seq_relationship_score = self.nsp(pooled_output) - outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + if not return_dict: + return (seq_relationship_score,) + outputs[2:] - return outputs # seq_relationship_score, (hidden_states), (attentions) + return TFNextSentencePredictorOutput( + logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1050,7 +1072,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1061,6 +1088,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1070,27 +1098,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.bert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1103,6 +1116,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1111,13 +1125,15 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1145,7 +1161,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFMultipleChoiceModelOutput, + 
config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -1156,6 +1177,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1164,24 +1186,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -1192,8 +1196,9 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - labels = inputs[8] if len(inputs) > 8 else labels - assert len(inputs) <= 9, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + labels = inputs[9] if len(inputs) > 9 else labels + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -1203,10 +1208,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 9, "Too many inputs." + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.bert.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -1233,19 +1240,23 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): flat_inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1265,7 +1276,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1276,6 +1292,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1283,27 +1300,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, 
TFTokenClassificationL labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.bert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1316,6 +1318,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1324,13 +1327,15 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1349,7 +1354,12 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="bert-base-cased", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1360,6 +1370,7 @@ class 
TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -1373,30 +1384,13 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.bert.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[8] if len(inputs) > 8 else start_positions - end_positions = inputs[9] if len(inputs) > 9 else end_positions - if len(inputs) > 8: - inputs = inputs[:8] + start_positions = inputs[9] if len(inputs) > 9 else start_positions + end_positions = inputs[10] if len(inputs) > 10 else end_positions + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -1410,6 +1404,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1420,12 +1415,20 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + outputs[2:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_camembert.py b/src/transformers/modeling_tf_camembert.py index 
1fefe7b3bb..e7a5a1d38e 100644 --- a/src/transformers/modeling_tf_camembert.py +++ b/src/transformers/modeling_tf_camembert.py @@ -62,8 +62,6 @@ CAMEMBERT_START_DOCSTRING = r""" config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. """ diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index caeec80f6d..760944099a 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -23,6 +23,7 @@ import tensorflow as tf from .configuration_ctrl import CTRLConfig from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable +from .modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast from .modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFPreTrainedModel, @@ -35,7 +36,8 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) -_TOKENIZER_FOR_DOC = "CtrlTokenizer" +_CONFIG_FOR_DOC = "CTRLConfig" +_TOKENIZER_FOR_DOC = "CTRLTokenizer" TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ctrl" @@ -207,6 +209,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.use_cache = config.use_cache + self.return_dict = config.use_return_dict self.d_model_size = config.n_embd self.num_layers = config.n_layer @@ -260,6 +263,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): use_cache=None, output_attentions=None, output_hidden_states=None, + 
return_dict=None, training=False, ): @@ -274,7 +278,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): use_cache = inputs[7] if len(inputs) > 7 else use_cache output_attentions = inputs[8] if len(inputs) > 8 else output_attentions output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states - assert len(inputs) <= 10, "Too many inputs." + return_dict = inputs[10] if len(inputs) > 10 else return_dict + assert len(inputs) <= 11, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") past = inputs.get("past", past) @@ -286,13 +291,15 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 10, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 11, "Too many inputs." else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states use_cache = use_cache if use_cache is not None else self.use_cache + return_dict = return_dict if return_dict is not None else self.return_dict # If using past key value states, only the last tokens # should be given as an input @@ -374,9 +381,9 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): hidden_states = self.dropout(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] - presents = () - all_hidden_states = () - all_attentions = [] + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None for i, (h, layer_past) in enumerate(zip(self.h, past)): if output_hidden_states: all_hidden_states = all_hidden_states + 
(tf.reshape(hidden_states, output_shape),) @@ -396,24 +403,27 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): presents = presents + (present,) if output_attentions: - all_attentions.append(outputs[2]) + all_attentions = all_attentions + (outputs[2],) hidden_states = self.layernorm(hidden_states) hidden_states = tf.reshape(hidden_states, output_shape) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) - if use_cache: - outputs = outputs + (presents,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) if output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) - outputs = outputs + (all_attentions,) - return outputs + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) class TFCTRLPreTrainedModel(TFPreTrainedModel): @@ -503,6 +513,11 @@ CTRL_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -516,29 +531,13 @@ class TFCTRLModel(TFCTRLPreTrainedModel): self.transformer = TFCTRLMainLayer(config, name="transformer") @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="ctrl", + output_type=TFBaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the last layer of the model. - past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -585,7 +584,12 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="ctrl", + output_type=TFCausalLMOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -598,6 +602,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): use_cache=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -605,31 +610,12 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[10] if len(inputs) > 10 else labels - if len(inputs) > 10: - inputs = inputs[:10] + labels = inputs[11] if len(inputs) > 11 else labels + if len(inputs) > 11: + inputs = inputs[:11] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -644,6 +630,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -651,12 +638,21 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): logits = self.lm_head(hidden_states) - outputs = (logits,) + transformer_outputs[1:] + loss = None if labels is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = labels[:, 1:] loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs - return outputs # lm_logits, presents, (all hidden_states), (attentions) + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index 892417627f..4a9484aa1f 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -29,6 +29,14 @@ from .file_utils import ( add_start_docstrings, add_start_docstrings_to_callable, ) +from .modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + 
TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, @@ -46,6 +54,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -359,7 +368,7 @@ class TFTransformer(tf.keras.layers.Layer): self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] - def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, training=False): + def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False): """ Parameters ---------- @@ -379,8 +388,8 @@ class TFTransformer(tf.keras.layers.Layer): Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ - all_hidden_states = () - all_attentions = () + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): @@ -401,12 +410,11 @@ class TFTransformer(tf.keras.layers.Layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) - outputs = (hidden_state,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) - if output_attentions: - outputs = outputs + (all_attentions,) - return outputs # last-layer hidden state, (all hidden states), (all attentions) + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions + ) @keras_serializable @@ -418,6 +426,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): self.num_hidden_layers = config.num_hidden_layers self.output_attentions = config.output_attentions self.output_hidden_states = 
config.output_hidden_states + self.return_dict = config.use_return_dict self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings self.transformer = TFTransformer(config, name="transformer") # Encoder @@ -440,6 +449,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -449,7 +459,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states - assert len(inputs) <= 6, "Too many inputs." + return_dict = inputs[6] if len(inputs) > 6 else return_dict + assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -457,12 +468,14 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 6, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 7, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -491,7 +504,13 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) tfmr_output = self.transformer( - embedding_output, attention_mask, head_mask, output_attentions, output_hidden_states, training=training + embedding_output, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=training, ) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) @@ -564,9 +583,13 @@ DISTILBERT_INPUTS_DOCSTRING = r""" training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -580,25 +603,13 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ outputs = self.distilbert(inputs, **kwargs) return outputs @@ -642,7 +653,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel return self.vocab_projector.input_embeddings @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -651,6 +667,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -660,27 +677,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.distilbert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[6] if len(inputs) > 6 else labels - if len(inputs) > 6: - inputs = inputs[:6] + labels = inputs[7] if len(inputs) > 7 else labels + if len(inputs) > 7: + inputs = inputs[:7] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -691,6 +693,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -700,13 +703,18 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) - outputs = (prediction_logits,) + distilbert_output[1:] + loss = None if labels is None else self.compute_loss(labels, prediction_logits) - if labels is not None: - loss = self.compute_loss(labels, prediction_logits) - outputs = (loss,) + outputs + if not return_dict: + output = (prediction_logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output - return outputs # logits, (hidden_states), (attentions) + return TFMaskedLMOutput( + loss=loss, + logits=prediction_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) @add_start_docstrings( @@ -732,7 +740,12 @@ class 
TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -741,6 +754,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -750,27 +764,12 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.distilbert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[6] if len(inputs) > 6 else labels - if len(inputs) > 6: - inputs = inputs[:6] + labels = inputs[7] if len(inputs) > 7 else labels + if len(inputs) > 7: + inputs = inputs[:7] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -781,6 +780,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -790,13 +790,18 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) - outputs = (logits,) + distilbert_output[1:] + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) @add_start_docstrings( @@ -816,7 +821,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ) 
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -825,6 +835,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -832,27 +843,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.distilbert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[6] if len(inputs) > 6 else labels - if len(inputs) > 6: - inputs = inputs[:6] + labels = inputs[7] if len(inputs) > 7 else labels + if len(inputs) > 7: + inputs = inputs[:7] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -863,6 +859,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -871,13 +868,15 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[1:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -911,7 +910,12 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic return {"input_ids": 
tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -920,6 +924,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -928,24 +933,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -954,8 +941,9 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states - labels = inputs[6] if len(inputs) > 6 else labels - assert len(inputs) <= 7, "Too many inputs." + return_dict = inputs[6] if len(inputs) > 6 else return_dict + labels = inputs[7] if len(inputs) > 7 else labels + assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -963,10 +951,12 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 7, "Too many inputs." + assert len(inputs) <= 8, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.distilbert.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -989,6 +979,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic flat_inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) @@ -997,13 +988,19 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + distilbert_output[1:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + if not return_dict: + output = (reshaped_logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) @add_start_docstrings( @@ -1023,7 +1020,12 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn self.dropout = tf.keras.layers.Dropout(config.qa_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1032,6 +1034,7 @@ class 
TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -1045,30 +1048,13 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.distilbert.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[6] if len(inputs) > 6 else start_positions - end_positions = inputs[7] if len(inputs) > 7 else end_positions - if len(inputs) > 6: - inputs = inputs[:6] + start_positions = inputs[7] if len(inputs) > 7 else start_positions + end_positions = inputs[8] if len(inputs) > 8 else end_positions + if len(inputs) > 7: + inputs = inputs[:7] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -1080,6 +1066,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1090,12 +1077,20 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + distilbert_output[1:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) diff --git a/src/transformers/modeling_tf_electra.py 
b/src/transformers/modeling_tf_electra.py index 5a7e366cd3..6269d02387 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -1,4 +1,6 @@ import logging +from dataclasses import dataclass +from typing import Optional, Tuple import tensorflow as tf @@ -6,11 +8,21 @@ from transformers import ElectraConfig from .file_utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable, + replace_return_docstrings, ) from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel +from .modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, @@ -27,8 +39,8 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) -_TOKENIZER_FOR_DOC = "ElectraTokenizer" _CONFIG_FOR_DOC = "ElectraConfig" +_TOKENIZER_FOR_DOC = "ElectraTokenizer" TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/electra-small-generator", @@ -254,6 +266,7 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -265,7 +278,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + assert len(inputs) <= 9, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -275,7 +289,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs @@ -283,6 +298,7 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -312,12 +328,41 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): head_mask, output_attentions, output_hidden_states, + return_dict, training=training, ) return hidden_states +@dataclass +class TFElectraForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFElectraForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``tf.Tensor`` of shape :obj:`(1,)`): + Total loss of the ELECTRA objective. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + ELECTRA_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and @@ -380,9 +425,13 @@ ELECTRA_INPUTS_DOCSTRING = r""" training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -400,25 +449,13 @@ class TFElectraModel(TFElectraPreTrainedModel): self.electra = TFElectraMainLayer(config, name="electra") @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/electra-small-discriminator", + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ outputs = self.electra(inputs, **kwargs) return outputs @@ -439,6 +476,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def call( self, input_ids, @@ -449,24 +487,11 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): r""" Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Prediction scores of the head (scores for each token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
Examples:: @@ -479,6 +504,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): outputs = model(input_ids) scores = outputs[0] """ + return_dict = return_dict if return_dict is not None else self.electra.config.return_dict discriminator_hidden_states = self.electra( input_ids, @@ -489,14 +515,20 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) discriminator_sequence_output = discriminator_hidden_states[0] logits = self.discriminator_predictions(discriminator_sequence_output) - output = (logits,) - output += discriminator_hidden_states[1:] - return output # (loss), scores, (hidden_states), (attentions) + if not return_dict: + return (logits,) + discriminator_hidden_states[1:] + + return TFElectraForPreTrainingOutput( + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) class TFElectraMaskedLMHead(tf.keras.layers.Layer): @@ -539,7 +571,12 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos return self.generator_lm_head @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/electra-small-generator", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, input_ids, @@ -550,6 +587,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -559,27 +597,12 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` 
are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.electra.config.return_dict if isinstance(input_ids, (tuple, list)): - labels = input_ids[8] if len(input_ids) > 8 else labels - if len(input_ids) > 8: - input_ids = input_ids[:8] + labels = input_ids[9] if len(input_ids) > 9 else labels + if len(input_ids) > 9: + input_ids = input_ids[:9] elif isinstance(input_ids, (dict, BatchEncoding)): labels = input_ids.pop("labels", labels) @@ -592,19 +615,25 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) generator_sequence_output = generator_hidden_states[0] prediction_scores = self.generator_predictions(generator_sequence_output, training=training) prediction_scores = self.generator_lm_head(prediction_scores, training=training) - output = (prediction_scores,) - output += generator_hidden_states[1:] - if labels is not None: - loss = self.compute_loss(labels, prediction_scores) - output = (loss,) + output + loss = None if labels is None else self.compute_loss(labels, prediction_scores) - return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) class TFElectraClassificationHead(tf.keras.layers.Layer): @@ -647,6 +676,7 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator", + output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def call( @@ -659,23 +689,25 @@ class 
TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.electra.config.return_dict + if isinstance(input_ids, (tuple, list)): + labels = input_ids[9] if len(input_ids) > 9 else labels + if len(input_ids) > 9: + input_ids = input_ids[:9] + elif isinstance(input_ids, (dict, BatchEncoding)): + labels = input_ids.pop("labels", labels) + outputs = self.electra( input_ids, attention_mask, @@ -685,16 +717,20 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) logits = self.classifier(outputs[0]) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, logits) - return outputs # (loss), logits, (hidden_states), (attentions) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -724,7 +760,12 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/electra-small-discriminator", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -735,6 +776,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds=None, 
output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -743,24 +785,6 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -771,8 +795,9 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - labels = inputs[8] if len(inputs) > 8 else labels - assert len(inputs) <= 9, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + labels = inputs[9] if len(inputs) > 9 else labels + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -782,10 +807,12 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 9, "Too many inputs." + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.electra.config.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -812,18 +839,22 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) flat_inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) logits = self.sequence_summary(outputs[0]) logits = self.classifier(logits) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -843,7 +874,12 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/electra-small-discriminator", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -854,6 +890,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -861,27 +898,12 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, 
TFTokenClassific labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.electra.config.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -894,19 +916,25 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) - outputs = (logits,) + discriminator_hidden_states[1:] + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), scores, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) @add_start_docstrings( @@ -925,7 +953,12 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/electra-small-discriminator", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -936,6 +969,7 @@ class 
TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -949,30 +983,13 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.electra.config.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[8] if len(inputs) > 8 else start_positions - end_positions = inputs[9] if len(inputs) > 9 else end_positions - if len(inputs) > 8: - inputs = inputs[:8] + start_positions = inputs[9] if len(inputs) > 9 else start_positions + end_positions = inputs[10] if len(inputs) > 10 else end_positions + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -986,6 +1003,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) discriminator_sequence_output = discriminator_hidden_states[0] @@ -995,12 +1013,20 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + discriminator_hidden_states[1:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) diff --git 
a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py index 77aa4f249f..a9dae60e8e 100644 --- a/src/transformers/modeling_tf_flaubert.py +++ b/src/transformers/modeling_tf_flaubert.py @@ -22,6 +22,7 @@ import tensorflow as tf from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings +from .modeling_tf_outputs import TFBaseModelOutput from .modeling_tf_utils import keras_serializable, shape_list from .modeling_tf_xlm import ( TFXLMForMultipleChoice, @@ -103,6 +104,11 @@ FLAUBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -126,6 +132,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer): self.pre_norm = getattr(config, "pre_norm", False) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict def call( self, @@ -140,6 +147,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): # removed: src_enc=None, src_len=None @@ -155,7 +163,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states - assert len(inputs) <= 11, "Too many inputs." + return_dict = inputs[11] if len(inputs) > 11 else return_dict + assert len(inputs) <= 12, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -168,12 +177,14 @@ class TFFlaubertMainLayer(TFXLMMainLayer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 11, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 12, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -260,8 +271,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer): tensor = tensor * mask[..., tf.newaxis] # transformer layers - hidden_states = () - attentions = () + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None for i in range(self.n_layers): # LayerDrop dropout_probability = random.uniform(0, 1) @@ -321,12 +332,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer): # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) - outputs = (tensor,) - if output_hidden_states: - outputs = outputs + (hidden_states,) - if output_attentions: - outputs = outputs + (attentions,) - return outputs # outputs, (hidden_states), (attentions) + if not return_dict: + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) @add_start_docstrings( diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index 8adaafb35e..5221ef46ce 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -17,12 +17,21 @@ import logging +from dataclasses import dataclass +from typing import List, Optional, Tuple import numpy as np import tensorflow as tf from .configuration_gpt2 import GPT2Config -from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable +from .file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, 
+ add_start_docstrings_to_callable, + replace_return_docstrings, +) +from .modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast from .modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFConv1D, @@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "GPT2Config" _TOKENIZER_FOR_DOC = "GPT2Tokenizer" TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -214,12 +224,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.use_cache = config.use_cache + self.return_dict = config.use_return_dict self.num_hidden_layers = config.n_layer self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.output_hidden_states = self.output_hidden_states - self.output_attentions = self.output_attentions self.wte = TFSharedEmbeddings( config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" @@ -259,6 +268,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): use_cache=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -272,7 +282,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): use_cache = inputs[7] if len(inputs) > 7 else use_cache output_attentions = inputs[8] if len(inputs) > 8 else output_attentions output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states - assert len(inputs) <= 10, "Too many inputs." + return_dict = inputs[10] if len(inputs) > 10 else return_dict + assert len(inputs) <= 11, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") past = inputs.get("past", past) @@ -284,13 +295,15 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 10, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 11, "Too many inputs." else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states use_cache = use_cache if use_cache is not None else self.use_cache + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -355,9 +368,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): output_shape = input_shape + [shape_list(hidden_states)[-1]] - presents = () - all_attentions = [] - all_hidden_states = () + presents = () if use_cache else None + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past)): if output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) @@ -373,10 +386,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ) hidden_states, present = outputs[:2] - presents = presents + (present,) + if use_cache: + presents = presents + (present,) if output_attentions: - all_attentions.append(outputs[2]) + all_attentions = all_attentions + (outputs[2],) hidden_states = self.ln_f(hidden_states) @@ -385,18 +399,20 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): if output_hidden_states: 
all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (presents,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) if output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) - outputs = outputs + (all_attentions,) - return outputs # last hidden state, presents, (all hidden_states), (attentions) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) class TFGPT2PreTrainedModel(TFPreTrainedModel): @@ -408,6 +424,42 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel): base_model_prefix = "transformer" +@dataclass +class TFGPT2DoubleHeadsModelOutput(ModelOutput): + """ + Base class for outputs of models with a language modeling head and a multiple choice classification head. + + Args: + lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + ``past_key_values`` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + lm_logits: tf.Tensor = None + mc_logits: tf.Tensor = None + past_key_values: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + GPT2_START_DOCSTRING = r""" .. note:: @@ -482,6 +534,11 @@ GPT2_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -495,29 +552,13 @@ class TFGPT2Model(TFGPT2PreTrainedModel): self.transformer = TFGPT2MainLayer(config, name="transformer") @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="gpt2", + output_type=TFBaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the last layer of the model. - past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -543,7 +584,12 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="gpt2", + output_type=TFCausalLMOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -556,6 +602,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): use_cache=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -563,31 +610,12 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[10] if len(inputs) > 10 else labels - if len(inputs) > 10: - inputs = inputs[:10] + labels = inputs[11] if len(inputs) > 11 else labels + if len(inputs) > 11: + inputs = inputs[:11] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -602,6 +630,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -609,15 +638,24 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): logits = self.transformer.wte(hidden_states, mode="linear") - outputs = (logits,) + transformer_outputs[1:] + loss = None if labels is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = labels[:, 1:] loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs - return outputs # lm_logits, presents, (all hidden_states), (attentions) + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -641,6 +679,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): return self.transformer.wte @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) def call( self, inputs, @@ -654,6 +693,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): use_cache=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): r""" @@ 
-662,26 +702,6 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): Selected in the range ``[0, input_ids.size(-1) - 1[``. Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: - lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): - Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). - past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as `input_ids` as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
Examples:: @@ -717,8 +737,10 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids use_cache = inputs[8] if len(inputs) > 8 else use_cache - output_attentions = inputs[9] if len(inputs) > 8 else output_attentions - assert len(inputs) <= 10, "Too many inputs." + output_attentions = inputs[9] if len(inputs) > 9 else output_attentions + output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states + return_dict = inputs[11] if len(inputs) > 11 else return_dict + assert len(inputs) <= 12, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") past = inputs.get("past", past) @@ -730,9 +752,12 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) - assert len(inputs) <= 10, "Too many inputs." + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 12, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if input_ids is not None: input_shapes = shape_list(input_ids) @@ -755,6 +780,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): use_cache, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) hidden_states = transformer_outputs[0] @@ -762,6 +788,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): lm_logits = self.transformer.wte(hidden_states, mode="linear") mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training) mc_logits = tf.squeeze(mc_logits, axis=-1) - outputs = (lm_logits, mc_logits) + transformer_outputs[1:] - return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions) + if not return_dict: + return (lm_logits, mc_logits) + transformer_outputs[1:] + + return TFGPT2DoubleHeadsModelOutput( + lm_logits=lm_logits, + mc_logits=mc_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py index 5cee9e764b..fd748c30ca 100644 --- a/src/transformers/modeling_tf_mobilebert.py +++ b/src/transformers/modeling_tf_mobilebert.py @@ -17,17 +17,31 @@ import logging +from dataclasses import dataclass +from typing import Optional, Tuple import tensorflow as tf from . 
import MobileBertConfig from .file_utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable, + replace_return_docstrings, ) from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish +from .modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFNextSentencePredictorOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, @@ -44,6 +58,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "MobileBertConfig" _TOKENIZER_FOR_DOC = "MobileBertTokenizer" TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -541,9 +556,18 @@ class TFMobileBertEncoder(tf.keras.layers.Layer): self.output_hidden_states = config.output_hidden_states self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] - def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False): - all_hidden_states = () - all_attentions = () + def call( + self, + hidden_states, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -561,12 +585,11 @@ class TFMobileBertEncoder(tf.keras.layers.Layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) - if output_attentions: - outputs = outputs + (all_attentions,) - return outputs # 
outputs, (hidden states), (attentions) + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) class TFMobileBertPooler(tf.keras.layers.Layer): @@ -660,6 +683,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): self.num_hidden_layers = config.num_hidden_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict self.embeddings = TFMobileBertEmbeddings(config, name="embeddings") self.encoder = TFMobileBertEncoder(config, name="encoder") @@ -688,6 +712,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -699,7 +724,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -709,12 +735,14 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 9, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -763,16 +791,22 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): head_mask, output_attentions, output_hidden_states, + return_dict, training=training, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ - 1: - ] # add hidden_states and attentions if they are here - return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + if not return_dict: + return (sequence_output, pooled_output,) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) class TFMobileBertPreTrainedModel(TFPreTrainedModel): @@ -784,6 +818,37 @@ class TFMobileBertPreTrainedModel(TFPreTrainedModel): base_model_prefix = "mobilebert" +@dataclass +class TFMobileBertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFMobileBertForPreTraining`. + + Args: + prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False + continuation before SoftMax).
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + prediction_logits: tf.Tensor = None + seq_relationship_logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + MOBILEBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and @@ -852,6 +917,13 @@ MOBILEBERT_INPUTS_DOCSTRING = r""" training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. 
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -865,32 +937,13 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel): self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert") @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/mobilebert-uncased", + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during the original Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.mobilebert(inputs, **kwargs) return outputs @@ -911,25 +964,10 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): return self.mobilebert.embeddings @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def call(self, inputs, **kwargs): r""" Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. Examples:: @@ -943,16 +981,23 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): >>> prediction_scores, seq_relationship_scores = outputs[:2] """ + return_dict = kwargs.get("return_dict") + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict outputs = self.mobilebert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.predictions(sequence_output) seq_relationship_score = self.seq_relationship(pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[ - 2: - ] # add hidden states and attention if they are here - return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) + if not return_dict: + return (prediction_scores, seq_relationship_score) + outputs[2:] + + return TFMobileBertForPreTrainingOutput( + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. 
""", MOBILEBERT_START_DOCSTRING) @@ -967,7 +1012,12 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel return self.mobilebert.embeddings @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/mobilebert-uncased", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -978,6 +1028,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -986,27 +1037,12 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1019,18 +1055,22 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) sequence_output = outputs[0] prediction_scores = self.mlm(sequence_output, training=training) - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, prediction_scores) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, prediction_scores) - return outputs # (loss), prediction_scores, (hidden_states), (attentions) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): @@ -1055,23 +1095,10 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): self.cls = TFMobileBertOnlyNSPHead(config, 
name="seq_relationship___cls") @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def call(self, inputs, **kwargs): r""" Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
Examples:: @@ -1087,14 +1114,19 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] """ + return_dict = kwargs.get("return_dict") + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict outputs = self.mobilebert(inputs, **kwargs) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) - outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + if not return_dict: + return (seq_relationship_score,) + outputs[2:] - return outputs # seq_relationship_score, (hidden_states), (attentions) + return TFNextSentencePredictorOutput( + logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1114,7 +1146,12 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque ) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/mobilebert-uncased", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1125,6 +1162,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1134,27 +1172,12 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1167,6 +1190,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1175,13 +1199,15 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1200,7 +1226,12 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn ) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/mobilebert-uncased", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1211,6 +1242,7 @@ class 
TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -1224,30 +1256,13 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[8] if len(inputs) > 8 else start_positions - end_positions = inputs[9] if len(inputs) > 9 else end_positions - if len(inputs) > 8: - inputs = inputs[:8] + start_positions = inputs[9] if len(inputs) > 9 else start_positions + end_positions = inputs[10] if len(inputs) > 10 else end_positions + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -1261,6 +1276,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1271,15 +1287,23 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + outputs[2:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1307,7 +1331,12 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, 
TFMultipleChoic return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/mobilebert-uncased", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -1318,6 +1347,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1326,24 +1356,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -1354,8 +1366,9 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - labels = inputs[8] if len(inputs) > 8 else labels - assert len(inputs) <= 9, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + labels = inputs[9] if len(inputs) > 9 else labels + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -1365,10 +1378,12 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 9, "Too many inputs." + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -1395,19 +1410,23 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic flat_inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -1427,7 +1446,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla ) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/mobilebert-uncased", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1438,6 +1462,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1445,27 +1470,12 @@ class 
TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1478,6 +1488,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1486,10 +1497,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index d5174f142a..e37478ef51 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -17,12 +17,21 @@ import logging +from dataclasses import dataclass +from typing import Optional, Tuple import numpy as np import tensorflow as tf from .configuration_openai import OpenAIGPTConfig -from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable +from .file_utils 
import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_callable, + replace_return_docstrings, +) +from .modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput from .modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFConv1D, @@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "OpenAIGPTConfig" _TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer" TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -208,6 +218,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): super().__init__(*inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions + self.return_dict = config.use_return_dict self.num_hidden_layers = config.n_layer self.vocab_size = config.vocab_size self.n_embd = config.n_embd @@ -247,6 +258,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -258,7 +270,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + assert len(inputs) <= 9, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -268,12 +281,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 8, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -333,8 +348,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): output_shape = input_shape + [shape_list(hidden_states)[-1]] - all_attentions = [] - all_hidden_states = () + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None for i, block in enumerate(self.h): if output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) @@ -342,22 +357,24 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions, training=training) hidden_states = outputs[0] if output_attentions: - all_attentions.append(outputs[1]) + all_attentions = all_attentions + (outputs[1],) hidden_states = tf.reshape(hidden_states, output_shape) # Add last hidden state if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) - if output_hidden_states: - 
outputs = outputs + (all_hidden_states,) if output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) - outputs = outputs + (all_attentions,) - return outputs # last hidden state, (all hidden_states), (attentions) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions, + ) class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): @@ -369,6 +386,35 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): base_model_prefix = "transformer" +@dataclass +class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput): + """ + Base class for outputs of models with a language modeling head and a multiple choice classification head. + + Args: + lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + lm_logits: tf.Tensor = None + mc_logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + OPENAI_GPT_START_DOCSTRING = r""" .. note:: @@ -436,6 +482,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -449,25 +500,13 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="openai-gpt", + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -486,7 +525,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin return self.transformer.tokens_embed @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="openai-gpt", + output_type=TFCausalLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -497,6 +541,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -504,27 +549,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -537,21 +567,30 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) hidden_states = transformer_outputs[0] logits = self.transformer.tokens_embed(hidden_states, mode="linear") - outputs = (logits,) + transformer_outputs[1:] + loss = None if labels is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = labels[:, 1:] loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs - return outputs # lm_logits, (all hidden_states), (attentions) + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -575,6 +614,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): return self.transformer.tokens_embed @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) + 
@replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) def call( self, inputs, @@ -586,6 +626,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): mc_token_ids=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): r""" @@ -594,27 +635,6 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): Selected in the range ``[0, input_ids.size(-1) - 1]``. Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: - lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): - Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). - past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: @@ -646,7 +666,9 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids output_attentions = inputs[7] if len(inputs) > 7 else output_attentions - assert len(inputs) <= 8, "Too many inputs." + output_hidden_states = inputs[8] if len(inputs) > 8 else output_hidden_states + return_dict = inputs[9] if len(inputs) > 9 else return_dict + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -656,9 +678,12 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) output_attentions = inputs.get("output_attentions", output_attentions) - assert len(inputs) <= 8, "Too many inputs." + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if input_ids is not None: input_shapes = shape_list(input_ids) @@ -679,6 +704,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) hidden_states = transformer_outputs[0] @@ -686,6 +712,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training) mc_logits = tf.squeeze(mc_logits, axis=-1) - outputs = (lm_logits, mc_logits) + transformer_outputs[1:] - return outputs # lm logits, mc logits, (all hidden_states), (attentions) + if not return_dict: + return (lm_logits, mc_logits) + transformer_outputs[1:] + + return TFOpenAIGPTDoubleHeadsModelOutput( + lm_logits=lm_logits, + mc_logits=mc_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_outputs.py b/src/transformers/modeling_tf_outputs.py new file mode 100644 index 0000000000..8d61a17572 --- /dev/null +++ b/src/transformers/modeling_tf_outputs.py @@ -0,0 +1,555 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import tensorflow as tf + +from .file_utils import ModelOutput + + +@dataclass +class TFBaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFBaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during pretraining. + + This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: tf.Tensor = None + pooler_output: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFBaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. + past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + ``past_key_values`` input) to speed up sequential decoding. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: tf.Tensor = None + past_key_values: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSeq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If ``decoder_past_key_values`` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. + decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). 
+ + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see ``decoder_past_key_values`` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 
+ + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: tf.Tensor = None + decoder_past_key_values: Optional[List[tf.Tensor]] = None + decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_last_hidden_state: Optional[tf.Tensor] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFCausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + ``past_key_values`` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + past_key_values: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFMaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Masked language modeling (MLM) loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSeq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss. 
+ logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see ``decoder_past_key_values`` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
+ encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + decoder_past_key_values: Optional[List[tf.Tensor]] = None + decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_last_hidden_state: Optional[tf.Tensor] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFNextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSeq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see ``decoder_past_key_values`` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + decoder_past_key_values: Optional[List[tf.Tensor]] = None + decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_last_hidden_state: Optional[tf.Tensor] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFMultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFTokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[tf.Tensor] = None + start_logits: tf.Tensor = None + end_logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape + :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see ``decoder_past_key_values`` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 
+ + Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ """ + + loss: Optional[tf.Tensor] = None + start_logits: tf.Tensor = None + end_logits: tf.Tensor = None + decoder_past_key_values: Optional[List[tf.Tensor]] = None + decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_last_hidden_state: Optional[tf.Tensor] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index fbe1a4be58..3eb5e5a698 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -28,6 +28,14 @@ from .file_utils import ( add_start_docstrings_to_callable, ) from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu +from .modeling_tf_outputs import ( + TFBaseModelOutputWithPooling, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, @@ -44,6 +52,7 @@ from .tokenization_utils_base import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "RobertaConfig" _TOKENIZER_FOR_DOC = "RobertaTokenizer" TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -190,6 +199,11 @@ ROBERTA_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. 
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -203,32 +217,13 @@ class TFRobertaModel(TFRobertaPreTrainedModel): self.roberta = TFRobertaMainLayer(config, name="roberta") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.roberta(inputs, **kwargs) return outputs @@ -276,7 +271,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos return self.lm_head.decoder @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -287,6 +287,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -296,27 +297,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.roberta.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -329,6 +315,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -337,13 +324,15 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, prediction_scores) - if labels is not None: - loss = self.compute_loss(labels, prediction_scores) - outputs = (loss,) + outputs + if not return_dict: + output = (prediction_scores,) + 
outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), prediction_scores, (hidden_states), (attentions) + return TFMaskedLMOutput( + loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) class TFRobertaClassificationHead(tf.keras.layers.Layer): @@ -385,7 +374,12 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla self.classifier = TFRobertaClassificationHead(config, name="classifier") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -396,30 +390,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ + return_dict = return_dict if return_dict is not None else self.roberta.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -432,19 +418,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) sequence_output = outputs[0] logits = self.classifier(sequence_output, training=training) - outputs = (logits,) + outputs[2:] + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, logits=logits, 
hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -472,7 +461,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -483,6 +477,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -491,24 +486,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -519,8 +496,9 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - labels = inputs[8] if len(inputs) > 8 else labels - assert len(inputs) <= 9, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + labels = inputs[9] if len(inputs) > 9 else labels + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -530,10 +508,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) - output_hidden_states = inputs.get("output_hidden_states", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 9, "Too many inputs." + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.roberta.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -555,19 +535,23 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -587,7 +571,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -598,6 +587,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -605,27 +595,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, 
TFTokenClassific labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.roberta.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -638,6 +613,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -646,13 +622,15 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -670,7 +648,12 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -681,6 +664,7 @@ class 
TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -694,30 +678,13 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.roberta.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[8] if len(inputs) > 8 else start_positions - end_positions = inputs[9] if len(inputs) > 9 else end_positions - if len(inputs) > 8: - inputs = inputs[:8] + start_positions = inputs[9] if len(inputs) > 9 else start_positions + end_positions = inputs[10] if len(inputs) > 10 else end_positions + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) - end_positions = inputs.pop("end_positions", start_positions) + end_positions = inputs.pop("end_positions", end_positions) @@ -731,6 +698,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -741,12 +709,20 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + outputs[2:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_t5.py 
b/src/transformers/modeling_tf_t5.py index 9858b8ae76..819240ee78 100644 
--- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -25,7 +25,14 @@ import warnings import tensorflow as tf from .configuration_t5 import T5Config -from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable +from .file_utils import ( + DUMMY_INPUTS, + DUMMY_MASK, + add_start_docstrings, + add_start_docstrings_to_callable, + replace_return_docstrings, +) +from .modeling_tf_outputs import TFSeq2SeqLMOutput, TFSeq2SeqModelOutput from .modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFPreTrainedModel, @@ -39,6 +46,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "T5Config" _TOKENIZER_FOR_DOC = "T5Tokenizer" TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -575,8 +583,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): head_mask = inputs[5] if len(inputs) > 5 else head_mask past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states use_cache = inputs[7] if len(inputs) > 7 else use_cache - output_attentions = inputs[8] if len(inputs) > 7 else output_attentions - output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states + output_attentions = inputs[8] if len(inputs) > 8 else output_attentions + output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states assert len(inputs) <= 10, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") @@ -934,6 +942,7 @@ class TFT5Model(TFT5PreTrainedModel): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) def call( self, inputs, @@ -948,29 +957,11 @@ class TFT5Model(TFT5PreTrainedModel): use_cache=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): r""" Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. - decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): - Contains pre-computed key and value hidden-states of the attention blocks. - Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). - Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. Examples:: @@ -996,7 +987,8 @@ class TFT5Model(TFT5PreTrainedModel): use_cache = inputs[9] if len(inputs) > 9 else use_cache output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states - assert len(inputs) <= 12, "Too many inputs." + return_dict = inputs[12] if len(inputs) > 12 else return_dict + assert len(inputs) <= 13, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): if "inputs" in inputs: warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") @@ -1013,11 +1005,13 @@ class TFT5Model(TFT5PreTrainedModel): use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 12, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 13, "Too many inputs." 
else: input_ids = inputs use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1063,12 +1057,40 @@ class TFT5Model(TFT5PreTrainedModel): ], training=training, ) + past = ( + (encoder_outputs, decoder_outputs[1]) if cast_bool_to_primitive(use_cache, self.config.use_cache) else None + ) + if not return_dict: + if past is not None: + decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:] + return decoder_outputs + encoder_outputs - if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: - past = ((encoder_outputs, decoder_outputs[1]),) - decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] + # If put before, this breaks the tf compilation. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) - return decoder_outputs + encoder_outputs + # This is long and annoying but if we introduce return_dict at the TFT5MainLayer level (like in PyTorch) + # TF refuses to compile anymore. 
+ if not cast_bool_to_primitive(use_cache, self.config.use_cache): + decoder_outputs = decoder_outputs[:1] + (None,) + decoder_outputs[1:] + if not cast_bool_to_primitive(output_hidden_states, self.config.output_hidden_states): + encoder_outputs = encoder_outputs[:1] + (None,) + encoder_outputs[1:] + decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:] + if not cast_bool_to_primitive(output_attentions, self.config.output_attentions): + encoder_outputs = encoder_outputs + (None,) + decoder_outputs = decoder_outputs + (None,) + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs[0], + decoder_past_key_values=past, + decoder_hidden_states=decoder_outputs[2], + decoder_attentions=decoder_outputs[3], + encoder_last_hidden_state=encoder_outputs[0], + encoder_hidden_states=encoder_outputs[1], + encoder_attentions=encoder_outputs[2], + ) @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) @@ -1115,6 +1137,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) def call( self, inputs, @@ -1129,6 +1152,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling use_cache=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1138,24 +1162,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling Indices should be in ``[0, ..., config.vocab_size - 1]``. Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): - Contains pre-computed key and value hidden-states of the attention blocks. - Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). - Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. Examples:: @@ -1186,8 +1192,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling use_cache = inputs[9] if len(inputs) > 9 else use_cache output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states - labels = inputs[12] if len(inputs) > 12 else labels - assert len(inputs) <= 13, "Too many inputs." + return_dict = inputs[12] if len(inputs) > 12 else return_dict + labels = inputs[13] if len(inputs) > 13 else labels + assert len(inputs) <= 14, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): if "inputs" in inputs: warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") @@ -1204,12 +1211,14 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 13, "Too many inputs." + assert len(inputs) <= 14, "Too many inputs." else: input_ids = inputs use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1261,22 +1270,48 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling training=training, ) - # insert decoder past at right place - # to speed up decoding - if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: - past = ((encoder_outputs, decoder_outputs[1]),) - decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] - sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) embed_tokens = self.get_output_embeddings() logits = embed_tokens(sequence_output, mode="linear") - decoder_outputs = (logits,) + decoder_outputs[1:] - if labels is not None: - loss = self.compute_loss(labels, logits) - decoder_outputs = (loss,) + decoder_outputs + loss = None if labels is None else self.compute_loss(labels, logits) - return decoder_outputs + encoder_outputs + past = ( + (encoder_outputs, decoder_outputs[1]) if cast_bool_to_primitive(use_cache, self.config.use_cache) else None + ) + if not return_dict: + if past is not None: + decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:] + 
output = (logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + # Putting this before breaks tf compilation. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # This is long and annoying but if we introduce return_dict at the TFT5MainLayer level (like in PyTorch) + # TF refuses to compile anymore. + if not cast_bool_to_primitive(use_cache, self.config.use_cache): + decoder_outputs = decoder_outputs[:1] + (None,) + decoder_outputs[1:] + if not cast_bool_to_primitive(output_hidden_states, self.config.output_hidden_states): + encoder_outputs = encoder_outputs[:1] + (None,) + encoder_outputs[1:] + decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:] + if not cast_bool_to_primitive(output_attentions, self.config.output_attentions): + encoder_outputs = encoder_outputs + (None,) + decoder_outputs = decoder_outputs + (None,) + + return TFSeq2SeqLMOutput( + loss=loss, + logits=logits, + decoder_past_key_values=past, + decoder_hidden_states=decoder_outputs[2], + decoder_attentions=decoder_outputs[3], + encoder_last_hidden_state=encoder_outputs[0], + encoder_hidden_states=encoder_outputs[1], + encoder_attentions=encoder_outputs[2], + ) def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index d1979174a3..abeef52ff1 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -18,11 +18,13 @@ import logging +from dataclasses import dataclass +from typing import List, Optional, Tuple import tensorflow as tf from .configuration_transfo_xl import TransfoXLConfig 
-from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable +from .file_utils import ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding @@ -30,6 +32,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "TransfoXLConfig" _TOKENIZER_FOR_DOC = "TransfoXLTokenizer" TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -388,6 +391,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): super().__init__(**kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions + self.return_dict = config.use_return_dict self.n_token = config.vocab_size @@ -525,6 +529,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -533,8 +538,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds output_attentions = inputs[4] if len(inputs) > 4 else output_attentions - output_hidden_states = inputs[5] if len(inputs) > 4 else output_hidden_states - assert len(inputs) <= 6, "Too many inputs." + output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states + return_dict = inputs[6] if len(inputs) > 6 else return_dict + assert len(inputs) <= 7, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") mems = inputs.get("mems", mems) @@ -542,12 +548,14 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 6, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] @@ -606,7 +614,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] hids = [] - attentions = [] + attentions = [] if output_attentions else None if self.attn_type == 0: # default pos_seq = tf.range(klen - 1, -1, -1.0) if self.clamp_len > 0: @@ -633,17 +641,24 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): new_mems = self._update_mems(hids, mems, mlen, qlen) # We transpose back here to shape [bsz, len, hidden_dim] - outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems] + core_out = tf.transpose(core_out, perm=(1, 0, 2)) + if output_hidden_states: # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] hids.append(core_out) - hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids) - outputs.append(hids) + hids = tuple(tf.transpose(t, perm=(1, 0, 2)) for t in hids) + else: + hids = None if output_attentions: # Transpose to library 
standard shape [bsz, n_heads, query_seq_len, key_seq_len] - attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) - outputs.append(attentions) - return outputs # last hidden state, new_mems, (all hidden states), (all attentions) + attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) + + if not return_dict: + return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) + + return TFTransfoXLModelOutput( + last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions, + ) class TFTransfoXLPreTrainedModel(TFPreTrainedModel): @@ -655,6 +670,70 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel): base_model_prefix = "transformer" +@dataclass +class TFTransfoXLModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: tf.Tensor = None + mems: List[tf.Tensor] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFTransfoXLLMHeadModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + losses (:obj:`tf.Tensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided) + Language modeling losses (not reduced). + prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax). + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_scores: tf.Tensor = None + mems: List[tf.Tensor] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + TRANSFO_XL_START_DOCSTRING = r""" .. note:: @@ -706,6 +785,11 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -719,29 +803,13 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): self.transformer = TFTransfoXLMainLayer(config, name="transformer") @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="transfo-xl-wt103", + output_type=TFTransfoXLModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the last layer of the model. - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -797,57 +865,47 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): return self.transformer.init_mems(bsz) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="transfo-xl-wt103", + output_type=TFTransfoXLLMHeadModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, mems=None, head_mask=None, inputs_embeds=None, - labels=None, output_attentions=None, output_hidden_states=None, + return_dict=None, + labels=None, training=False, ): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds - labels = inputs[4] if len(inputs) > 4 else labels - output_attentions = inputs[5] if len(inputs) > 5 else output_attentions - assert len(inputs) <= 6, "Too many inputs." + output_attentions = inputs[4] if len(inputs) > 4 else output_attentions + output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states + return_dict = inputs[6] if len(inputs) > 6 else return_dict + labels = inputs[7] if len(inputs) > 7 else labels + assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, (BatchEncoding, dict)): input_ids = inputs.get("input_ids") mems = inputs.get("mems", mems) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) - labels = inputs.get("labels", labels) output_attentions = inputs.get("output_attentions", output_attentions) - assert len(inputs) <= 6, "Too many inputs." + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) + labels = inputs.get("labels", labels) + assert len(inputs) <= 8, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if input_ids is not None: bsz, tgt_len = shape_list(input_ids)[:2] @@ -855,17 +913,30 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): bsz, tgt_len = shape_list(inputs_embeds)[:2] transformer_outputs = self.transformer( - input_ids, mems, head_mask, inputs_embeds, output_attentions, output_hidden_states, training=training + input_ids, + mems, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict, + training=training, ) last_hidden = transformer_outputs[0] pred_hid = last_hidden[:, -tgt_len:] - outputs = transformer_outputs[1:] softmax_output = self.crit(pred_hid, labels, training=training) - outputs = [softmax_output] + outputs - return outputs # logits, new_mems, (all hidden states), (all attentions) + if not return_dict: + return (softmax_output,) + transformer_outputs[1:] + + return TFTransfoXLLMHeadModelOutput( + prediction_scores=softmax_output, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) def prepare_inputs_for_generation(self, inputs, past, **model_kwargs): inputs = {"inputs": inputs} diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index e1eddcc57c..1cd0d7e580 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -20,6 +20,8 @@ import itertools import logging import math import warnings +from dataclasses import dataclass +from typing import Optional, Tuple import numpy as np import tensorflow as tf @@ -27,10 +29,18 @@ import tensorflow as tf from .configuration_xlm import XLMConfig from .file_utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable, ) +from .modeling_tf_outputs import ( + TFBaseModelOutput, + TFMultipleChoiceModelOutput, + 
TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMultipleChoiceLoss, TFPreTrainedModel, @@ -48,6 +58,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "XLMConfig" _TOKENIZER_FOR_DOC = "XLMTokenizer" TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -224,6 +235,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): super().__init__(**kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions + self.return_dict = config.use_return_dict # encoder / decoder, output layer self.is_encoder = config.is_encoder @@ -340,6 +352,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): @@ -354,7 +367,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states - assert len(inputs) <= 11, "Too many inputs." + return_dict = inputs[11] if len(inputs) > 11 else return_dict + assert len(inputs) <= 12, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -367,12 +381,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 11, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 12, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -454,8 +470,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): tensor = tensor * mask[..., tf.newaxis] # transformer layers - hidden_states = () - attentions = () + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None for i in range(self.n_layers): if output_hidden_states: hidden_states = hidden_states + (tensor,) @@ -494,12 +510,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) - outputs = (tensor,) - if output_hidden_states: - outputs = outputs + (hidden_states,) - if output_attentions: - outputs = outputs + (attentions,) - return outputs # outputs, (hidden_states), (attentions) + if not return_dict: + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) class TFXLMPreTrainedModel(TFPreTrainedModel): @@ -522,6 +535,33 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} +# Remove when XLMWithLMHead computes loss like other LM models +@dataclass +class TFXLMWithLMHeadModelOutput(ModelOutput): + """ + Base class for :class:`~transformers.TFXLMWithLMHeadModel` outputs. + + Args: + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + XLM_START_DOCSTRING = r""" .. note:: @@ -603,6 +643,11 @@ XLM_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -616,25 +661,13 @@ class TFXLMModel(TFXLMPreTrainedModel): self.transformer = TFXLMMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -701,32 +734,26 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): return {"inputs": inputs, "langs": langs} @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=TFXLMWithLMHeadModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ + return_dict = kwargs.get("return_dict") + return_dict = return_dict if return_dict is not None else self.transformer.return_dict transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] outputs = self.pred_layer(output) - outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here - return outputs + if not return_dict: + return (outputs,) + transformer_outputs[1:] + + return TFXLMWithLMHeadModelOutput( + logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions + ) @add_start_docstrings( @@ -743,7 +770,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -757,6 +789,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -766,27 +799,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 
- - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: - logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[11] if len(inputs) > 11 else labels - if len(inputs) > 11: - inputs = inputs[:11] + labels = inputs[12] if len(inputs) > 12 else labels + if len(inputs) > 12: + inputs = inputs[:12] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -802,19 +820,25 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) output = transformer_outputs[0] logits = self.sequence_summary(output) - outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -845,7 +869,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): } @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -859,6 +888,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds=None, 
output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -867,24 +897,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -898,8 +910,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states - labels = inputs[11] if len(inputs) > 11 else labels - assert len(inputs) <= 12, "Too many inputs." + return_dict = inputs[11] if len(inputs) > 11 else return_dict + labels = inputs[12] if len(inputs) > 12 else labels + assert len(inputs) <= 13, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -912,10 +925,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 12, "Too many inputs." + assert len(inputs) <= 13, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -955,19 +970,26 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): flat_inputs_embeds, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) output = transformer_outputs[0] logits = self.sequence_summary(output) logits = self.logits_proj(logits) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -987,7 +1009,12 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos ) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1001,6 +1028,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1008,27 +1036,12 @@ class 
TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[11] if len(inputs) > 11 else labels - if len(inputs) > 11: - inputs = inputs[:11] + labels = inputs[12] if len(inputs) > 12 else labels + if len(inputs) > 12: + inputs = inputs[:12] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1044,6 +1057,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1052,13 +1066,18 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + transformer_outputs[1:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -1075,7 +1094,12 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL ) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, 
inputs=None, @@ -1089,6 +1113,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -1102,30 +1127,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: - start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[11] if len(inputs) > 11 else start_positions - end_positions = inputs[12] if len(inputs) > 12 else end_positions - if len(inputs) > 11: - inputs = inputs[:11] + start_positions = inputs[12] if len(inputs) > 12 else start_positions + end_positions = inputs[13] if len(inputs) > 13 else end_positions + if len(inputs) > 12: + inputs = inputs[:12] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -1142,6 +1150,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1152,14 +1161,20 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + transformer_outputs[ - 1: - ] # Keep mems, hidden states, attentions if there are in it - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=transformer_outputs.hidden_states, + 
attentions=transformer_outputs.attentions, + ) diff --git a/src/transformers/modeling_tf_xlm_roberta.py b/src/transformers/modeling_tf_xlm_roberta.py index 5448595a7b..46bc96950c 100644 --- a/src/transformers/modeling_tf_xlm_roberta.py +++ b/src/transformers/modeling_tf_xlm_roberta.py @@ -62,8 +62,6 @@ XLM_ROBERTA_START_DOCSTRING = r""" config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. """ diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index e255e5adfd..2054d230ce 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -18,6 +18,8 @@ import logging +from dataclasses import dataclass +from typing import List, Optional, Tuple import numpy as np import tensorflow as tf @@ -25,9 +27,11 @@ import tensorflow as tf from .configuration_xlnet import XLNetConfig from .file_utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable, + replace_return_docstrings, ) from .modeling_tf_utils import ( TFCausalLanguageModelingLoss, @@ -47,6 +51,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "XLNetConfig" _TOKENIZER_FOR_DOC = "XLNetTokenizer" TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -436,6 +441,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): super().__init__(**kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = 
config.output_attentions + self.return_dict = config.return_dict self.mem_len = config.mem_len self.reuse_len = config.reuse_len @@ -586,6 +592,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=None, training=False, ): if isinstance(inputs, (tuple, list)): @@ -601,7 +608,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): use_cache = inputs[9] if len(inputs) > 9 else use_cache output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states - assert len(inputs) <= 12, "Too many inputs." + return_dict = inputs[12] if len(inputs) > 12 else return_dict + assert len(inputs) <= 13, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -615,12 +623,14 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) - assert len(inputs) <= 12, "Too many inputs." + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 13, "Too many inputs." 
else: input_ids = inputs output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension @@ -743,8 +753,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if mems is None: mems = [None] * len(self.layer) - attentions = [] - hidden_states = [] + attentions = [] if output_attentions else None + hidden_states = [] if output_hidden_states else None for i, layer_module in enumerate(self.layer): # cache new mems if self.mem_len is not None and self.mem_len > 0 and use_cache: @@ -776,22 +786,24 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): output = self.dropout(output_g if output_g is not None else output_h, training=training) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. 
beginning of forward() method) - outputs = (tf.transpose(output, perm=(1, 0, 2)),) - - if self.mem_len is not None and self.mem_len > 0 and use_cache: - outputs = outputs + (new_mems,) + output = tf.transpose(output, perm=(1, 0, 2)) + if not (self.mem_len is not None and self.mem_len > 0 and use_cache): + new_mems = None if output_hidden_states: if output_g is not None: hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs) else: hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states) - outputs = outputs + (hidden_states,) if output_attentions: attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) - outputs = outputs + (attentions,) - return outputs # outputs, (new_mems), (hidden_states), (attentions) + if not return_dict: + return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None) + + return TFXLNetModelOutput( + last_hidden_state=output, mems=new_mems, hidden_states=hidden_states, attentions=attentions + ) class TFXLNetPreTrainedModel(TFPreTrainedModel): @@ -803,6 +815,218 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel): base_model_prefix = "transformer" +@dataclass +class TFXLNetModelOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFXLNetModel`. + + Args: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_predict, hidden_size)`): + Sequence of hidden-states at the last layer of the model. + + ``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then + ``num_predict`` corresponds to ``sequence_length``. + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states. + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: tf.Tensor = None + mems: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFXLNetLMHeadModelOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFXLNetLMHeadModel`. + + Args: + loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + + ``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then + ``num_predict`` corresponds to ``sequence_length``. + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states. + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + mems: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFXLNetForSequenceClassificationOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFXLNetForSequenceClassification`. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states. + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + mems: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFXLNetForTokenClassificationOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFXLNetForTokenClassification`. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided): + Classification loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states. + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + mems: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFXLNetForMultipleChoiceOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFXLNetForMultipleChoice`. + + Args: + loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states. + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + mems: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +@dataclass +class TFXLNetForQuestionAnsweringSimpleOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFXLNetForQuestionAnsweringSimple`. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states. + Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + start_logits: tf.Tensor = None + end_logits: tf.Tensor = None + mems: Optional[List[tf.Tensor]] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + XLNET_START_DOCSTRING = r""" .. note:: @@ -885,6 +1109,11 @@ XLNET_INPUTS_DOCSTRING = r""" If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -898,29 +1127,13 @@ class TFXLNetModel(TFXLNetPreTrainedModel): self.transformer = TFXLNetMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlnet-base-cased", + output_type=TFXLNetModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the last layer of the model. - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 
- - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -980,6 +1193,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFXLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC) def call( self, inputs, @@ -994,6 +1208,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1003,24 +1218,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): Indices should be in ``[0, ..., config.vocab_size - 1]``. Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: - prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. Examples:: @@ -1045,10 +1242,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[12] if len(inputs) > 12 else labels - if len(inputs) > 12: - inputs = inputs[:12] + labels = inputs[13] if len(inputs) > 13 else labels + if len(inputs) > 13: + inputs = inputs[:13] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1065,21 +1263,30 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=return_dict, training=training, ) hidden_state = transformer_outputs[0] logits = self.lm_loss(hidden_state, training=training) - outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it - + loss = None if labels is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = labels[:, 1:] loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs - return outputs # return logits, (mems), (hidden states), (attentions) + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFXLNetLMHeadModelOutput( + loss=loss, + logits=logits, + mems=transformer_outputs.mems, + 
hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -1101,7 +1308,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlnet-base-cased", + output_type=TFXLNetForSequenceClassificationOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1116,6 +1328,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1125,31 +1338,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: - logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. 
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[12] if len(inputs) > 12 else labels - if len(inputs) > 12: - inputs = inputs[:12] + labels = inputs[13] if len(inputs) > 13 else labels + if len(inputs) > 13: + inputs = inputs[:13] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1166,19 +1360,26 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) - outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), 
logits, (hidden_states), (attentions) + return TFXLNetForSequenceClassificationOutput( + loss=loss, + logits=logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -1208,7 +1409,12 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlnet-base-cased", + output_type=TFXLNetForMultipleChoiceOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1223,6 +1429,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1231,24 +1438,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). 
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -1263,8 +1452,9 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): use_cache = inputs[9] if len(inputs) > 9 else use_cache output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states - labels = inputs[12] if len(inputs) > 12 else labels - assert len(inputs) <= 13, "Too many inputs." + return_dict = inputs[12] if len(inputs) > 12 else return_dict + labels = inputs[13] if len(inputs) > 13 else labels + assert len(inputs) <= 14, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -1278,10 +1468,12 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 13, "Too many inputs." + assert len(inputs) <= 14, "Too many inputs." else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -1312,19 +1504,26 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): use_cache, output_attentions, output_hidden_states, + return_dict=return_dict, training=training, ) output = transformer_outputs[0] logits = self.sequence_summary(output) logits = self.logits_proj(logits) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (mems), (hidden states), (attentions) + return TFXLNetForMultipleChoiceOutput( + loss=loss, + logits=reshaped_logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -1343,7 +1542,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, 
TFTokenClassificatio ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlnet-base-cased", + output_type=TFXLNetForTokenClassificationOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1358,6 +1562,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -1365,31 +1570,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: - logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): - Classification scores (before SoftMax). - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[12] if len(inputs) > 12 else labels - if len(inputs) > 12: - inputs = inputs[:12] + labels = inputs[13] if len(inputs) > 13 else labels + if len(inputs) > 13: + inputs = inputs[:13] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -1406,19 +1592,25 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) + output = transformer_outputs[0] - logits = self.classifier(output) + loss = None if labels is None else self.compute_loss(labels, logits) - outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs - - return outputs # (loss), logits, (hidden_states), (attentions) + return TFXLNetForTokenClassificationOutput( + loss=loss, + logits=logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) @add_start_docstrings( @@ -1435,7 +1627,12 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) 
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlnet-base-cased", + output_type=TFXLNetForQuestionAnsweringSimpleOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -1450,6 +1647,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer use_cache=True, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -1463,36 +1661,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: - loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. 
- hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[12] if len(inputs) > 12 else start_positions - end_positions = inputs[13] if len(inputs) > 13 else end_positions - if len(inputs) > 12: - inputs = inputs[:12] + start_positions = inputs[13] if len(inputs) > 13 else start_positions + end_positions = inputs[14] if len(inputs) > 14 else end_positions + if len(inputs) > 13: + inputs = inputs[:13] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) - end_positions = inputs.pop("end_positions", start_positions) + end_positions = inputs.pop("end_positions", end_positions) @@ -1510,6 +1685,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -1520,17 +1696,24 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + transformer_outputs[ - 
1: - ] # Keep mems, hidden states, attentions if there are in it - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFXLNetForQuestionAnsweringSimpleOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 9746fb008f..ddb655656a 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -711,7 +711,7 @@ class XLNetForTokenClassificationOutput(ModelOutput): @dataclass class XLNetForMultipleChoiceOutput(ModelOutput): """ - Base class for outputs of multiple choice models. + Output type of :class:`~transformers.XLNetForMultipleChoice`. Args: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): @@ -747,7 +747,7 @@ class XLNetForMultipleChoiceOutput(ModelOutput): @dataclass class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): """ - Base class for outputs of question answering models. + Output type of :class:`~transformers.XLNetForQuestionAnsweringSimple`. 
Args: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): @@ -784,7 +784,7 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): @dataclass class XLNetForQuestionAnsweringOutput(ModelOutput): """ - Base class for outputs of question answering models using a :obj:`SquadHead`. + Output type of :class:`~transformers.XLNetForQuestionAnswering`. Args: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): @@ -1227,7 +1227,6 @@ class XLNetModel(XLNetPreTrainedModel): # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) output = output.permute(1, 0, 2).contiguous() - # TODO Teven: fix this test to only use use_cache. if not use_cache: new_mems = None diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 07b47a10a8..76dd0f08d3 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -31,6 +31,14 @@ from .file_utils import ( add_start_docstrings, add_start_docstrings_to_callable, ) +from .modeling_tf_outputs import ( + TFBaseModelOutputWithPooling, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) from .modeling_tf_utils import ( TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, @@ -46,6 +54,7 @@ from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) +_CONFIG_FOR_DOC = "XxxConfig" _TOKENIZER_FOR_DOC = "XxxTokenizer" #################################################### @@ -117,35 +126,60 @@ class TFXxxMainLayer(tf.keras.layers.Layer): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models def call( - self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, 
training=False + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, ): - # We allow three types of multi-inputs: - # - traditional keyword arguments in the call method - # - all the arguments provided as a dict in the first positional argument of call - # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call - # The last two options are useful to use the tf.keras fit() method. - if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask - assert len(inputs) <= 5, "Too many inputs." - elif isinstance(inputs, dict): + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + output_attentions = inputs[6] if len(inputs) > 6 else output_attentions + output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states + return_dict = inputs[8] if len(inputs) > 8 else return_dict + assert len(inputs) <= 9, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + output_attentions = inputs.get("output_attentions", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) + assert len(inputs) <= 9, "Too many inputs." 
else: input_ids = inputs + output_attentions = output_attentions if output_attentions is not None else self.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + return_dict = return_dict if return_dict is not None else self.return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if attention_mask is None: - attention_mask = tf.fill(shape_list(input_ids), 1) + attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: - token_type_ids = tf.fill(shape_list(input_ids), 0) + token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. 
# Sizes are [batch_size, 1, 1, to_seq_length] @@ -174,14 +208,29 @@ class TFXxxMainLayer(tf.keras.layers.Layer): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - ################################## - # Replace this with your model code - embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) - sequence_output = encoder_outputs[0] - outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) + encoder_outputs = self.encoder( + embedding_output, + extended_attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=training, + ) - return outputs # sequence_output, (hidden_states), (attentions) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + if not return_dict: + return (sequence_output, pooled_output,) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) #################################################### @@ -274,6 +323,11 @@ XXX_INPUTS_DOCSTRING = r""" (if set to :obj:`False`) for evaluation. output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. 
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -287,32 +341,13 @@ class TFXxxModel(TFXxxPreTrainedModel): self.transformer = TFXxxMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xxx-base-cased", + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) def call(self, inputs, **kwargs): - r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during XXX pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ outputs = self.transformer(inputs, **kwargs) return outputs @@ -329,7 +364,12 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss): self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xxx-base-cased", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -340,6 +380,7 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -349,27 +390,12 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss): Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs: - prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for 
each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -382,19 +408,22 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss): inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) sequence_output = outputs[0] prediction_scores = self.mlm(sequence_output, training=training) - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, prediction_scores) - if labels is not None: - loss = self.compute_loss(labels, prediction_scores) - outputs = (loss,) + outputs + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is 
not None else output - return outputs # (loss), prediction_scores, (hidden_states), (attentions) + return TFMaskedLMOutput( + loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -414,7 +443,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat ) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xxx-base-cased", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -425,6 +459,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -434,27 +469,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs: - logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. """ + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -467,6 +487,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -475,13 +496,15 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -509,7 +532,12 @@ class 
TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xxx-base-cased", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs, @@ -520,6 +548,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -527,24 +556,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension - of the input tensors. (see `input_ids` above) - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs: - classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: - `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + of the input tensors. (see `input_ids` above)s after the attention softmax, used to compute the weighted average in the self-attention heads. """ if isinstance(inputs, (tuple, list)): @@ -556,8 +568,9 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states - labels = inputs[8] if len(inputs) > 8 else labels - assert len(inputs) <= 9, "Too many inputs." + return_dict = inputs[8] if len(inputs) > 8 else return_dict + labels = inputs[9] if len(inputs) > 9 else labels + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -567,10 +580,12 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) output_attentions = inputs.get("output_attentions", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + return_dict = inputs.get("return_dict", return_dict) labels = inputs.get("labels", labels) - assert len(inputs) <= 9, "Too many inputs." + assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -598,6 +613,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): flat_inputs_embeds, output_attentions, output_hidden_states, + return_dict, ] outputs = self.transformer(flat_inputs, training=training) @@ -608,13 +624,15 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, reshaped_logits) - if labels is not None: - loss = self.compute_loss(labels, reshaped_logits) - outputs = (loss,) + outputs + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + return TFMultipleChoiceModelOutput( + loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -634,7 +652,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos ) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xxx-base-cased", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -645,6 +668,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, labels=None, training=False, ): @@ -652,27 +676,12 @@ class 
TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs: - scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - labels = inputs[8] if len(inputs) > 8 else labels - if len(inputs) > 8: - inputs = inputs[:8] + labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) @@ -685,6 +694,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -693,13 +703,15 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss = None if labels is None else self.compute_loss(labels, logits) - if labels is not None: - loss = self.compute_loss(labels, logits) - outputs = (loss,) + outputs + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output - return outputs # (loss), logits, (hidden_states), (attentions) + return TFTokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) @add_start_docstrings( @@ -718,7 +730,12 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss): ) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xxx-base-cased", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) def call( self, inputs=None, @@ -729,6 +746,7 @@ class 
TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss): inputs_embeds=None, output_attentions=None, output_hidden_states=None, + return_dict=None, start_positions=None, end_positions=None, training=False, @@ -742,30 +760,13 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - - Return: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs: - start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
""" + return_dict = return_dict if return_dict is not None else self.transformer.return_dict if isinstance(inputs, (tuple, list)): - start_positions = inputs[8] if len(inputs) > 8 else start_positions - end_positions = inputs[9] if len(inputs) > 9 else end_positions - if len(inputs) > 8: - inputs = inputs[:8] + start_positions = inputs[9] if len(inputs) > 9 else start_positions + end_positions = inputs[10] if len(inputs) > 10 else end_positions + if len(inputs) > 9: + inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) @@ -779,6 +780,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss): inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, training=training, ) @@ -789,12 +791,20 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss): start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + outputs[2:] - + loss = None if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions - loss = self.compute_loss(labels, outputs[:2]) - outputs = (loss,) + outputs + loss = self.compute_loss(labels, (start_logits, end_logits)) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py 
b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py index 3e12b3f745..cd700e9aab 100644 --- a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py +++ b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py @@ -24,9 +24,11 @@ from .utils import CACHE_DIR, require_tf, slow if is_tf_available(): + import tensorflow as tf from transformers.modeling_tf_xxx import ( TFXxxModel, TFXxxForMaskedLM, + TFXxxForMultipleChoice, TFXxxForSequenceClassification, TFXxxForTokenClassification, TFXxxForQuestionAnswering, @@ -40,6 +42,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): ( TFXxxModel, TFXxxForMaskedLM, + TFXxxForMultipleChoice, TFXxxForQuestionAnswering, TFXxxForSequenceClassification, TFXxxForTokenClassification, @@ -128,6 +131,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, + return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -137,33 +141,26 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): ): model = TFXxxModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - sequence_output, pooled_output = model(inputs) + result = model(inputs) inputs = [input_ids, input_mask] - sequence_output, pooled_output = model(inputs) + result = model(inputs) - sequence_output, pooled_output = model(input_ids) + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - "pooled_output": pooled_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) - self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + 
self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size]) def create_and_check_xxx_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFXxxForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) def create_and_check_xxx_for_sequence_classification( @@ -172,22 +169,32 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): config.num_labels = self.num_labels model = TFXxxForSequenceClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) + def create_and_check_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFXxxForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + 
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) + def create_and_check_xxx_for_token_classification( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): config.num_labels = self.num_labels model = TFXxxForTokenClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual( list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] ) @@ -197,11 +204,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): ): model = TFXxxForQuestionAnswering(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py index f59931424b..ca807e8487 100644 --- a/tests/test_modeling_tf_albert.py +++ b/tests/test_modeling_tf_albert.py @@ -116,6 +116,7 @@ class TFAlbertModelTester: max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, + return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -129,21 +130,17 @@ class TFAlbertModelTester: # 'token_type_ids': token_type_ids} # sequence_output, pooled_output = model(**inputs) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - sequence_output, pooled_output 
= model(inputs) + result = model(inputs) inputs = [input_ids, input_mask] - sequence_output, pooled_output = model(inputs) + result = model(inputs) - sequence_output, pooled_output = model(input_ids) + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - "pooled_output": pooled_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) - self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size]) def create_and_check_albert_for_pretraining( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -151,28 +148,19 @@ class TFAlbertModelTester: config.num_labels = self.num_labels model = TFAlbertForPreTraining(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - prediction_scores, sop_scores = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - "sop_scores": sop_scores.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) - self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["sop_logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_albert_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFAlbertForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": 
token_type_ids} - (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_albert_for_sequence_classification( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -180,10 +168,7 @@ class TFAlbertModelTester: config.num_labels = self.num_labels model = TFAlbertForSequenceClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_albert_for_question_answering( @@ -191,11 +176,7 @@ class TFAlbertModelTester: ): model = TFAlbertForQuestionAnswering(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py index 7e1884bafc..5026ce55fb 100644 --- a/tests/test_modeling_tf_bert.py +++ b/tests/test_modeling_tf_bert.py @@ -118,6 +118,7 @@ class TFBertModelTester: max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, + return_dict=True, ) return config, 
input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -130,18 +131,14 @@ class TFBertModelTester: sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] - sequence_output, pooled_output = model(inputs) + result = model(inputs) - sequence_output, pooled_output = model(input_ids) + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - "pooled_output": pooled_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) - self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size]) def create_and_check_bert_lm_head( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -153,7 +150,7 @@ class TFBertModelTester: "attention_mask": input_mask, "token_type_ids": token_type_ids, } - (prediction_scores,) = model(inputs) + prediction_scores = model(inputs)["logits"] self.parent.assertListEqual( list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] ) @@ -167,39 +164,27 @@ class TFBertModelTester: "attention_mask": input_mask, "token_type_ids": token_type_ids, } - (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_bert_for_next_sequence_prediction( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model 
= TFBertForNextSentencePrediction(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (seq_relationship_score,) = model(inputs) - result = { - "seq_relationship_score": seq_relationship_score.numpy(), - } - self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, 2]) def create_and_check_bert_for_pretraining( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFBertForPreTraining(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - prediction_scores, seq_relationship_score = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - "seq_relationship_score": seq_relationship_score.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) - self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) + self.parent.assertListEqual(list(result["seq_relationship_logits"].shape), [self.batch_size, 2]) def create_and_check_bert_for_sequence_classification( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -212,8 +197,7 @@ class TFBertModelTester: "token_type_ids": token_type_ids, } - (logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_bert_for_multiple_choice( @@ -229,8 +213,7 @@ class TFBertModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - 
(logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def create_and_check_bert_for_token_classification( @@ -243,10 +226,7 @@ class TFBertModelTester: "attention_mask": input_mask, "token_type_ids": token_type_ids, } - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) def create_and_check_bert_for_question_answering( @@ -259,8 +239,7 @@ class TFBertModelTester: "token_type_ids": token_type_ids, } - start_logits, end_logits = model(inputs) - result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) diff --git a/tests/test_modeling_tf_camembert.py b/tests/test_modeling_tf_camembert.py index fa962dd615..3eb47beb28 100644 --- a/tests/test_modeling_tf_camembert.py +++ b/tests/test_modeling_tf_camembert.py @@ -35,7 +35,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase): [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32, ) # J'aime le camembert !" - output = model(input_ids)[0] + output = model(input_ids)["last_hidden_state"] expected_shape = tf.TensorShape((1, 10, 768)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. 
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 9aafb5d600..0353314bab 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -146,7 +146,8 @@ class TFModelTesterMixin: tf.saved_model.save(model, tmpdirname) model = tf.keras.models.load_model(tmpdirname) outputs = model(inputs_dict) - hidden_states = [t.numpy() for t in outputs[-1]] + output = outputs[list(outputs.keys())[-1]] if isinstance(outputs, dict) else outputs[-1] + hidden_states = [t.numpy() for t in output] self.assertEqual(len(outputs), num_out) self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertListEqual( @@ -177,7 +178,8 @@ class TFModelTesterMixin: tf.saved_model.save(model, tmpdirname) model = tf.keras.models.load_model(tmpdirname) outputs = model(inputs_dict) - attentions = [t.numpy() for t in outputs[-1]] + output = outputs[list(outputs.keys())[-1]] if isinstance(outputs, dict) else outputs[-1] + attentions = [t.numpy() for t in output] self.assertEqual(len(outputs), num_out) self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( @@ -238,6 +240,8 @@ class TFModelTesterMixin: # Make sure we don't have nans if isinstance(after_outputs, tf.Tensor): out_1 = after_outputs.numpy() + elif isinstance(after_outputs, dict): + out_1 = after_outputs[list(after_outputs.keys())[0]] else: out_1 = after_outputs[0].numpy() out_2 = outputs[0].numpy() diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py index 462a8bb2c2..854f5b565a 100644 --- a/tests/test_modeling_tf_ctrl.py +++ b/tests/test_modeling_tf_ctrl.py @@ -89,9 +89,10 @@ class TFCTRLModelTester(object): # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings + n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, - # 
initializer_range=self.initializer_range + # initializer_range=self.initializer_range, + return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) @@ -111,30 +112,22 @@ class TFCTRLModelTester(object): def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFCTRLModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - sequence_output = model(inputs)[0] + result = model(inputs) inputs = [input_ids, None, input_mask] # None is the input for 'past' - sequence_output = model(inputs)[0] + result = model(inputs) - sequence_output = model(input_ids)[0] + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFCTRLLMHeadModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - prediction_scores = model(inputs)[0] - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py index e3c83a47a7..3f73958378 100644 --- a/tests/test_modeling_tf_distilbert.py +++ b/tests/test_modeling_tf_distilbert.py @@ -89,6 +89,7 @@ class 
TFDistilBertModelTester: attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, + return_dict=True, ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -99,18 +100,14 @@ class TFDistilBertModelTester: model = TFDistilBertModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} - outputs = model(inputs) - sequence_output = outputs[0] + result = model(inputs) inputs = [input_ids, input_mask] - (sequence_output,) = model(inputs) + result = model(inputs) - result = { - "sequence_output": sequence_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_distilbert_for_masked_lm( @@ -118,11 +115,8 @@ class TFDistilBertModelTester: ): model = TFDistilBertForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} - (prediction_scores,) = model(inputs) - result = {"prediction_scores": prediction_scores.numpy()} - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_distilbert_for_question_answering( self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -132,8 +126,7 @@ class TFDistilBertModelTester: "input_ids": input_ids, "attention_mask": input_mask, } - start_logits, end_logits = model(inputs) - result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) 
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -143,8 +136,7 @@ class TFDistilBertModelTester: config.num_labels = self.num_labels model = TFDistilBertForSequenceClassification(config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} - (logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_distilbert_for_multiple_choice( @@ -158,8 +150,7 @@ class TFDistilBertModelTester: "input_ids": multiple_choice_inputs_ids, "attention_mask": multiple_choice_input_mask, } - (logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def create_and_check_distilbert_for_token_classification( @@ -168,10 +159,7 @@ class TFDistilBertModelTester: config.num_labels = self.num_labels model = TFDistilBertForTokenClassification(config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_tf_electra.py b/tests/test_modeling_tf_electra.py index 625f935c62..e986137567 100644 --- a/tests/test_modeling_tf_electra.py +++ b/tests/test_modeling_tf_electra.py @@ -95,6 +95,7 @@ class TFElectraModelTester: max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, + return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -104,18 +105,15 @@ class TFElectraModelTester: ): model = TFElectraModel(config=config) inputs = {"input_ids": 
input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (sequence_output,) = model(inputs) + result = model(inputs) inputs = [input_ids, input_mask] - (sequence_output,) = model(inputs) + result = model(inputs) - (sequence_output,) = model(input_ids) + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_electra_for_masked_lm( @@ -123,24 +121,16 @@ class TFElectraModelTester: ): model = TFElectraForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_electra_for_pretraining( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFElectraForPreTraining(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual(list(result["prediction_scores"].shape), [self.batch_size, self.seq_length]) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length]) def create_and_check_electra_for_sequence_classification( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -148,10 +138,7 @@ 
class TFElectraModelTester: config.num_labels = self.num_labels model = TFElectraForSequenceClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_electra_for_multiple_choice( @@ -167,8 +154,7 @@ class TFElectraModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - (logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def create_and_check_electra_for_question_answering( @@ -176,11 +162,7 @@ class TFElectraModelTester: ): model = TFElectraForQuestionAnswering(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -190,10 +172,7 @@ class TFElectraModelTester: config.num_labels = self.num_labels model = TFElectraForTokenClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py index 
399c78ca53..7ec611e035 100644 --- a/tests/test_modeling_tf_flaubert.py +++ b/tests/test_modeling_tf_flaubert.py @@ -113,6 +113,7 @@ class TFFlaubertModelTester: summary_type=self.summary_type, use_proj=self.use_proj, bos_token_id=self.bos_token_id, + return_dict=True, ) return ( @@ -141,16 +142,12 @@ class TFFlaubertModelTester: ): model = TFFlaubertModel(config=config) inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} - outputs = model(inputs) + result = model(inputs) inputs = [input_ids, input_mask] - outputs = model(inputs) - sequence_output = outputs[0] - result = { - "sequence_output": sequence_output.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_flaubert_lm_head( @@ -168,13 +165,7 @@ class TFFlaubertModelTester: model = TFFlaubertWithLMHeadModel(config) inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} - outputs = model(inputs) - - logits = outputs[0] - - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) @@ -194,12 +185,7 @@ class TFFlaubertModelTester: inputs = {"input_ids": input_ids, "lengths": input_lengths} - start_logits, end_logits = model(inputs) - - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -220,11 +206,7 @@ class TFFlaubertModelTester: inputs = {"input_ids": input_ids, "lengths": input_lengths} - (logits,) = model(inputs) - - result = { - "logits": 
logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) @@ -243,10 +225,7 @@ class TFFlaubertModelTester: config.num_labels = self.num_labels model = TFFlaubertForTokenClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) def create_and_check_flaubert_for_multiple_choice( @@ -271,8 +250,7 @@ class TFFlaubertModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - (logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index 7728c8b1f6..32e725c028 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -102,6 +102,7 @@ class TFGPT2ModelTester: # initializer_range=self.initializer_range bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id, + return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) @@ -125,18 +126,15 @@ class TFGPT2ModelTester: "attention_mask": input_mask, "token_type_ids": token_type_ids, } - sequence_output = model(inputs)[0] + result = model(inputs) inputs = [input_ids, None, input_mask] # None is the input for 'past' - sequence_output = model(inputs)[0] + result = model(inputs) - sequence_output = model(input_ids)[0] + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, 
self.seq_length, self.hidden_size], + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size], ) def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): @@ -150,7 +148,7 @@ class TFGPT2ModelTester: self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - output, past = outputs + output, past = outputs.to_tuple() # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) @@ -160,8 +158,8 @@ class TFGPT2ModelTester: next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1) - output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids) - output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past) + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"] # select random slice random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) @@ -183,7 +181,7 @@ class TFGPT2ModelTester: attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1) # first forward pass - output, past = model(input_ids, attention_mask=attn_mask) + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) @@ -202,8 +200,8 @@ class TFGPT2ModelTester: attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1) # get two different outputs - output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask) - output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask) + 
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)["last_hidden_state"] # select random slice random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) @@ -220,12 +218,9 @@ class TFGPT2ModelTester: "attention_mask": input_mask, "token_type_ids": token_type_ids, } - prediction_scores = model(inputs)[0] - result = { - "prediction_scores": prediction_scores.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size], + list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size], ) def create_and_check_gpt2_double_head( @@ -243,8 +238,7 @@ class TFGPT2ModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - lm_logits, mc_logits = model(inputs)[:2] - result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()} + result = model(inputs) self.parent.assertListEqual( list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size], ) diff --git a/tests/test_modeling_tf_mobilebert.py b/tests/test_modeling_tf_mobilebert.py index e43d0d84cf..41dd522f53 100644 --- a/tests/test_modeling_tf_mobilebert.py +++ b/tests/test_modeling_tf_mobilebert.py @@ -138,6 +138,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, embedding_size=self.embedding_size, + return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -147,33 +148,26 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): ): model = TFMobileBertModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - sequence_output, 
pooled_output = model(inputs) + result = model(inputs) inputs = [input_ids, input_mask] - sequence_output, pooled_output = model(inputs) + result = model(inputs) - sequence_output, pooled_output = model(input_ids) + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - "pooled_output": pooled_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) - self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size]) def create_and_check_mobilebert_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFMobileBertForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) def create_and_check_mobilebert_for_next_sequence_prediction( @@ -181,26 +175,19 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): ): model = TFMobileBertForNextSentencePrediction(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (seq_relationship_score,) = model(inputs) - result = { - "seq_relationship_score": seq_relationship_score.numpy(), - } - self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) + result = model(inputs) + 
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, 2]) def create_and_check_mobilebert_for_pretraining( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFMobileBertForPreTraining(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - prediction_scores, seq_relationship_score = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - "seq_relationship_score": seq_relationship_score.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) - self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) + self.parent.assertListEqual(list(result["seq_relationship_logits"].shape), [self.batch_size, 2]) def create_and_check_mobilebert_for_sequence_classification( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -208,10 +195,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): config.num_labels = self.num_labels model = TFMobileBertForSequenceClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_mobilebert_for_multiple_choice( @@ -227,10 +211,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) 
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def create_and_check_mobilebert_for_token_classification( @@ -239,10 +220,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): config.num_labels = self.num_labels model = TFMobileBertForTokenClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual( list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] ) @@ -252,11 +230,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): ): model = TFMobileBertForQuestionAnswering(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) diff --git a/tests/test_modeling_tf_openai_gpt.py b/tests/test_modeling_tf_openai_gpt.py index 8e32949102..b9f86fed58 100644 --- a/tests/test_modeling_tf_openai_gpt.py +++ b/tests/test_modeling_tf_openai_gpt.py @@ -94,9 +94,10 @@ class TFOpenAIGPTModelTester: # hidden_dropout_prob=self.hidden_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob, n_positions=self.max_position_embeddings, - n_ctx=self.max_position_embeddings + n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, - # initializer_range=self.initializer_range + # initializer_range=self.initializer_range, + return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) @@ -116,30 +117,22 @@ class 
TFOpenAIGPTModelTester: def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFOpenAIGPTModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - sequence_output = model(inputs)[0] + result = model(inputs) inputs = [input_ids, input_mask] - sequence_output = model(inputs)[0] + result = model(inputs) - sequence_output = model(input_ids)[0] + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFOpenAIGPTLMHeadModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - prediction_scores = model(inputs)[0] - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_openai_gpt_double_head( self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args @@ -156,8 +149,7 @@ class TFOpenAIGPTModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - lm_logits, mc_logits = model(inputs)[:2] - result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()} + result = model(inputs) self.parent.assertListEqual( list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] ) diff --git 
a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py index 65752e994a..04dcf20af8 100644 --- a/tests/test_modeling_tf_roberta.py +++ b/tests/test_modeling_tf_roberta.py @@ -95,6 +95,7 @@ class TFRobertaModelTester: max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, + return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -104,31 +105,23 @@ class TFRobertaModelTester: ): model = TFRobertaModel(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - sequence_output = model(inputs)[0] + result = model(inputs) inputs = [input_ids, input_mask] - sequence_output = model(inputs)[0] + result = model(inputs) - sequence_output = model(input_ids)[0] + result = model(input_ids) - result = { - "sequence_output": sequence_output.numpy(), - } self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_roberta_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFRobertaForMaskedLM(config=config) - prediction_scores = model([input_ids, input_mask, token_type_ids])[0] - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + result = model([input_ids, input_mask, token_type_ids]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_roberta_for_token_classification( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -136,10 +129,7 @@ class 
TFRobertaModelTester: config.num_labels = self.num_labels model = TFRobertaForTokenClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) def create_and_check_roberta_for_question_answering( @@ -147,11 +137,7 @@ class TFRobertaModelTester: ): model = TFRobertaForQuestionAnswering(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -168,10 +154,7 @@ class TFRobertaModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index 3990ba76c3..fc7f72667a 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -78,6 +78,7 @@ class TFT5ModelTester: bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, decoder_start_token_id=self.pad_token_id, + return_dict=True, ) return (config, input_ids, input_mask, token_labels) @@ -89,22 +90,14 @@ class TFT5ModelTester: "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } - decoder_output, decoder_past, encoder_output = 
model(inputs) + result = model(inputs) - decoder_output, decoder_past, encoder_output = model( - input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids - ) - result = { - "encoder_output": encoder_output.numpy(), - "decoder_past": decoder_past, - "decoder_output": decoder_output.numpy(), - } - self.parent.assertListEqual( - list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] - ) - self.parent.assertListEqual( - list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] - ) + result = model(input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids) + decoder_output = result["last_hidden_state"] + decoder_past = result["decoder_past_key_values"] + encoder_output = result["encoder_last_hidden_state"] + self.parent.assertListEqual(list(encoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(decoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size]) self.parent.assertEqual(len(decoder_past), 2) # decoder_past[0] should correspond to encoder output self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output))) @@ -121,14 +114,9 @@ class TFT5ModelTester: "decoder_attention_mask": input_mask, } - prediction_scores, _, _ = model(inputs_dict) + result = model(inputs_dict) - result = { - "prediction_scores": prediction_scores.numpy(), - } - self.parent.assertListEqual( - list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] - ) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask): model = TFT5Model(config=config).get_decoder() diff --git a/tests/test_modeling_tf_transfo_xl.py b/tests/test_modeling_tf_transfo_xl.py index 408b3c02b0..12e3be5bd5 100644 --- 
a/tests/test_modeling_tf_transfo_xl.py +++ b/tests/test_modeling_tf_transfo_xl.py @@ -79,6 +79,7 @@ class TFTransfoXLModelTester: div_val=self.div_val, n_layer=self.num_hidden_layers, eos_token_id=self.eos_token_id, + return_dict=True, ) return (config, input_ids_1, input_ids_2, lm_labels) @@ -90,11 +91,11 @@ class TFTransfoXLModelTester: def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): model = TFTransfoXLModel(config) - hidden_states_1, mems_1 = model(input_ids_1) + hidden_states_1, mems_1 = model(input_ids_1).to_tuple() inputs = {"input_ids": input_ids_2, "mems": mems_1} - hidden_states_2, mems_2 = model(inputs) + hidden_states_2, mems_2 = model(inputs).to_tuple() result = { "hidden_states_1": hidden_states_1.numpy(), @@ -121,16 +122,16 @@ class TFTransfoXLModelTester: def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): model = TFTransfoXLLMHeadModel(config) - lm_logits_1, mems_1 = model(input_ids_1) + lm_logits_1, mems_1 = model(input_ids_1).to_tuple() inputs = {"input_ids": input_ids_1, "labels": lm_labels} - _, mems_1 = model(inputs) + _, mems_1 = model(inputs).to_tuple() - lm_logits_2, mems_2 = model([input_ids_2, mems_1]) + lm_logits_2, mems_2 = model([input_ids_2, mems_1]).to_tuple() inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels} - _, mems_2 = model(inputs) + _, mems_2 = model(inputs).to_tuple() result = { "mems_1": [mem.numpy() for mem in mems_1], diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py index 1903f4a8df..7f5007ad88 100644 --- a/tests/test_modeling_tf_xlm.py +++ b/tests/test_modeling_tf_xlm.py @@ -112,6 +112,7 @@ class TFXLMModelTester: summary_type=self.summary_type, use_proj=self.use_proj, bos_token_id=self.bos_token_id, + return_dict=True, ) return ( @@ -140,16 +141,12 @@ class TFXLMModelTester: ): model = TFXLMModel(config=config) inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} 
- outputs = model(inputs) + result = model(inputs) inputs = [input_ids, input_mask] - outputs = model(inputs) - sequence_output = outputs[0] - result = { - "sequence_output": sequence_output.numpy(), - } + result = model(inputs) self.parent.assertListEqual( - list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_xlm_lm_head( @@ -169,11 +166,7 @@ class TFXLMModelTester: inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} outputs = model(inputs) - logits = outputs[0] - - result = { - "logits": logits.numpy(), - } + result = outputs self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) @@ -193,12 +186,7 @@ class TFXLMModelTester: inputs = {"input_ids": input_ids, "lengths": input_lengths} - start_logits, end_logits = model(inputs) - - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -219,11 +207,7 @@ class TFXLMModelTester: inputs = {"input_ids": input_ids, "lengths": input_lengths} - (logits,) = model(inputs) - - result = { - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) @@ -242,10 +226,7 @@ class TFXLMModelTester: config.num_labels = self.num_labels model = TFXLMForTokenClassification(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = model(inputs) 
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) def create_and_check_xlm_for_multiple_choice( @@ -270,8 +251,7 @@ class TFXLMModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - (logits,) = model(inputs) - result = {"logits": logits.numpy()} + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_tf_xlm_roberta.py b/tests/test_modeling_tf_xlm_roberta.py index c27b0576b8..088574c508 100644 --- a/tests/test_modeling_tf_xlm_roberta.py +++ b/tests/test_modeling_tf_xlm_roberta.py @@ -36,7 +36,7 @@ class TFFlaubertModelIntegrationTest(unittest.TestCase): "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), } - output = model(features)[0] + output = model(features)["last_hidden_state"] expected_shape = tf.TensorShape((1, 6, 768)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. 
diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py index 0299cb2fb4..f8b92186ca 100644 --- a/tests/test_modeling_tf_xlnet.py +++ b/tests/test_modeling_tf_xlnet.py @@ -110,6 +110,7 @@ class TFXLNetModelTester: bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, eos_token_id=self.eos_token_id, + return_dict=True, ) return ( @@ -147,17 +148,10 @@ class TFXLNetModelTester: model = TFXLNetModel(config) inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids} - - _, _ = model(inputs) + result = model(inputs) inputs = [input_ids_1, input_mask] - - outputs, mems_1 = model(inputs) - - result = { - "mems_1": [mem.numpy() for mem in mems_1], - "outputs": outputs.numpy(), - } + result = model(inputs) config.mem_len = 0 model = TFXLNetModel(config) @@ -165,10 +159,10 @@ class TFXLNetModelTester: self.parent.assertEqual(len(no_mems_outputs), 1) self.parent.assertListEqual( - list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size] + list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size] ) self.parent.assertListEqual( - list(list(mem.shape) for mem in result["mems_1"]), + list(list(mem.shape) for mem in result["mems"]), [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) @@ -189,16 +183,13 @@ class TFXLNetModelTester: model = TFXLNetLMHeadModel(config) inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids} - - all_logits_1, mems_1 = model(inputs_1) + all_logits_1, mems_1 = model(inputs_1).to_tuple() inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids} - - all_logits_2, mems_2 = model(inputs_2) + all_logits_2, mems_2 = model(inputs_2).to_tuple() inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping} - - logits, _ = model(inputs_3) + logits, _ = model(inputs_3).to_tuple() result = { "mems_1": [mem.numpy() for mem in mems_1], 
@@ -240,13 +231,7 @@ class TFXLNetModelTester: model = TFXLNetForQuestionAnsweringSimple(config) inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids} - start_logits, end_logits, mems = model(inputs) - - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), - "mems": [m.numpy() for m in mems], - } + result = model(inputs) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -271,16 +256,11 @@ class TFXLNetModelTester: ): model = TFXLNetForSequenceClassification(config) - logits, mems_1 = model(input_ids_1) - - result = { - "mems_1": [mem.numpy() for mem in mems_1], - "logits": logits.numpy(), - } + result = model(input_ids_1) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) self.parent.assertListEqual( - list(list(mem.shape) for mem in result["mems_1"]), + list(list(mem.shape) for mem in result["mems"]), [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) @@ -305,16 +285,12 @@ class TFXLNetModelTester: "attention_mask": input_mask, # 'token_type_ids': token_type_ids } - logits, mems_1 = model(inputs) - result = { - "mems_1": [mem.numpy() for mem in mems_1], - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual( list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels] ) self.parent.assertListEqual( - list(list(mem.shape) for mem in result["mems_1"]), + list(list(mem.shape) for mem in result["mems"]), [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) @@ -342,15 +318,11 @@ class TFXLNetModelTester: "attention_mask": multiple_choice_input_mask, "token_type_ids": multiple_choice_token_type_ids, } - (logits, mems_1) = model(inputs) - result = { - "mems_1": [mem.numpy() for 
mem in mems_1], - "logits": logits.numpy(), - } + result = model(inputs) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual( - list(list(mem.shape) for mem in result["mems_1"]), + list(list(mem.shape) for mem in result["mems"]), [[self.seq_length, self.batch_size * self.num_choices, self.hidden_size]] * self.num_hidden_layers, )