Tf model outputs (#6247)

* TF outputs and test on BERT

* Albert to DistilBert

* All remaining TF models except T5

* Documentation

* One file forgotten

* TF outputs and test on BERT

* Albert to DistilBert

* All remaining TF models except T5

* Documentation

* One file forgotten

* Add new models and fix issues

* Quality improvements

* Add T5

* A bit of cleanup

* Fix for slow tests

* Style
This commit is contained in:
Sylvain Gugger
2020-08-05 11:34:39 -04:00
committed by GitHub
parent bd0eab351a
commit c67d1a0259
51 changed files with 3253 additions and 2430 deletions

View File

@@ -50,7 +50,10 @@ AlbertTokenizer
Albert specific outputs Albert specific outputs
~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_albert.AlbertForPretrainingOutput .. autoclass:: transformers.modeling_albert.AlbertForPreTrainingOutput
:members:
.. autoclass:: transformers.modeling_tf_albert.TFAlbertForPreTrainingOutput
:members: :members:

View File

@@ -57,7 +57,10 @@ BertTokenizerFast
Bert specific outputs Bert specific outputs
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_bert.BertForPretrainingOutput .. autoclass:: transformers.modeling_bert.BertForPreTrainingOutput
:members:
.. autoclass:: transformers.modeling_tf_bert.TFBertForPreTrainingOutput
:members: :members:

View File

@@ -74,7 +74,10 @@ ElectraTokenizerFast
Electra specific outputs Electra specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_electra.ElectraForPretrainingOutput .. autoclass:: transformers.modeling_electra.ElectraForPreTrainingOutput
:members:
.. autoclass:: transformers.modeling_tf_electra.TFElectraForPreTrainingOutput
:members: :members:
@@ -106,6 +109,13 @@ ElectraForSequenceClassification
:members: :members:
ElectraForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForMultipleChoice
:members:
ElectraForTokenClassification ElectraForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -141,6 +151,20 @@ TFElectraForMaskedLM
:members: :members:
TFElectraForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForSequenceClassification
:members:
TFElectraForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForMultipleChoice
:members:
TFElectraForTokenClassification TFElectraForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -77,6 +77,9 @@ OpenAI specific outputs
.. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput .. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
:members: :members:
.. autoclass:: transformers.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
:members:
OpenAIGPTModel OpenAIGPTModel
~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -64,6 +64,9 @@ GPT2 specific outputs
.. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput .. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput
:members: :members:
.. autoclass:: transformers.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
:members:
GPT2Model GPT2Model
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~

View File

@@ -59,7 +59,10 @@ MobileBertTokenizerFast
MobileBert specific outputs MobileBert specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_mobilebert.MobileBertForPretrainingOutput .. autoclass:: transformers.modeling_mobilebert.MobileBertForPreTrainingOutput
:members:
.. autoclass:: transformers.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
:members: :members:

View File

@@ -63,6 +63,12 @@ TransfoXL specific outputs
.. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput .. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput
:members: :members:
.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLModelOutput
:members:
.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
:members:
TransfoXLModel TransfoXLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -74,6 +74,24 @@ XLNet specific outputs
.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput .. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput
:members: :members:
.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetModelOutput
:members:
.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
:members:
.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
:members:
.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
:members:
.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
:members:
.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
:members:
XLNetModel XLNetModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -190,7 +190,7 @@ def add_end_docstrings(*docstr):
return docstring_decorator return docstring_decorator
RETURN_INTRODUCTION = r""" PT_RETURN_INTRODUCTION = r"""
Returns: Returns:
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`:
A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a
@@ -200,6 +200,16 @@ RETURN_INTRODUCTION = r"""
""" """
TF_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`:
A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a
tuple of :obj:`tf.Tensor` comprising various elements depending on the configuration
(:class:`~transformers.{config_class}`) and inputs.
"""
def _get_indent(t): def _get_indent(t):
"""Returns the indentation in the first line of t""" """Returns the indentation in the first line of t"""
search = re.search(r"^(\s*)\S", t) search = re.search(r"^(\s*)\S", t)
@@ -249,7 +259,8 @@ def _prepare_output_docstrings(output_type, config_class):
# Add the return introduction # Add the return introduction
full_output_type = f"{output_type.__module__}.{output_type.__name__}" full_output_type = f"{output_type.__module__}.{output_type.__name__}"
intro = RETURN_INTRODUCTION.format(full_output_type=full_output_type, config_class=config_class) intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION
intro = intro.format(full_output_type=full_output_type, config_class=config_class)
return intro + docstrings return intro + docstrings

View File

@@ -407,9 +407,9 @@ class AlbertPreTrainedModel(PreTrainedModel):
@dataclass @dataclass
class AlbertForPretrainingOutput(ModelOutput): class AlbertForPreTrainingOutput(ModelOutput):
""" """
Output type of :class:`~transformers.AlbertForPretrainingModel`. Output type of :class:`~transformers.AlbertForPreTrainingModel`.
Args: Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
@@ -643,7 +643,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
return self.predictions.decoder return self.predictions.decoder
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AlbertForPretrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
@@ -728,7 +728,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
output = (prediction_scores, sop_scores) + outputs[2:] output = (prediction_scores, sop_scores) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
return AlbertForPretrainingOutput( return AlbertForPreTrainingOutput(
loss=total_loss, loss=total_loss,
prediction_logits=prediction_scores, prediction_logits=prediction_scores,
sop_logits=sop_scores, sop_logits=sop_scores,

View File

@@ -586,9 +586,9 @@ class BertPreTrainedModel(PreTrainedModel):
@dataclass @dataclass
class BertForPretrainingOutput(ModelOutput): class BertForPreTrainingOutput(ModelOutput):
""" """
Output type of :class:`~transformers.BertForPretrainingModel`. Output type of :class:`~transformers.BertForPreTrainingModel`.
Args: Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
@@ -837,7 +837,7 @@ class BertForPreTraining(BertPreTrainedModel):
return self.cls.predictions.decoder return self.cls.predictions.decoder
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=BertForPretrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
@@ -918,7 +918,7 @@ class BertForPreTraining(BertPreTrainedModel):
output = (prediction_scores, seq_relationship_score) + outputs[2:] output = (prediction_scores, seq_relationship_score) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
return BertForPretrainingOutput( return BertForPreTrainingOutput(
loss=total_loss, loss=total_loss,
prediction_logits=prediction_scores, prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score, seq_relationship_logits=seq_relationship_score,

View File

@@ -188,9 +188,9 @@ class ElectraPreTrainedModel(BertPreTrainedModel):
@dataclass @dataclass
class ElectraForPretrainingOutput(ModelOutput): class ElectraForPreTrainingOutput(ModelOutput):
""" """
Output type of :class:`~transformers.ElectraForPretrainingModel`. Output type of :class:`~transformers.ElectraForPreTrainingModel`.
Args: Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
@@ -496,7 +496,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ElectraForPretrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
@@ -562,7 +562,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
output = (logits,) + discriminator_hidden_states[1:] output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return ElectraForPretrainingOutput( return ElectraForPreTrainingOutput(
loss=loss, loss=loss,
logits=logits, logits=logits,
hidden_states=discriminator_hidden_states.hidden_states, hidden_states=discriminator_hidden_states.hidden_states,
@@ -850,7 +850,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""ELECTRA Model with a multiple choice classification head on top (a linear layer on top of """ELECTRA Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ELECTRA_INPUTS_DOCSTRING, ELECTRA_START_DOCSTRING,
) )
class ElectraForMultipleChoice(ElectraPreTrainedModel): class ElectraForMultipleChoice(ElectraPreTrainedModel):
def __init__(self, config): def __init__(self, config):

View File

@@ -685,9 +685,9 @@ class MobileBertPreTrainedModel(PreTrainedModel):
@dataclass @dataclass
class MobileBertForPretrainingOutput(ModelOutput): class MobileBertForPreTrainingOutput(ModelOutput):
""" """
Output type of :class:`~transformers.MobileBertForPretrainingModel`. Output type of :class:`~transformers.MobileBertForPreTrainingModel`.
Args: Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
@@ -948,7 +948,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MobileBertForPretrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=MobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
@@ -1018,7 +1018,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
output = (prediction_scores, seq_relationship_score) + outputs[2:] output = (prediction_scores, seq_relationship_score) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
return MobileBertForPretrainingOutput( return MobileBertForPreTrainingOutput(
loss=total_loss, loss=total_loss,
prediction_logits=prediction_scores, prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score, seq_relationship_logits=seq_relationship_score,

View File

@@ -973,7 +973,7 @@ class T5Model(T5PreTrainedModel):
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict, return_dict=return_dict,
) )
elif not return_dict and not isinstance(encoder_outputs, BaseModelOutput): elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput( encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0], last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,

View File

@@ -17,17 +17,30 @@
import logging import logging
from dataclasses import dataclass
from typing import Optional, Tuple
import tensorflow as tf import tensorflow as tf
from .configuration_albert import AlbertConfig from .configuration_albert import AlbertConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
replace_return_docstrings,
) )
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
@@ -44,6 +57,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "AlbertConfig"
_TOKENIZER_FOR_DOC = "AlbertTokenizer" _TOKENIZER_FOR_DOC = "AlbertTokenizer"
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -414,12 +428,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
for i in range(config.num_hidden_groups) for i in range(config.num_hidden_groups)
] ]
def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False): def call(
self,
hidden_states,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=False,
):
hidden_states = self.embedding_hidden_mapping_in(hidden_states) hidden_states = self.embedding_hidden_mapping_in(hidden_states)
all_attentions = () all_attentions = () if output_attentions else None
all_hidden_states = (hidden_states,) if output_hidden_states else None
if output_hidden_states:
all_hidden_states = (hidden_states,)
for i in range(self.config.num_hidden_layers): for i in range(self.config.num_hidden_layers):
# Number of layers in a hidden group # Number of layers in a hidden group
@@ -444,14 +465,11 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) if not return_dict:
if output_hidden_states: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
outputs = outputs + (all_hidden_states,) return TFBaseModelOutput(
if output_attentions: last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
outputs = outputs + (all_attentions,) )
# last-layer hidden state, (all hidden states), (all attentions)
return outputs
class TFAlbertPreTrainedModel(TFPreTrainedModel): class TFAlbertPreTrainedModel(TFPreTrainedModel):
@@ -506,6 +524,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder") self.encoder = TFAlbertTransformer(config, name="encoder")
@@ -543,6 +562,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -554,7 +574,8 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -564,12 +585,14 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -619,16 +642,52 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
head_mask, head_mask,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict,
training=training, training=training,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output[:, 0]) pooled_output = self.pooler(sequence_output[:, 0])
# add hidden_states and attentions if they are here if not return_dict:
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] return (sequence_output, pooled_output,) + encoder_outputs[1:]
# sequence_output, pooled_output, (hidden_states), (attentions)
return outputs return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFAlbertForPreTrainingModel`.
Args:
prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
prediction_logits: tf.Tensor = None
sop_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
ALBERT_START_DOCSTRING = r""" ALBERT_START_DOCSTRING = r"""
@@ -707,6 +766,11 @@ ALBERT_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -720,32 +784,13 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.albert = TFAlbertMainLayer(config, name="albert") self.albert = TFAlbertMainLayer(config, name="albert")
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="albert-base-v2",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Albert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.albert(inputs, **kwargs) outputs = self.albert(inputs, **kwargs)
return outputs return outputs
@@ -768,25 +813,10 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
return self.albert.embeddings return self.albert.embeddings
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
@@ -797,13 +827,22 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
outputs = model(input_ids) outputs = model(input_ids)
prediction_scores, sop_scores = outputs[:2] prediction_scores, sop_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.albert.return_dict
outputs = self.albert(inputs, **kwargs) outputs = self.albert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2] sequence_output, pooled_output = outputs[:2]
prediction_scores = self.predictions(sequence_output) prediction_scores = self.predictions(sequence_output)
sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False)) sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False))
outputs = (prediction_scores, sop_scores) + outputs[2:]
return outputs if not return_dict:
return (prediction_scores, sop_scores) + outputs[2:]
return TFAlbertForPreTrainingOutput(
prediction_logits=prediction_scores,
sop_logits=sop_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class TFAlbertSOPHead(tf.keras.layers.Layer): class TFAlbertSOPHead(tf.keras.layers.Layer):
@@ -833,7 +872,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
return self.albert.embeddings return self.albert.embeddings
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="albert-base-v2",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -844,6 +888,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -853,27 +898,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.albert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -886,20 +916,22 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.predictions(sequence_output, training=training) prediction_scores = self.predictions(sequence_output, training=training)
# Add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, prediction_scores)
outputs = (prediction_scores,) + outputs[2:]
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, prediction_scores) output = (prediction_scores,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # prediction_scores, (hidden_states), (attentions) return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -919,7 +951,12 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
) )
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="albert-base-v2",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -930,6 +967,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -939,27 +977,12 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`)
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.albert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -972,6 +995,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -980,13 +1004,15 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1006,7 +1032,12 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
) )
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="albert-base-v2",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1017,6 +1048,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1024,27 +1056,12 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.albert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1057,6 +1074,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1065,13 +1083,15 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1089,7 +1109,12 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
) )
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="albert-base-v2",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1100,6 +1125,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -1113,30 +1139,13 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.albert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1150,6 +1159,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1160,15 +1170,23 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + outputs[2:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1196,7 +1214,12 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="albert-base-v2",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -1207,6 +1230,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1215,24 +1239,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -1243,8 +1249,9 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1254,10 +1261,12 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_attentions)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.albert.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -1280,6 +1289,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1289,10 +1299,12 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, reshaped_logits) output = (reshaped_logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)

View File

@@ -17,6 +17,8 @@
import logging import logging
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@@ -24,9 +26,22 @@ import tensorflow as tf
from .configuration_bert import BertConfig from .configuration_bert import BertConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFCausalLMOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFNextSentencePredictorOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
) )
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
@@ -45,6 +60,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "BertConfig"
_TOKENIZER_FOR_DOC = "BertTokenizer" _TOKENIZER_FOR_DOC = "BertTokenizer"
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -389,9 +405,18 @@ class TFBertEncoder(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False): def call(
all_hidden_states = () self,
all_attentions = () hidden_states,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if output_hidden_states: if output_hidden_states:
@@ -409,15 +434,11 @@ class TFBertEncoder(tf.keras.layers.Layer):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
if output_hidden_states: return TFBaseModelOutput(
outputs = outputs + (all_hidden_states,) last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
if output_attentions:
outputs = outputs + (all_attentions,)
return outputs # outputs, (hidden states), (attentions)
class TFBertPooler(tf.keras.layers.Layer): class TFBertPooler(tf.keras.layers.Layer):
@@ -517,6 +538,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFBertEmbeddings(config, name="embeddings") self.embeddings = TFBertEmbeddings(config, name="embeddings")
self.encoder = TFBertEncoder(config, name="encoder") self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler") self.pooler = TFBertPooler(config, name="pooler")
@@ -545,6 +567,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -556,7 +579,8 @@ class TFBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -566,12 +590,14 @@ class TFBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -621,16 +647,22 @@ class TFBertMainLayer(tf.keras.layers.Layer):
head_mask, head_mask,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict,
training=training, training=training,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions) if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class TFBertPreTrainedModel(TFPreTrainedModel): class TFBertPreTrainedModel(TFPreTrainedModel):
@@ -642,6 +674,36 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "bert" base_model_prefix = "bert"
@dataclass
class TFBertForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFBertForPreTrainingModel`.
Args:
prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
prediction_logits: tf.Tensor = None
seq_relationship_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
BERT_START_DOCSTRING = r""" BERT_START_DOCSTRING = r"""
This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class. This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
Use it as a regular TF 2.0 Keras Model and Use it as a regular TF 2.0 Keras Model and
@@ -712,6 +774,11 @@ BERT_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -725,32 +792,13 @@ class TFBertModel(TFBertPreTrainedModel):
self.bert = TFBertMainLayer(config, name="bert") self.bert = TFBertMainLayer(config, name="bert")
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Bert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.bert(inputs, **kwargs) outputs = self.bert(inputs, **kwargs)
return outputs return outputs
@@ -772,25 +820,10 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
return self.bert.embeddings return self.bert.embeddings
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -804,17 +837,23 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
prediction_scores, seq_relationship_scores = outputs[:2] prediction_scores, seq_relationship_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.bert.return_dict
outputs = self.bert(inputs, **kwargs) outputs = self.bert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2] sequence_output, pooled_output = outputs[:2]
prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
seq_relationship_score = self.nsp(pooled_output) seq_relationship_score = self.nsp(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[ if not return_dict:
2: return (prediction_scores, seq_relationship_score) + outputs[2:]
] # add hidden states and attention if they are here
return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) return TFBertForPreTrainingOutput(
prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
@@ -832,7 +871,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
return self.bert.embeddings return self.bert.embeddings
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -843,6 +887,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -852,27 +897,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.bert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -885,19 +915,22 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=training) prediction_scores = self.mlm(sequence_output, training=training)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, prediction_scores)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, prediction_scores) output = (prediction_scores,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), prediction_scores, (hidden_states), (attentions) return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
@@ -911,7 +944,12 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
def get_output_embeddings(self): def get_output_embeddings(self):
return self.bert.embeddings return self.bert.embeddings
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -922,6 +960,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -929,27 +968,12 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss. Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.bert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -962,21 +986,27 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.mlm(sequence_output, training=training) logits = self.mlm(sequence_output, training=training)
outputs = (logits,) + outputs[2:] # Add hidden states and attention if they are here loss = None
if labels is not None: if labels is not None:
# shift labels to the left and cut last logit token # shift labels to the left and cut last logit token
logits = logits[:, :-1] logits = logits[:, :-1]
labels = labels[:, 1:] labels = labels[:, 1:]
loss = self.compute_loss(labels, logits) loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # prediction_scores, (hidden_states), (attentions) if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -990,23 +1020,10 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
self.nsp = TFBertNSPHead(config, name="nsp___cls") self.nsp = TFBertNSPHead(config, name="nsp___cls")
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`)
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -1023,14 +1040,19 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random assert logits[0][0] < logits[0][1] # the next sentence was random
""" """
return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.bert.return_dict
outputs = self.bert(inputs, **kwargs) outputs = self.bert(inputs, **kwargs)
pooled_output = outputs[1] pooled_output = outputs[1]
seq_relationship_score = self.nsp(pooled_output) seq_relationship_score = self.nsp(pooled_output)
outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if not return_dict:
return (seq_relationship_score,) + outputs[2:]
return outputs # seq_relationship_score, (hidden_states), (attentions) return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1050,7 +1072,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1061,6 +1088,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1070,27 +1098,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.bert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1103,6 +1116,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1111,13 +1125,15 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1145,7 +1161,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -1156,6 +1177,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1164,24 +1186,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -1192,8 +1196,9 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1203,10 +1208,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.bert.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -1233,19 +1240,23 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1265,7 +1276,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1276,6 +1292,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1283,27 +1300,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.bert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1316,6 +1318,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1324,13 +1327,15 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1349,7 +1354,12 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="bert-base-cased",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1360,6 +1370,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -1373,30 +1384,13 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.bert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1410,6 +1404,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1420,12 +1415,20 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + outputs[2:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

View File

@@ -62,8 +62,6 @@ CAMEMBERT_START_DOCSTRING = r"""
config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration. model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
""" """

View File

@@ -23,6 +23,7 @@ import tensorflow as tf
from .configuration_ctrl import CTRLConfig from .configuration_ctrl import CTRLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
@@ -35,7 +36,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CtrlTokenizer" _CONFIG_FOR_DOC = "CTRLConfig"
_TOKENIZER_FOR_DOC = "CTRLTokenizer"
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl" "ctrl"
@@ -207,6 +209,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.use_cache = config.use_cache self.use_cache = config.use_cache
self.return_dict = config.use_return_dict
self.d_model_size = config.n_embd self.d_model_size = config.n_embd
self.num_layers = config.n_layer self.num_layers = config.n_layer
@@ -260,6 +263,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
@@ -274,7 +278,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
use_cache = inputs[7] if len(inputs) > 7 else use_cache use_cache = inputs[7] if len(inputs) > 7 else use_cache
output_attentions = inputs[8] if len(inputs) > 8 else output_attentions output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs." return_dict = inputs[10] if len(inputs) > 10 else return_dict
assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
past = inputs.get("past", past) past = inputs.get("past", past)
@@ -286,13 +291,15 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 10, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 11, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
use_cache = use_cache if use_cache is not None else self.use_cache use_cache = use_cache if use_cache is not None else self.use_cache
return_dict = return_dict if return_dict is not None else self.return_dict
# If using past key value states, only the last tokens # If using past key value states, only the last tokens
# should be given as an input # should be given as an input
@@ -374,9 +381,9 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.dropout(hidden_states, training=training)
output_shape = input_shape + [shape_list(hidden_states)[-1]] output_shape = input_shape + [shape_list(hidden_states)[-1]]
presents = () presents = () if use_cache else None
all_hidden_states = () all_hidden_states = () if output_hidden_states else None
all_attentions = [] all_attentions = () if output_attentions else None
for i, (h, layer_past) in enumerate(zip(self.h, past)): for i, (h, layer_past) in enumerate(zip(self.h, past)):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
@@ -396,24 +403,27 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
presents = presents + (present,) presents = presents + (present,)
if output_attentions: if output_attentions:
all_attentions.append(outputs[2]) all_attentions = all_attentions + (outputs[2],)
hidden_states = self.layernorm(hidden_states) hidden_states = self.layernorm(hidden_states)
hidden_states = tf.reshape(hidden_states, output_shape) hidden_states = tf.reshape(hidden_states, output_shape)
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if use_cache:
outputs = outputs + (presents,)
if output_hidden_states:
outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs if not return_dict:
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=presents,
hidden_states=all_hidden_states,
attentions=all_attentions,
)
class TFCTRLPreTrainedModel(TFPreTrainedModel): class TFCTRLPreTrainedModel(TFPreTrainedModel):
@@ -503,6 +513,11 @@ CTRL_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -516,29 +531,13 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer = TFCTRLMainLayer(config, name="transformer") self.transformer = TFCTRLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="ctrl",
output_type=TFBaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -585,7 +584,12 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="ctrl",
output_type=TFCausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -598,6 +602,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -605,31 +610,12 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss. Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[10] if len(inputs) > 10 else labels labels = inputs[11] if len(inputs) > 11 else labels
if len(inputs) > 10: if len(inputs) > 11:
inputs = inputs[:10] inputs = inputs[:11]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -644,6 +630,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -651,12 +638,21 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
logits = self.lm_head(hidden_states) logits = self.lm_head(hidden_states)
outputs = (logits,) + transformer_outputs[1:] loss = None
if labels is not None: if labels is not None:
# shift labels to the left and cut last logit token # shift labels to the left and cut last logit token
logits = logits[:, :-1] logits = logits[:, :-1]
labels = labels[:, 1:] labels = labels[:, 1:]
loss = self.compute_loss(labels, logits) loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # lm_logits, presents, (all hidden_states), (attentions) if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

View File

@@ -29,6 +29,14 @@ from .file_utils import (
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
@@ -46,6 +54,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "DistilBertConfig"
_TOKENIZER_FOR_DOC = "DistilBertTokenizer" _TOKENIZER_FOR_DOC = "DistilBertTokenizer"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -359,7 +368,7 @@ class TFTransformer(tf.keras.layers.Layer):
self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, training=False): def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False):
""" """
Parameters Parameters
---------- ----------
@@ -379,8 +388,8 @@ class TFTransformer(tf.keras.layers.Layer):
Tuple of length n_layers with the attention weights from each layer Tuple of length n_layers with the attention weights from each layer
Optional: only if output_attentions=True Optional: only if output_attentions=True
""" """
all_hidden_states = () all_hidden_states = () if output_hidden_states else None
all_attentions = () all_attentions = () if output_attentions else None
hidden_state = x hidden_state = x
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
@@ -401,12 +410,11 @@ class TFTransformer(tf.keras.layers.Layer):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,) all_hidden_states = all_hidden_states + (hidden_state,)
outputs = (hidden_state,) if not return_dict:
if output_hidden_states: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
outputs = outputs + (all_hidden_states,) return TFBaseModelOutput(
if output_attentions: last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
outputs = outputs + (all_attentions,) )
return outputs # last-layer hidden state, (all hidden states), (all attentions)
@keras_serializable @keras_serializable
@@ -418,6 +426,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
self.transformer = TFTransformer(config, name="transformer") # Encoder self.transformer = TFTransformer(config, name="transformer") # Encoder
@@ -440,6 +449,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -449,7 +459,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs[6] if len(inputs) > 6 else return_dict
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -457,12 +468,14 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 7, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -491,7 +504,13 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim)
tfmr_output = self.transformer( tfmr_output = self.transformer(
embedding_output, attention_mask, head_mask, output_attentions, output_hidden_states, training=training embedding_output,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=training,
) )
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
@@ -564,9 +583,13 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
training (:obj:`boolean`, `optional`, defaults to :obj:`False`): training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -580,25 +603,13 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="distilbert-base-uncased",
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.distilbert(inputs, **kwargs) outputs = self.distilbert(inputs, **kwargs)
return outputs return outputs
@@ -642,7 +653,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
return self.vocab_projector.input_embeddings return self.vocab_projector.input_embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="distilbert-base-uncased",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -651,6 +667,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -660,27 +677,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.distilbert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels labels = inputs[7] if len(inputs) > 7 else labels
if len(inputs) > 6: if len(inputs) > 7:
inputs = inputs[:6] inputs = inputs[:7]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -691,6 +693,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -700,13 +703,18 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_projector(prediction_logits) prediction_logits = self.vocab_projector(prediction_logits)
outputs = (prediction_logits,) + distilbert_output[1:] loss = None if labels is None else self.compute_loss(labels, prediction_logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, prediction_logits) output = (prediction_logits,) + distilbert_output[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # logits, (hidden_states), (attentions) return TFMaskedLMOutput(
loss=loss,
logits=prediction_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -732,7 +740,12 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="distilbert-base-uncased",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -741,6 +754,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -750,27 +764,12 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.distilbert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels labels = inputs[7] if len(inputs) > 7 else labels
if len(inputs) > 6: if len(inputs) > 7:
inputs = inputs[:6] inputs = inputs[:7]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -781,6 +780,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -790,13 +790,18 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) pooled_output = self.dropout(pooled_output, training=training) # (bs, dim)
logits = self.classifier(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim)
outputs = (logits,) + distilbert_output[1:] loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + distilbert_output[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -816,7 +821,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
) )
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="distilbert-base-uncased",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -825,6 +835,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -832,27 +843,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.distilbert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels labels = inputs[7] if len(inputs) > 7 else labels
if len(inputs) > 6: if len(inputs) > 7:
inputs = inputs[:6] inputs = inputs[:7]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -863,6 +859,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -871,13 +868,15 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[1:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -911,7 +910,12 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="distilbert-base-uncased",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -920,6 +924,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -928,24 +933,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -954,8 +941,9 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
labels = inputs[6] if len(inputs) > 6 else labels return_dict = inputs[6] if len(inputs) > 6 else return_dict
assert len(inputs) <= 7, "Too many inputs." labels = inputs[7] if len(inputs) > 7 else labels
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -963,10 +951,12 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 7, "Too many inputs." assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.distilbert.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -989,6 +979,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
hidden_state = distilbert_output[0] # (bs, seq_len, dim) hidden_state = distilbert_output[0] # (bs, seq_len, dim)
@@ -997,13 +988,19 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) pooled_output = self.dropout(pooled_output, training=training) # (bs, dim)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + distilbert_output[1:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) if not return_dict:
output = (reshaped_logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1023,7 +1020,12 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
self.dropout = tf.keras.layers.Dropout(config.qa_dropout) self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="distilbert-base-uncased",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1032,6 +1034,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -1045,30 +1048,13 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.distilbert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[6] if len(inputs) > 6 else start_positions start_positions = inputs[7] if len(inputs) > 7 else start_positions
end_positions = inputs[7] if len(inputs) > 7 else end_positions end_positions = inputs[8] if len(inputs) > 8 else end_positions
if len(inputs) > 6: if len(inputs) > 7:
inputs = inputs[:6] inputs = inputs[:7]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1080,6 +1066,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1090,12 +1077,20 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + distilbert_output[1:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)

View File

@@ -1,4 +1,6 @@
import logging import logging
from dataclasses import dataclass
from typing import Optional, Tuple
import tensorflow as tf import tensorflow as tf
@@ -6,11 +8,21 @@ from transformers import ElectraConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
replace_return_docstrings,
) )
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
@@ -27,8 +39,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
_CONFIG_FOR_DOC = "ElectraConfig" _CONFIG_FOR_DOC = "ElectraConfig"
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator", "google/electra-small-generator",
@@ -254,6 +266,7 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -265,7 +278,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -275,7 +289,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
@@ -283,6 +298,7 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -312,12 +328,41 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
head_mask, head_mask,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict,
training=training, training=training,
) )
return hidden_states return hidden_states
@dataclass
class TFElectraForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFElectraForPreTrainingModel`.
Args:
loss (`optional`, returned when ``labels`` is provided, ``tf.Tensor`` of shape :obj:`(1,)`):
Total loss of the ELECTRA objective.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
ELECTRA_START_DOCSTRING = r""" ELECTRA_START_DOCSTRING = r"""
This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class. This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
Use it as a regular TF 2.0 Keras Model and Use it as a regular TF 2.0 Keras Model and
@@ -380,9 +425,13 @@ ELECTRA_INPUTS_DOCSTRING = r"""
training (:obj:`boolean`, `optional`, defaults to :obj:`False`): training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -400,25 +449,13 @@ class TFElectraModel(TFElectraPreTrainedModel):
self.electra = TFElectraMainLayer(config, name="electra") self.electra = TFElectraMainLayer(config, name="electra")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/electra-small-discriminator",
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.electra(inputs, **kwargs) outputs = self.electra(inputs, **kwargs)
return outputs return outputs
@@ -439,6 +476,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
input_ids, input_ids,
@@ -449,24 +487,11 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -479,6 +504,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
outputs = model(input_ids) outputs = model(input_ids)
scores = outputs[0] scores = outputs[0]
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
input_ids, input_ids,
@@ -489,14 +515,20 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
logits = self.discriminator_predictions(discriminator_sequence_output) logits = self.discriminator_predictions(discriminator_sequence_output)
output = (logits,)
output += discriminator_hidden_states[1:]
return output # (loss), scores, (hidden_states), (attentions) if not return_dict:
return (logits,) + discriminator_hidden_states[1:]
return TFElectraForPreTrainingOutput(
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
)
class TFElectraMaskedLMHead(tf.keras.layers.Layer): class TFElectraMaskedLMHead(tf.keras.layers.Layer):
@@ -539,7 +571,12 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
return self.generator_lm_head return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/electra-small-generator",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
input_ids, input_ids,
@@ -550,6 +587,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -559,27 +597,12 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
if isinstance(input_ids, (tuple, list)): if isinstance(input_ids, (tuple, list)):
labels = input_ids[8] if len(input_ids) > 8 else labels labels = input_ids[9] if len(input_ids) > 9 else labels
if len(input_ids) > 8: if len(input_ids) > 9:
input_ids = input_ids[:8] input_ids = input_ids[:9]
elif isinstance(input_ids, (dict, BatchEncoding)): elif isinstance(input_ids, (dict, BatchEncoding)):
labels = input_ids.pop("labels", labels) labels = input_ids.pop("labels", labels)
@@ -592,19 +615,25 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
inputs_embeds, inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
generator_sequence_output = generator_hidden_states[0] generator_sequence_output = generator_hidden_states[0]
prediction_scores = self.generator_predictions(generator_sequence_output, training=training) prediction_scores = self.generator_predictions(generator_sequence_output, training=training)
prediction_scores = self.generator_lm_head(prediction_scores, training=training) prediction_scores = self.generator_lm_head(prediction_scores, training=training)
output = (prediction_scores,)
output += generator_hidden_states[1:]
if labels is not None: loss = None if labels is None else self.compute_loss(labels, prediction_scores)
loss = self.compute_loss(labels, prediction_scores)
output = (loss,) + output
return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) if not return_dict:
output = (prediction_scores,) + generator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=generator_hidden_states.hidden_states,
attentions=generator_hidden_states.attentions,
)
class TFElectraClassificationHead(tf.keras.layers.Layer): class TFElectraClassificationHead(tf.keras.layers.Layer):
@@ -647,6 +676,7 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/electra-small-discriminator", checkpoint="google/electra-small-discriminator",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
def call( def call(
@@ -659,23 +689,25 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
r""" r"""
Returns: labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: Labels for computing the sequence classification/regression loss.
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
Classification (or regression if config.num_labels==1) scores (before SoftMax). If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
if isinstance(input_ids, (tuple, list)):
labels = input_ids[9] if len(input_ids) > 9 else labels
if len(input_ids) > 9:
input_ids = input_ids[:9]
elif isinstance(input_ids, (dict, BatchEncoding)):
labels = input_ids.pop("labels", labels)
outputs = self.electra( outputs = self.electra(
input_ids, input_ids,
attention_mask, attention_mask,
@@ -685,16 +717,20 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
logits = self.classifier(outputs[0]) logits = self.classifier(outputs[0])
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, logits)
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions) if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -724,7 +760,12 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/electra-small-discriminator",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -735,6 +776,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -743,24 +785,6 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -771,8 +795,9 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -782,10 +807,12 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -812,18 +839,22 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
logits = self.sequence_summary(outputs[0]) logits = self.sequence_summary(outputs[0])
logits = self.classifier(logits) logits = self.classifier(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -843,7 +874,12 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
) )
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/electra-small-discriminator",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -854,6 +890,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -861,27 +898,12 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -894,19 +916,25 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
discriminator_sequence_output = self.dropout(discriminator_sequence_output) discriminator_sequence_output = self.dropout(discriminator_sequence_output)
logits = self.classifier(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output)
outputs = (logits,) + discriminator_hidden_states[1:] loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + discriminator_hidden_states[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), scores, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -925,7 +953,12 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
) )
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/electra-small-discriminator",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -936,6 +969,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -949,30 +983,13 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -986,6 +1003,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
@@ -995,12 +1013,20 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + discriminator_hidden_states[1:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
)

View File

@@ -22,6 +22,7 @@ import tensorflow as tf
from .configuration_flaubert import FlaubertConfig from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .modeling_tf_outputs import TFBaseModelOutput
from .modeling_tf_utils import keras_serializable, shape_list from .modeling_tf_utils import keras_serializable, shape_list
from .modeling_tf_xlm import ( from .modeling_tf_xlm import (
TFXLMForMultipleChoice, TFXLMForMultipleChoice,
@@ -103,6 +104,11 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -126,6 +132,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
self.pre_norm = getattr(config, "pre_norm", False) self.pre_norm = getattr(config, "pre_norm", False)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
def call( def call(
self, self,
@@ -140,6 +147,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
# removed: src_enc=None, src_len=None # removed: src_enc=None, src_len=None
@@ -155,7 +163,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
assert len(inputs) <= 11, "Too many inputs." return_dict = inputs[11] if len(inputs) > 11 else return_dict
assert len(inputs) <= 12, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -168,12 +177,14 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 11, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 12, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -260,8 +271,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
tensor = tensor * mask[..., tf.newaxis] tensor = tensor * mask[..., tf.newaxis]
# transformer layers # transformer layers
hidden_states = () hidden_states = () if output_hidden_states else None
attentions = () attentions = () if output_attentions else None
for i in range(self.n_layers): for i in range(self.n_layers):
# LayerDrop # LayerDrop
dropout_probability = random.uniform(0, 1) dropout_probability = random.uniform(0, 1)
@@ -321,12 +332,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
# move back sequence length to dimension 0 # move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1) # tensor = tensor.transpose(0, 1)
outputs = (tensor,) if not return_dict:
if output_hidden_states: return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
outputs = outputs + (hidden_states,) return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
if output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(

View File

@@ -17,12 +17,21 @@
import logging import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_gpt2 import GPT2Config from .configuration_gpt2 import GPT2Config
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFConv1D, TFConv1D,
@@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "GPT2Config"
_TOKENIZER_FOR_DOC = "GPT2Tokenizer" _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -214,12 +224,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.use_cache = config.use_cache self.use_cache = config.use_cache
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
self.output_hidden_states = self.output_hidden_states
self.output_attentions = self.output_attentions
self.wte = TFSharedEmbeddings( self.wte = TFSharedEmbeddings(
config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
@@ -259,6 +268,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -272,7 +282,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
use_cache = inputs[7] if len(inputs) > 7 else use_cache use_cache = inputs[7] if len(inputs) > 7 else use_cache
output_attentions = inputs[8] if len(inputs) > 8 else output_attentions output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs." return_dict = inputs[10] if len(inputs) > 10 else return_dict
assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
past = inputs.get("past", past) past = inputs.get("past", past)
@@ -284,13 +295,15 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 10, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 11, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
use_cache = use_cache if use_cache is not None else self.use_cache use_cache = use_cache if use_cache is not None else self.use_cache
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -355,9 +368,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
output_shape = input_shape + [shape_list(hidden_states)[-1]] output_shape = input_shape + [shape_list(hidden_states)[-1]]
presents = () presents = () if use_cache else None
all_attentions = [] all_attentions = () if output_attentions else None
all_hidden_states = () all_hidden_states = () if output_hidden_states else None
for i, (block, layer_past) in enumerate(zip(self.h, past)): for i, (block, layer_past) in enumerate(zip(self.h, past)):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
@@ -373,10 +386,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
) )
hidden_states, present = outputs[:2] hidden_states, present = outputs[:2]
presents = presents + (present,) if use_cache:
presents = presents + (present,)
if output_attentions: if output_attentions:
all_attentions.append(outputs[2]) all_attentions = all_attentions + (outputs[2],)
hidden_states = self.ln_f(hidden_states) hidden_states = self.ln_f(hidden_states)
@@ -385,18 +399,20 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if use_cache:
outputs = outputs + (presents,)
if output_hidden_states:
outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs # last hidden state, presents, (all hidden_states), (attentions) if not return_dict:
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=presents,
hidden_states=all_hidden_states,
attentions=all_attentions,
)
class TFGPT2PreTrainedModel(TFPreTrainedModel): class TFGPT2PreTrainedModel(TFPreTrainedModel):
@@ -408,6 +424,42 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer" base_model_prefix = "transformer"
@dataclass
class TFGPT2DoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
``past_key_values`` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
lm_logits: tf.Tensor = None
mc_logits: tf.Tensor = None
past_key_values: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
GPT2_START_DOCSTRING = r""" GPT2_START_DOCSTRING = r"""
.. note:: .. note::
@@ -482,6 +534,11 @@ GPT2_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -495,29 +552,13 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
self.transformer = TFGPT2MainLayer(config, name="transformer") self.transformer = TFGPT2MainLayer(config, name="transformer")
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="gpt2",
output_type=TFBaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -543,7 +584,12 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="gpt2",
output_type=TFCausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -556,6 +602,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -563,31 +610,12 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss. Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[10] if len(inputs) > 10 else labels labels = inputs[11] if len(inputs) > 11 else labels
if len(inputs) > 10: if len(inputs) > 11:
inputs = inputs[:10] inputs = inputs[:11]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -602,6 +630,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -609,15 +638,24 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
logits = self.transformer.wte(hidden_states, mode="linear") logits = self.transformer.wte(hidden_states, mode="linear")
outputs = (logits,) + transformer_outputs[1:] loss = None
if labels is not None: if labels is not None:
# shift labels to the left and cut last logit token # shift labels to the left and cut last logit token
logits = logits[:, :-1] logits = logits[:, :-1]
labels = labels[:, 1:] labels = labels[:, 1:]
loss = self.compute_loss(labels, logits) loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # lm_logits, presents, (all hidden_states), (attentions) if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -641,6 +679,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
return self.transformer.wte return self.transformer.wte
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
@@ -654,6 +693,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
r""" r"""
@@ -662,26 +702,6 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as `input_ids` as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -717,8 +737,10 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
use_cache = inputs[8] if len(inputs) > 8 else use_cache use_cache = inputs[8] if len(inputs) > 8 else use_cache
output_attentions = inputs[9] if len(inputs) > 8 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
assert len(inputs) <= 10, "Too many inputs." output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
return_dict = inputs[11] if len(inputs) > 11 else return_dict
assert len(inputs) <= 12, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
past = inputs.get("past", past) past = inputs.get("past", past)
@@ -730,9 +752,12 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 10, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 12, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
input_shapes = shape_list(input_ids) input_shapes = shape_list(input_ids)
@@ -755,6 +780,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
use_cache, use_cache,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
@@ -762,6 +788,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
lm_logits = self.transformer.wte(hidden_states, mode="linear") lm_logits = self.transformer.wte(hidden_states, mode="linear")
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1) mc_logits = tf.squeeze(mc_logits, axis=-1)
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions) if not return_dict:
return (lm_logits, mc_logits) + transformer_outputs[1:]
return TFGPT2DoubleHeadsModelOutput(
lm_logits=lm_logits,
mc_logits=mc_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

View File

@@ -17,17 +17,31 @@
import logging import logging
from dataclasses import dataclass
from typing import Optional, Tuple
import tensorflow as tf import tensorflow as tf
from . import MobileBertConfig from . import MobileBertConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
replace_return_docstrings,
) )
from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFNextSentencePredictorOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
@@ -44,6 +58,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "MobileBertConfig"
_TOKENIZER_FOR_DOC = "MobileBertTokenizer" _TOKENIZER_FOR_DOC = "MobileBertTokenizer"
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -541,9 +556,18 @@ class TFMobileBertEncoder(tf.keras.layers.Layer):
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False): def call(
all_hidden_states = () self,
all_attentions = () hidden_states,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
@@ -561,12 +585,11 @@ class TFMobileBertEncoder(tf.keras.layers.Layer):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) if not return_dict:
if output_hidden_states: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
outputs = outputs + (all_hidden_states,) return TFBaseModelOutput(
if output_attentions: last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
outputs = outputs + (all_attentions,) )
return outputs # outputs, (hidden states), (attentions)
class TFMobileBertPooler(tf.keras.layers.Layer): class TFMobileBertPooler(tf.keras.layers.Layer):
@@ -660,6 +683,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFMobileBertEmbeddings(config, name="embeddings") self.embeddings = TFMobileBertEmbeddings(config, name="embeddings")
self.encoder = TFMobileBertEncoder(config, name="encoder") self.encoder = TFMobileBertEncoder(config, name="encoder")
@@ -688,6 +712,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -699,7 +724,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -709,12 +735,14 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -763,16 +791,22 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
head_mask, head_mask,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict,
training=training, training=training,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[ if not return_dict:
1: return (sequence_output, pooled_output,) + encoder_outputs[1:]
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions) return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class TFMobileBertPreTrainedModel(TFPreTrainedModel): class TFMobileBertPreTrainedModel(TFPreTrainedModel):
@@ -784,6 +818,37 @@ class TFMobileBertPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "mobilebert" base_model_prefix = "mobilebert"
@dataclass
class TFMobileBertForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFMobileBertForPreTrainingModel`.
Args:
prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
prediction_logits: tf.Tensor = None
seq_relationship_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
MOBILEBERT_START_DOCSTRING = r""" MOBILEBERT_START_DOCSTRING = r"""
This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class. This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
Use it as a regular TF 2.0 Keras Model and Use it as a regular TF 2.0 Keras Model and
@@ -852,6 +917,13 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
training (:obj:`boolean`, `optional`, defaults to :obj:`False`): training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -865,32 +937,13 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert") self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during the original Bert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
return outputs return outputs
@@ -911,25 +964,10 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
return self.mobilebert.embeddings return self.mobilebert.embeddings
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -943,16 +981,23 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
>>> prediction_scores, seq_relationship_scores = outputs[:2] >>> prediction_scores, seq_relationship_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2] sequence_output, pooled_output = outputs[:2]
prediction_scores = self.predictions(sequence_output) prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output) seq_relationship_score = self.seq_relationship(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) if not return_dict:
return (prediction_scores, seq_relationship_score) + outputs[2:]
return TFMobileBertForPreTrainingOutput(
prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING) @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING)
@@ -967,7 +1012,12 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
return self.mobilebert.embeddings return self.mobilebert.embeddings
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -978,6 +1028,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -986,27 +1037,12 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1019,18 +1055,22 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=training) prediction_scores = self.mlm(sequence_output, training=training)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, prediction_scores)
if labels is not None:
loss = self.compute_loss(labels, prediction_scores)
outputs = (loss,) + outputs
return outputs # (loss), prediction_scores, (hidden_states), (attentions) if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer):
@@ -1055,23 +1095,10 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls") self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`)
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -1087,14 +1114,19 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
""" """
return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
pooled_output = outputs[1] pooled_output = outputs[1]
seq_relationship_score = self.cls(pooled_output) seq_relationship_score = self.cls(pooled_output)
outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if not return_dict:
return (seq_relationship_score,) + outputs[2:]
return outputs # seq_relationship_score, (hidden_states), (attentions) return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1114,7 +1146,12 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
) )
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1125,6 +1162,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1134,27 +1172,12 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1167,6 +1190,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1175,13 +1199,15 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1200,7 +1226,12 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
) )
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1211,6 +1242,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -1224,30 +1256,13 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1261,6 +1276,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1271,15 +1287,23 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + outputs[2:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1307,7 +1331,12 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -1318,6 +1347,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1326,24 +1356,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -1354,8 +1366,9 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1365,10 +1378,12 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -1395,19 +1410,23 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1427,7 +1446,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
) )
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1438,6 +1462,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1445,27 +1470,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1478,6 +1488,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1486,10 +1497,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)

View File

@@ -17,12 +17,21 @@
import logging import logging
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFConv1D, TFConv1D,
@@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "OpenAIGPTConfig"
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer" _TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -208,6 +218,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
super().__init__(*inputs, **kwargs) super().__init__(*inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
@@ -247,6 +258,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -258,7 +270,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -268,12 +281,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -333,8 +348,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
output_shape = input_shape + [shape_list(hidden_states)[-1]] output_shape = input_shape + [shape_list(hidden_states)[-1]]
all_attentions = [] all_attentions = () if output_attentions else None
all_hidden_states = () all_hidden_states = () if output_hidden_states else None
for i, block in enumerate(self.h): for i, block in enumerate(self.h):
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
@@ -342,22 +357,24 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions, training=training) outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions, training=training)
hidden_states = outputs[0] hidden_states = outputs[0]
if output_attentions: if output_attentions:
all_attentions.append(outputs[1]) all_attentions = all_attentions + (outputs[1],)
hidden_states = tf.reshape(hidden_states, output_shape) hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state # Add last hidden state
if output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if output_hidden_states:
outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs # last hidden state, (all hidden_states), (attentions) if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
)
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
@@ -369,6 +386,35 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer" base_model_prefix = "transformer"
@dataclass
class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
lm_logits: tf.Tensor = None
mc_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
OPENAI_GPT_START_DOCSTRING = r""" OPENAI_GPT_START_DOCSTRING = r"""
.. note:: .. note::
@@ -436,6 +482,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -449,25 +500,13 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="openai-gpt",
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -486,7 +525,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
return self.transformer.tokens_embed return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="openai-gpt",
output_type=TFCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -497,6 +541,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -504,27 +549,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss. Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -537,21 +567,30 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
logits = self.transformer.tokens_embed(hidden_states, mode="linear") logits = self.transformer.tokens_embed(hidden_states, mode="linear")
outputs = (logits,) + transformer_outputs[1:]
loss = None
if labels is not None: if labels is not None:
# shift labels to the left and cut last logit token # shift labels to the left and cut last logit token
logits = logits[:, :-1] logits = logits[:, :-1]
labels = labels[:, 1:] labels = labels[:, 1:]
loss = self.compute_loss(labels, logits) loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # lm_logits, (all hidden_states), (attentions) if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -575,6 +614,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
return self.transformer.tokens_embed return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
@@ -586,6 +626,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids=None, mc_token_ids=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
r""" r"""
@@ -594,27 +635,6 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
Selected in the range ``[0, input_ids.size(-1) - 1]``. Selected in the range ``[0, input_ids.size(-1) - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -646,7 +666,9 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
assert len(inputs) <= 8, "Too many inputs." output_hidden_states = inputs[8] if len(inputs) > 8 else output_hidden_states
return_dict = inputs[9] if len(inputs) > 9 else return_dict
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -656,9 +678,12 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 8, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
input_shapes = shape_list(input_ids) input_shapes = shape_list(input_ids)
@@ -679,6 +704,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
@@ -686,6 +712,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1) mc_logits = tf.squeeze(mc_logits, axis=-1)
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
return outputs # lm logits, mc logits, (all hidden_states), (attentions) if not return_dict:
return (lm_logits, mc_logits) + transformer_outputs[1:]
return TFOpenAIGPTDoubleHeadsModelOutput(
lm_logits=lm_logits,
mc_logits=mc_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

View File

@@ -0,0 +1,555 @@
from dataclasses import dataclass
from typing import List, Optional, Tuple
import tensorflow as tf
from .file_utils import ModelOutput
@dataclass
class TFBaseModelOutput(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFBaseModelOutputWithPooling(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pretraining.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFBaseModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
``past_key_values`` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
past_key_values: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
If ``decoder_past_key_values`` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
last_hidden_state: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFCausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFCausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
``past_key_values`` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
past_key_values: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFMaskedLMOutput(ModelOutput):
"""
Base class for masked language models outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Masked languaged modeling (MLM) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Languaged modeling loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFNextSentencePredictorOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFMultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence question answering models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[tf.Tensor] = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None

View File

@@ -28,6 +28,14 @@ from .file_utils import (
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_outputs import (
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
@@ -44,6 +52,7 @@ from .tokenization_utils_base import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer" _TOKENIZER_FOR_DOC = "RobertaTokenizer"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -190,6 +199,11 @@ ROBERTA_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -203,32 +217,13 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta") self.roberta = TFRobertaMainLayer(config, name="roberta")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Bert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(inputs, **kwargs)
return outputs return outputs
@@ -276,7 +271,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
return self.lm_head.decoder return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -287,6 +287,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -296,27 +297,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -329,6 +315,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -337,13 +324,15 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output) prediction_scores = self.lm_head(sequence_output)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, prediction_scores)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, prediction_scores) output = (prediction_scores,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), prediction_scores, (hidden_states), (attentions) return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
class TFRobertaClassificationHead(tf.keras.layers.Layer): class TFRobertaClassificationHead(tf.keras.layers.Layer):
@@ -385,7 +374,12 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
self.classifier = TFRobertaClassificationHead(config, name="classifier") self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -396,30 +390,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
r""" r"""
Return: labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: Labels for computing the sequence classification/regression loss.
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
Classification (or regression if config.num_labels==1) scores (before SoftMax). If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -432,19 +418,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=training) logits = self.classifier(sequence_output, training=training)
outputs = (logits,) + outputs[2:] loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -472,7 +461,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -483,6 +477,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -491,24 +486,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -519,8 +496,9 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -530,10 +508,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_attentions) output_hidden_states = inputs.get("output_hidden_states", output_attentions)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -555,19 +535,23 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -587,7 +571,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -598,6 +587,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -605,27 +595,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -638,6 +613,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -646,13 +622,15 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -670,7 +648,12 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -681,6 +664,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -694,30 +678,13 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -731,6 +698,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -741,12 +709,20 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + outputs[2:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

View File

@@ -25,7 +25,14 @@ import warnings
import tensorflow as tf import tensorflow as tf
from .configuration_t5 import T5Config from .configuration_t5 import T5Config
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_outputs import TFSeq2SeqLMOutput, TFSeq2SeqModelOutput
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
@@ -39,6 +46,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "T5Config"
_TOKENIZER_FOR_DOC = "T5Tokenizer" _TOKENIZER_FOR_DOC = "T5Tokenizer"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -575,8 +583,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
head_mask = inputs[5] if len(inputs) > 5 else head_mask head_mask = inputs[5] if len(inputs) > 5 else head_mask
past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states
use_cache = inputs[7] if len(inputs) > 7 else use_cache use_cache = inputs[7] if len(inputs) > 7 else use_cache
output_attentions = inputs[8] if len(inputs) > 7 else output_attentions output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
@@ -934,6 +942,7 @@ class TFT5Model(TFT5PreTrainedModel):
return self.decoder return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
@@ -948,29 +957,11 @@ class TFT5Model(TFT5PreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -996,7 +987,8 @@ class TFT5Model(TFT5PreTrainedModel):
use_cache = inputs[9] if len(inputs) > 9 else use_cache use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
assert len(inputs) <= 12, "Too many inputs." return_dict = inputs[12] if len(inputs) > 12 else return_dict
assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
if "inputs" in inputs: if "inputs" in inputs:
warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.")
@@ -1013,11 +1005,13 @@ class TFT5Model(TFT5PreTrainedModel):
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 12, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 13, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
@@ -1063,12 +1057,40 @@ class TFT5Model(TFT5PreTrainedModel):
], ],
training=training, training=training,
) )
past = (
(encoder_outputs, decoder_outputs[1]) if cast_bool_to_primitive(use_cache, self.config.use_cache) else None
)
if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
return decoder_outputs + encoder_outputs
if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: # If put before, this breaks the tf compilation.
past = ((encoder_outputs, decoder_outputs[1]),) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return decoder_outputs + encoder_outputs # This is long and annoying but if we introduce return_dict at the TFT5MainLayer level (like in PyTorch)
# TF refuses to compile anymore.
if not cast_bool_to_primitive(use_cache, self.config.use_cache):
decoder_outputs = decoder_outputs[:1] + (None,) + decoder_outputs[1:]
if not cast_bool_to_primitive(output_hidden_states, self.config.output_hidden_states):
encoder_outputs = encoder_outputs[:1] + (None,) + encoder_outputs[1:]
decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:]
if not cast_bool_to_primitive(output_attentions, self.config.output_attentions):
encoder_outputs = encoder_outputs + (None,)
decoder_outputs = decoder_outputs + (None,)
return TFSeq2SeqModelOutput(
last_hidden_state=decoder_outputs[0],
decoder_past_key_values=past,
decoder_hidden_states=decoder_outputs[2],
decoder_attentions=decoder_outputs[3],
encoder_last_hidden_state=encoder_outputs[0],
encoder_hidden_states=encoder_outputs[1],
encoder_attentions=encoder_outputs[2],
)
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING)
@@ -1115,6 +1137,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
return self.decoder return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
@@ -1129,6 +1152,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1138,24 +1162,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -1186,8 +1192,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
use_cache = inputs[9] if len(inputs) > 9 else use_cache use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
labels = inputs[12] if len(inputs) > 12 else labels return_dict = inputs[12] if len(inputs) > 12 else return_dict
assert len(inputs) <= 13, "Too many inputs." labels = inputs[13] if len(inputs) > 13 else labels
assert len(inputs) <= 14, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
if "inputs" in inputs: if "inputs" in inputs:
warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.")
@@ -1204,12 +1211,14 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 13, "Too many inputs." assert len(inputs) <= 14, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
@@ -1261,22 +1270,48 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
training=training, training=training,
) )
# insert decoder past at right place
# to speed up decoding
if cast_bool_to_primitive(use_cache, self.config.use_cache) is True:
past = ((encoder_outputs, decoder_outputs[1]),)
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]
sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
embed_tokens = self.get_output_embeddings() embed_tokens = self.get_output_embeddings()
logits = embed_tokens(sequence_output, mode="linear") logits = embed_tokens(sequence_output, mode="linear")
decoder_outputs = (logits,) + decoder_outputs[1:]
if labels is not None: loss = None if labels is None else self.compute_loss(labels, logits)
loss = self.compute_loss(labels, logits)
decoder_outputs = (loss,) + decoder_outputs
return decoder_outputs + encoder_outputs past = (
(encoder_outputs, decoder_outputs[1]) if cast_bool_to_primitive(use_cache, self.config.use_cache) else None
)
if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
output = (logits,) + decoder_outputs[1:] + encoder_outputs
return ((loss,) + output) if loss is not None else output
# Putting this before breaks tf compilation.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# This is long and annoying but if we introduce return_dict at the TFT5MainLayer level (like in PyTorch)
# TF refuses to compile anymore.
if not cast_bool_to_primitive(use_cache, self.config.use_cache):
decoder_outputs = decoder_outputs[:1] + (None,) + decoder_outputs[1:]
if not cast_bool_to_primitive(output_hidden_states, self.config.output_hidden_states):
encoder_outputs = encoder_outputs[:1] + (None,) + encoder_outputs[1:]
decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:]
if not cast_bool_to_primitive(output_attentions, self.config.output_attentions):
encoder_outputs = encoder_outputs + (None,)
decoder_outputs = decoder_outputs + (None,)
return TFSeq2SeqLMOutput(
loss=loss,
logits=logits,
decoder_past_key_values=past,
decoder_hidden_states=decoder_outputs[2],
decoder_attentions=decoder_outputs[3],
encoder_last_hidden_state=encoder_outputs[0],
encoder_hidden_states=encoder_outputs[1],
encoder_attentions=encoder_outputs[2],
)
def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs): def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs):
assert past is not None, "past has to be defined for encoder_outputs" assert past is not None, "past has to be defined for encoder_outputs"

View File

@@ -18,11 +18,13 @@
import logging import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import tensorflow as tf import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
@@ -30,6 +32,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "TransfoXLConfig"
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer" _TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -388,6 +391,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.n_token = config.vocab_size self.n_token = config.vocab_size
@@ -525,6 +529,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -533,8 +538,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
output_hidden_states = inputs[5] if len(inputs) > 4 else output_hidden_states output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs[6] if len(inputs) > 6 else return_dict
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems) mems = inputs.get("mems", mems)
@@ -542,12 +548,14 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 7, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
@@ -606,7 +614,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
# word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
hids = [] hids = []
attentions = [] attentions = [] if output_attentions else None
if self.attn_type == 0: # default if self.attn_type == 0: # default
pos_seq = tf.range(klen - 1, -1, -1.0) pos_seq = tf.range(klen - 1, -1, -1.0)
if self.clamp_len > 0: if self.clamp_len > 0:
@@ -633,17 +641,24 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
new_mems = self._update_mems(hids, mems, mlen, qlen) new_mems = self._update_mems(hids, mems, mlen, qlen)
# We transpose back here to shape [bsz, len, hidden_dim] # We transpose back here to shape [bsz, len, hidden_dim]
outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems] core_out = tf.transpose(core_out, perm=(1, 0, 2))
if output_hidden_states: if output_hidden_states:
# Add last layer and transpose to library standard shape [bsz, len, hidden_dim] # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
hids.append(core_out) hids.append(core_out)
hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids) hids = tuple(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
outputs.append(hids) else:
hids = None
if output_attentions: if output_attentions:
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs.append(attentions)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions) if not return_dict:
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TFTransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
)
class TFTransfoXLPreTrainedModel(TFPreTrainedModel): class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
@@ -655,6 +670,70 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer" base_model_prefix = "transformer"
@dataclass
class TFTransfoXLModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFTransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
losses (:obj:`tf.Tensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided)
Language modeling losses (not reduced).
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
prediction_scores: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
TRANSFO_XL_START_DOCSTRING = r""" TRANSFO_XL_START_DOCSTRING = r"""
.. note:: .. note::
@@ -706,6 +785,11 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -719,29 +803,13 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="transfo-xl-wt103",
output_type=TFTransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -797,57 +865,47 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz) return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="transfo-xl-wt103",
output_type=TFTransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
mems=None, mems=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None,
training=False, training=False,
): ):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
mems = inputs[1] if len(inputs) > 1 else mems mems = inputs[1] if len(inputs) > 1 else mems
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
labels = inputs[4] if len(inputs) > 4 else labels output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
output_attentions = inputs[5] if len(inputs) > 5 else output_attentions output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs[6] if len(inputs) > 6 else return_dict
labels = inputs[7] if len(inputs) > 7 else labels
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, (BatchEncoding, dict)): elif isinstance(inputs, (BatchEncoding, dict)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems) mems = inputs.get("mems", mems)
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
labels = inputs.get("labels", labels)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 6, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels)
assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
bsz, tgt_len = shape_list(input_ids)[:2] bsz, tgt_len = shape_list(input_ids)[:2]
@@ -855,17 +913,30 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
bsz, tgt_len = shape_list(inputs_embeds)[:2] bsz, tgt_len = shape_list(inputs_embeds)[:2]
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, mems, head_mask, inputs_embeds, output_attentions, output_hidden_states, training=training input_ids,
mems,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
training=training,
) )
last_hidden = transformer_outputs[0] last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:] pred_hid = last_hidden[:, -tgt_len:]
outputs = transformer_outputs[1:]
softmax_output = self.crit(pred_hid, labels, training=training) softmax_output = self.crit(pred_hid, labels, training=training)
outputs = [softmax_output] + outputs
return outputs # logits, new_mems, (all hidden states), (all attentions) if not return_dict:
return (softmax_output,) + transformer_outputs[1:]
return TFTransfoXLLMHeadModelOutput(
prediction_scores=softmax_output,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def prepare_inputs_for_generation(self, inputs, past, **model_kwargs): def prepare_inputs_for_generation(self, inputs, past, **model_kwargs):
inputs = {"inputs": inputs} inputs = {"inputs": inputs}

View File

@@ -20,6 +20,8 @@ import itertools
import logging import logging
import math import math
import warnings import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@@ -27,10 +29,18 @@ import tensorflow as tf
from .configuration_xlm import XLMConfig from .configuration_xlm import XLMConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
@@ -48,6 +58,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XLMConfig"
_TOKENIZER_FOR_DOC = "XLMTokenizer" _TOKENIZER_FOR_DOC = "XLMTokenizer"
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -224,6 +235,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
# encoder / decoder, output layer # encoder / decoder, output layer
self.is_encoder = config.is_encoder self.is_encoder = config.is_encoder
@@ -340,6 +352,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): # removed: src_enc=None, src_len=None ): # removed: src_enc=None, src_len=None
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -354,7 +367,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
assert len(inputs) <= 11, "Too many inputs." return_dict = inputs[11] if len(inputs) > 11 else return_dict
assert len(inputs) <= 12, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -367,12 +381,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 11, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 12, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -454,8 +470,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
tensor = tensor * mask[..., tf.newaxis] tensor = tensor * mask[..., tf.newaxis]
# transformer layers # transformer layers
hidden_states = () hidden_states = () if output_hidden_states else None
attentions = () attentions = () if output_attentions else None
for i in range(self.n_layers): for i in range(self.n_layers):
if output_hidden_states: if output_hidden_states:
hidden_states = hidden_states + (tensor,) hidden_states = hidden_states + (tensor,)
@@ -494,12 +510,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
# move back sequence length to dimension 0 # move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1) # tensor = tensor.transpose(0, 1)
outputs = (tensor,) if not return_dict:
if output_hidden_states: return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
outputs = outputs + (hidden_states,) return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
if output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
class TFXLMPreTrainedModel(TFPreTrainedModel): class TFXLMPreTrainedModel(TFPreTrainedModel):
@@ -522,6 +535,33 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
# Remove when XLMWithLMHead computes loss like other LM models
@dataclass
class TFXLMWithLMHeadModelOutput(ModelOutput):
"""
Base class for :class:`~transformers.TFXLMWithLMHeadModel` outputs.
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
XLM_START_DOCSTRING = r""" XLM_START_DOCSTRING = r"""
.. note:: .. note::
@@ -603,6 +643,11 @@ XLM_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -616,25 +661,13 @@ class TFXLMModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -701,32 +734,26 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
return {"inputs": inputs, "langs": langs} return {"inputs": inputs, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFXLMWithLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" return_dict = kwargs.get("return_dict")
Return: return_dict = return_dict if return_dict is not None else self.transformer.return_dict
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output) outputs = self.pred_layer(output)
outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs if not return_dict:
return (outputs,) + transformer_outputs[1:]
return TFXLMWithLMHeadModelOutput(
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
)
@add_start_docstrings( @add_start_docstrings(
@@ -743,7 +770,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -757,6 +789,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -766,27 +799,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels labels = inputs[12] if len(inputs) > 12 else labels
if len(inputs) > 11: if len(inputs) > 12:
inputs = inputs[:11] inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -802,19 +820,25 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + transformer_outputs[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -845,7 +869,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
} }
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -859,6 +888,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -867,24 +897,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -898,8 +910,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels return_dict = inputs[11] if len(inputs) > 11 else return_dict
assert len(inputs) <= 12, "Too many inputs." labels = inputs[12] if len(inputs) > 12 else labels
assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -912,10 +925,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 12, "Too many inputs." assert len(inputs) <= 13, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -955,19 +970,26 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
logits = self.logits_proj(logits) logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -987,7 +1009,12 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
) )
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1001,6 +1028,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1008,27 +1036,12 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels labels = inputs[12] if len(inputs) > 12 else labels
if len(inputs) > 11: if len(inputs) > 12:
inputs = inputs[:11] inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1044,6 +1057,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1052,13 +1066,18 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + transformer_outputs[1:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + transformer_outputs[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1075,7 +1094,12 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
) )
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1089,6 +1113,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -1102,30 +1127,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[11] if len(inputs) > 11 else start_positions start_positions = inputs[12] if len(inputs) > 12 else start_positions
end_positions = inputs[12] if len(inputs) > 12 else end_positions end_positions = inputs[13] if len(inputs) > 13 else end_positions
if len(inputs) > 11: if len(inputs) > 12:
inputs = inputs[:11] inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1142,6 +1150,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1152,14 +1161,20 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + transformer_outputs[ loss = None
1:
] # Keep mems, hidden states, attentions if there are in it
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

View File

@@ -62,8 +62,6 @@ XLM_ROBERTA_START_DOCSTRING = r"""
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration. model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
""" """

View File

@@ -18,6 +18,8 @@
import logging import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@@ -25,9 +27,11 @@ import tensorflow as tf
from .configuration_xlnet import XLNetConfig from .configuration_xlnet import XLNetConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
replace_return_docstrings,
) )
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
@@ -47,6 +51,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XLNetConfig"
_TOKENIZER_FOR_DOC = "XLNetTokenizer" _TOKENIZER_FOR_DOC = "XLNetTokenizer"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -436,6 +441,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.return_dict
self.mem_len = config.mem_len self.mem_len = config.mem_len
self.reuse_len = config.reuse_len self.reuse_len = config.reuse_len
@@ -586,6 +592,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -601,7 +608,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
use_cache = inputs[9] if len(inputs) > 9 else use_cache use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
assert len(inputs) <= 12, "Too many inputs." return_dict = inputs[12] if len(inputs) > 12 else return_dict
assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -615,12 +623,14 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 12, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 13, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension # but we want a unified interface in the library with the batch size on the first dimension
@@ -743,8 +753,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
if mems is None: if mems is None:
mems = [None] * len(self.layer) mems = [None] * len(self.layer)
attentions = [] attentions = [] if output_attentions else None
hidden_states = [] hidden_states = [] if output_hidden_states else None
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
# cache new mems # cache new mems
if self.mem_len is not None and self.mem_len > 0 and use_cache: if self.mem_len is not None and self.mem_len > 0 and use_cache:
@@ -776,22 +786,24 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
output = self.dropout(output_g if output_g is not None else output_h, training=training) output = self.dropout(output_g if output_g is not None else output_h, training=training)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs = (tf.transpose(output, perm=(1, 0, 2)),) output = tf.transpose(output, perm=(1, 0, 2))
if self.mem_len is not None and self.mem_len > 0 and use_cache:
outputs = outputs + (new_mems,)
if not (self.mem_len is not None and self.mem_len > 0 and use_cache):
new_mems = None
if output_hidden_states: if output_hidden_states:
if output_g is not None: if output_g is not None:
hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs) hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
else: else:
hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states) hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states)
outputs = outputs + (hidden_states,)
if output_attentions: if output_attentions:
attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs = outputs + (attentions,)
return outputs # outputs, (new_mems), (hidden_states), (attentions) if not return_dict:
return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
return TFXLNetModelOutput(
last_hidden_state=output, mems=new_mems, hidden_states=hidden_states, attentions=attentions
)
class TFXLNetPreTrainedModel(TFPreTrainedModel): class TFXLNetPreTrainedModel(TFPreTrainedModel):
@@ -803,6 +815,218 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer" base_model_prefix = "transformer"
@dataclass
class TFXLNetModelOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetModel`.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetLMHeadModelOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetLMHeadModel`.
Args:
loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
Language modeling loss (for next-token prediction).
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForSequenceClassificationOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForSequenceClassification`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForTokenClassificationOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForTokenClassificationOutput`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForMultipleChoiceOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForMultipleChoice`.
Args:
loss (:obj:`tf.Tensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForQuestionAnsweringSimple`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
XLNET_START_DOCSTRING = r""" XLNET_START_DOCSTRING = r"""
.. note:: .. note::
@@ -885,6 +1109,11 @@ XLNET_INPUTS_DOCSTRING = r"""
If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -898,29 +1127,13 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name="transformer") self.transformer = TFXLNetMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=TFXLNetModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -980,6 +1193,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
return inputs return inputs
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFXLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
@@ -994,6 +1208,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1003,24 +1218,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
@@ -1045,10 +1242,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[13] if len(inputs) > 13 else labels
if len(inputs) > 12: if len(inputs) > 13:
inputs = inputs[:12] inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1065,21 +1263,30 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=return_dict,
training=training, training=training,
) )
hidden_state = transformer_outputs[0] hidden_state = transformer_outputs[0]
logits = self.lm_loss(hidden_state, training=training) logits = self.lm_loss(hidden_state, training=training)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it loss = None
if labels is not None: if labels is not None:
# shift labels to the left and cut last logit token # shift labels to the left and cut last logit token
logits = logits[:, :-1] logits = logits[:, :-1]
labels = labels[:, 1:] labels = labels[:, 1:]
loss = self.compute_loss(labels, logits) loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # return logits, (mems), (hidden states), (attentions) if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFXLNetLMHeadModelOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1101,7 +1308,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
) )
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=TFXLNetForSequenceClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1116,6 +1328,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1125,31 +1338,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[13] if len(inputs) > 13 else labels
if len(inputs) > 12: if len(inputs) > 13:
inputs = inputs[:12] inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1166,19 +1360,26 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
output = self.sequence_summary(output) output = self.sequence_summary(output)
logits = self.logits_proj(output) logits = self.logits_proj(output)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + transformer_outputs[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFXLNetForSequenceClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1208,7 +1409,12 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=TFXLNetForMultipleChoiceOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1223,6 +1429,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1231,24 +1438,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
@@ -1263,8 +1452,9 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache = inputs[9] if len(inputs) > 9 else use_cache use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
labels = inputs[12] if len(inputs) > 12 else labels return_dict = inputs[12] if len(inputs) > 12 else return_dict
assert len(inputs) <= 13, "Too many inputs." labels = inputs[13] if len(inputs) > 13 else labels
assert len(inputs) <= 14, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1278,10 +1468,12 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 13, "Too many inputs." assert len(inputs) <= 14, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -1312,19 +1504,26 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache, use_cache,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
logits = self.logits_proj(logits) logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, reshaped_logits) output = (reshaped_logits,) + transformer_outputs[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (mems), (hidden states), (attentions) return TFXLNetForMultipleChoiceOutput(
loss=loss,
logits=reshaped_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1343,7 +1542,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
) )
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=TFXLNetForTokenClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1358,6 +1562,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -1365,31 +1570,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[13] if len(inputs) > 13 else labels
if len(inputs) > 12: if len(inputs) > 13:
inputs = inputs[:12] inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1406,19 +1592,25 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.classifier(output) logits = self.classifier(output)
loss = None if labels is None else self.compute_loss(labels, logits)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
if labels is not None: return TFXLNetForTokenClassificationOutput(
loss = self.compute_loss(labels, logits) loss=loss,
outputs = (loss,) + outputs logits=logits,
mems=transformer_outputs.mems,
return outputs # (loss), logits, (hidden_states), (attentions) hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -1435,7 +1627,12 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
) )
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=TFXLNetForQuestionAnsweringSimpleOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -1450,6 +1647,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -1463,36 +1661,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[12] if len(inputs) > 12 else start_positions start_positions = inputs[13] if len(inputs) > 13 else start_positions
end_positions = inputs[13] if len(inputs) > 13 else end_positions end_positions = inputs[14] if len(inputs) > 14 else end_positions
if len(inputs) > 12: if len(inputs) > 13:
inputs = inputs[:12] inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1510,6 +1685,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -1520,17 +1696,24 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + transformer_outputs[ loss = None
1:
] # Keep mems, hidden states, attentions if there are in it
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFXLNetForQuestionAnsweringSimpleOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of

View File

@@ -711,7 +711,7 @@ class XLNetForTokenClassificationOutput(ModelOutput):
@dataclass @dataclass
class XLNetForMultipleChoiceOutput(ModelOutput): class XLNetForMultipleChoiceOutput(ModelOutput):
""" """
Base class for outputs of multiple choice models. Output type of :class:`~transformers.XLNetForMultipleChoice`.
Args: Args:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -747,7 +747,7 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
@dataclass @dataclass
class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
""" """
Base class for outputs of question answering models. Output type of :class:`~transformers.XLNetForQuestionAnsweringSimple`.
Args: Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -784,7 +784,7 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
@dataclass @dataclass
class XLNetForQuestionAnsweringOutput(ModelOutput): class XLNetForQuestionAnsweringOutput(ModelOutput):
""" """
Base class for outputs of question answering models using a :obj:`SquadHead`. Output type of :class:`~transformers.XLNetForQuestionAnswering`.
Args: Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
@@ -1227,7 +1227,6 @@ class XLNetModel(XLNetPreTrainedModel):
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
output = output.permute(1, 0, 2).contiguous() output = output.permute(1, 0, 2).contiguous()
# TODO Teven: fix this test to only use use_cache.
if not use_cache: if not use_cache:
new_mems = None new_mems = None

View File

@@ -31,6 +31,14 @@ from .file_utils import (
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_outputs import (
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
@@ -46,6 +54,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XXXConfig"
_TOKENIZER_FOR_DOC = "XxxTokenizer" _TOKENIZER_FOR_DOC = "XxxTokenizer"
#################################################### ####################################################
@@ -117,35 +126,60 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models
def call( def call(
self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
): ):
# We allow three types of multi-inputs:
# - traditional keyword arguments in the call method
# - all the arguments provided as a dict in the first positional argument of call
# - all the arguments provided as a list/tuple (ordered) in the first positional argument of call
# The last two options are useful to use the tf.keras fit() method.
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
assert len(inputs) <= 5, "Too many inputs." inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
elif isinstance(inputs, dict): output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids) token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids) position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
assert len(inputs) <= 5, "Too many inputs." inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None: if attention_mask is None:
attention_mask = tf.fill(shape_list(input_ids), 1) attention_mask = tf.fill(input_shape, 1)
if token_type_ids is None: if token_type_ids is None:
token_type_ids = tf.fill(shape_list(input_ids), 0) token_type_ids = tf.fill(input_shape, 0)
# We create a 3D attention mask from a 2D tensor mask. # We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length] # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -174,14 +208,29 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
head_mask = [None] * self.num_hidden_layers head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers) # head_mask = tf.constant([0] * self.num_hidden_layers)
################################## embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
# Replace this with your model code encoder_outputs = self.encoder(
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) embedding_output,
encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) extended_attention_mask,
sequence_output = encoder_outputs[0] head_mask,
outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here output_attentions,
output_hidden_states,
return_dict,
training=training,
)
return outputs # sequence_output, (hidden_states), (attentions) sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
#################################################### ####################################################
@@ -274,6 +323,11 @@ XXX_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -287,32 +341,13 @@ class TFXxxModel(TFXxxPreTrainedModel):
self.transformer = TFXxxMainLayer(config, name="transformer") self.transformer = TFXxxMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-cased",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during XXX pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -329,7 +364,12 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
@add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-cased",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -340,6 +380,7 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -349,27 +390,12 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -382,19 +408,22 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=training) prediction_scores = self.mlm(sequence_output, training=training)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, prediction_scores)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, prediction_scores) output = (prediction_scores,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), prediction_scores, (hidden_states), (attentions) return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -414,7 +443,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
) )
@add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-cased",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -425,6 +459,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -434,27 +469,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -467,6 +487,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -475,13 +496,15 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -509,7 +532,12 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-cased",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
@@ -520,6 +548,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -527,24 +556,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)s after the attention softmax, used to compute the weighted average in the self-attention
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
@@ -556,8 +568,9 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
@@ -567,10 +580,12 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
@@ -598,6 +613,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict,
] ]
outputs = self.transformer(flat_inputs, training=training) outputs = self.transformer(flat_inputs, training=training)
@@ -608,13 +624,15 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, reshaped_logits) output = (reshaped_logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -634,7 +652,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
) )
@add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-cased",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -645,6 +668,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
@@ -652,27 +676,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -685,6 +694,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -693,13 +703,15 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
@@ -718,7 +730,12 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
) )
@add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-cased",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
@@ -729,6 +746,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
@@ -742,30 +760,13 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -779,6 +780,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
@@ -789,12 +791,20 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + outputs[2:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

View File

@@ -24,9 +24,11 @@ from .utils import CACHE_DIR, require_tf, slow
if is_tf_available(): if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_xxx import ( from transformers.modeling_tf_xxx import (
TFXxxModel, TFXxxModel,
TFXxxForMaskedLM, TFXxxForMaskedLM,
TFXxxForMultipleChoice,
TFXxxForSequenceClassification, TFXxxForSequenceClassification,
TFXxxForTokenClassification, TFXxxForTokenClassification,
TFXxxForQuestionAnswering, TFXxxForQuestionAnswering,
@@ -40,6 +42,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
( (
TFXxxModel, TFXxxModel,
TFXxxForMaskedLM, TFXxxForMaskedLM,
TFXxxForMultipleChoice,
TFXxxForQuestionAnswering, TFXxxForQuestionAnswering,
TFXxxForSequenceClassification, TFXxxForSequenceClassification,
TFXxxForTokenClassification, TFXxxForTokenClassification,
@@ -128,6 +131,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -137,33 +141,26 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
): ):
model = TFXxxModel(config=config) model = TFXxxModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output, pooled_output = model(inputs) result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs) result = model(inputs)
sequence_output, pooled_output = model(input_ids) result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_xxx_for_masked_lm( def create_and_check_xxx_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFXxxForMaskedLM(config=config) model = TFXxxForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(prediction_scores,) = model(inputs) result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
def create_and_check_xxx_for_sequence_classification( def create_and_check_xxx_for_sequence_classification(
@@ -172,22 +169,32 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFXxxForSequenceClassification(config=config) model = TFXxxForSequenceClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_bert_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = TFXxxForMultipleChoice(config=config)
multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
inputs = {
"input_ids": multiple_choice_inputs_ids,
"attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids,
}
result = model(inputs)
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_xxx_for_token_classification( def create_and_check_xxx_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFXxxForTokenClassification(config=config) model = TFXxxForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
) )
@@ -197,11 +204,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
): ):
model = TFXxxForQuestionAnswering(config=config) model = TFXxxForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

View File

@@ -116,6 +116,7 @@ class TFAlbertModelTester:
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -129,21 +130,17 @@ class TFAlbertModelTester:
# 'token_type_ids': token_type_ids} # 'token_type_ids': token_type_ids}
# sequence_output, pooled_output = model(**inputs) # sequence_output, pooled_output = model(**inputs)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output, pooled_output = model(inputs) result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs) result = model(inputs)
sequence_output, pooled_output = model(input_ids) result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_albert_for_pretraining( def create_and_check_albert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -151,28 +148,19 @@ class TFAlbertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFAlbertForPreTraining(config=config) model = TFAlbertForPreTraining(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores, sop_scores = model(inputs) result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
"sop_scores": sop_scores.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["sop_logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_albert_for_masked_lm( def create_and_check_albert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFAlbertForMaskedLM(config=config) model = TFAlbertForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(prediction_scores,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_albert_for_sequence_classification( def create_and_check_albert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -180,10 +168,7 @@ class TFAlbertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFAlbertForSequenceClassification(config=config) model = TFAlbertForSequenceClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_albert_for_question_answering( def create_and_check_albert_for_question_answering(
@@ -191,11 +176,7 @@ class TFAlbertModelTester:
): ):
model = TFAlbertForQuestionAnswering(config=config) model = TFAlbertForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

View File

@@ -118,6 +118,7 @@ class TFBertModelTester:
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -130,18 +131,14 @@ class TFBertModelTester:
sequence_output, pooled_output = model(inputs) sequence_output, pooled_output = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs) result = model(inputs)
sequence_output, pooled_output = model(input_ids) result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_bert_lm_head( def create_and_check_bert_lm_head(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -153,7 +150,7 @@ class TFBertModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(prediction_scores,) = model(inputs) prediction_scores = model(inputs)["logits"]
self.parent.assertListEqual( self.parent.assertListEqual(
list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
@@ -167,39 +164,27 @@ class TFBertModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(prediction_scores,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_bert_for_next_sequence_prediction( def create_and_check_bert_for_next_sequence_prediction(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFBertForNextSentencePrediction(config=config) model = TFBertForNextSentencePrediction(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(seq_relationship_score,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, 2])
"seq_relationship_score": seq_relationship_score.numpy(),
}
self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
def create_and_check_bert_for_pretraining( def create_and_check_bert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFBertForPreTraining(config=config) model = TFBertForPreTraining(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores, seq_relationship_score = model(inputs) result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
"seq_relationship_score": seq_relationship_score.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) self.parent.assertListEqual(list(result["seq_relationship_logits"].shape), [self.batch_size, 2])
def create_and_check_bert_for_sequence_classification( def create_and_check_bert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -212,8 +197,7 @@ class TFBertModelTester:
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_bert_for_multiple_choice( def create_and_check_bert_for_multiple_choice(
@@ -229,8 +213,7 @@ class TFBertModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_bert_for_token_classification( def create_and_check_bert_for_token_classification(
@@ -243,10 +226,7 @@ class TFBertModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def create_and_check_bert_for_question_answering( def create_and_check_bert_for_question_answering(
@@ -259,8 +239,7 @@ class TFBertModelTester:
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
start_logits, end_logits = model(inputs) result = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

View File

@@ -35,7 +35,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
[[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32, [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32,
) # J'aime le camembert !" ) # J'aime le camembert !"
output = model(input_ids)[0] output = model(input_ids)["last_hidden_state"]
expected_shape = tf.TensorShape((1, 10, 768)) expected_shape = tf.TensorShape((1, 10, 768))
self.assertEqual(output.shape, expected_shape) self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice. # compare the actual values for a slice.

View File

@@ -146,7 +146,8 @@ class TFModelTesterMixin:
tf.saved_model.save(model, tmpdirname) tf.saved_model.save(model, tmpdirname)
model = tf.keras.models.load_model(tmpdirname) model = tf.keras.models.load_model(tmpdirname)
outputs = model(inputs_dict) outputs = model(inputs_dict)
hidden_states = [t.numpy() for t in outputs[-1]] output = outputs[list(outputs.keys())[-1]] if isinstance(outputs, dict) else outputs[-1]
hidden_states = [t.numpy() for t in output]
self.assertEqual(len(outputs), num_out) self.assertEqual(len(outputs), num_out)
self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
self.assertListEqual( self.assertListEqual(
@@ -177,7 +178,8 @@ class TFModelTesterMixin:
tf.saved_model.save(model, tmpdirname) tf.saved_model.save(model, tmpdirname)
model = tf.keras.models.load_model(tmpdirname) model = tf.keras.models.load_model(tmpdirname)
outputs = model(inputs_dict) outputs = model(inputs_dict)
attentions = [t.numpy() for t in outputs[-1]] output = outputs[list(outputs.keys())[-1]] if isinstance(outputs, dict) else outputs[-1]
attentions = [t.numpy() for t in output]
self.assertEqual(len(outputs), num_out) self.assertEqual(len(outputs), num_out)
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual( self.assertListEqual(
@@ -238,6 +240,8 @@ class TFModelTesterMixin:
# Make sure we don't have nans # Make sure we don't have nans
if isinstance(after_outputs, tf.Tensor): if isinstance(after_outputs, tf.Tensor):
out_1 = after_outputs.numpy() out_1 = after_outputs.numpy()
elif isinstance(after_outputs, dict):
out_1 = after_outputs[list(after_outputs.keys())[0]]
else: else:
out_1 = after_outputs[0].numpy() out_1 = after_outputs[0].numpy()
out_2 = outputs[0].numpy() out_2 = outputs[0].numpy()

View File

@@ -89,9 +89,10 @@ class TFCTRLModelTester(object):
# hidden_dropout_prob=self.hidden_dropout_prob, # hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings, n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings n_ctx=self.max_position_embeddings,
# type_vocab_size=self.type_vocab_size, # type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range # initializer_range=self.initializer_range,
return_dict=True,
) )
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -111,30 +112,22 @@ class TFCTRLModelTester(object):
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFCTRLModel(config=config) model = TFCTRLModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output = model(inputs)[0] result = model(inputs)
inputs = [input_ids, None, input_mask] # None is the input for 'past' inputs = [input_ids, None, input_mask] # None is the input for 'past'
sequence_output = model(inputs)[0] result = model(inputs)
sequence_output = model(input_ids)[0] result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFCTRLLMHeadModel(config=config) model = TFCTRLLMHeadModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores = model(inputs)[0] result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()

View File

@@ -89,6 +89,7 @@ class TFDistilBertModelTester:
attention_dropout=self.attention_probs_dropout_prob, attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -99,18 +100,14 @@ class TFDistilBertModelTester:
model = TFDistilBertModel(config=config) model = TFDistilBertModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
outputs = model(inputs) result = model(inputs)
sequence_output = outputs[0]
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
(sequence_output,) = model(inputs) result = model(inputs)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_distilbert_for_masked_lm( def create_and_check_distilbert_for_masked_lm(
@@ -118,11 +115,8 @@ class TFDistilBertModelTester:
): ):
model = TFDistilBertForMaskedLM(config=config) model = TFDistilBertForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(prediction_scores,) = model(inputs) result = model(inputs)
result = {"prediction_scores": prediction_scores.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_distilbert_for_question_answering( def create_and_check_distilbert_for_question_answering(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -132,8 +126,7 @@ class TFDistilBertModelTester:
"input_ids": input_ids, "input_ids": input_ids,
"attention_mask": input_mask, "attention_mask": input_mask,
} }
start_logits, end_logits = model(inputs) result = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
@@ -143,8 +136,7 @@ class TFDistilBertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFDistilBertForSequenceClassification(config) model = TFDistilBertForSequenceClassification(config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_distilbert_for_multiple_choice( def create_and_check_distilbert_for_multiple_choice(
@@ -158,8 +150,7 @@ class TFDistilBertModelTester:
"input_ids": multiple_choice_inputs_ids, "input_ids": multiple_choice_inputs_ids,
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_distilbert_for_token_classification( def create_and_check_distilbert_for_token_classification(
@@ -168,10 +159,7 @@ class TFDistilBertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFDistilBertForTokenClassification(config) model = TFDistilBertForTokenClassification(config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):

View File

@@ -95,6 +95,7 @@ class TFElectraModelTester:
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -104,18 +105,15 @@ class TFElectraModelTester:
): ):
model = TFElectraModel(config=config) model = TFElectraModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(sequence_output,) = model(inputs) result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
(sequence_output,) = model(inputs) result = model(inputs)
(sequence_output,) = model(input_ids) result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_electra_for_masked_lm( def create_and_check_electra_for_masked_lm(
@@ -123,24 +121,16 @@ class TFElectraModelTester:
): ):
model = TFElectraForMaskedLM(config=config) model = TFElectraForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(prediction_scores,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_electra_for_pretraining( def create_and_check_electra_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFElectraForPreTraining(config=config) model = TFElectraForPreTraining(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(prediction_scores,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(list(result["prediction_scores"].shape), [self.batch_size, self.seq_length])
def create_and_check_electra_for_sequence_classification( def create_and_check_electra_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -148,10 +138,7 @@ class TFElectraModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFElectraForSequenceClassification(config=config) model = TFElectraForSequenceClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_electra_for_multiple_choice( def create_and_check_electra_for_multiple_choice(
@@ -167,8 +154,7 @@ class TFElectraModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_electra_for_question_answering( def create_and_check_electra_for_question_answering(
@@ -176,11 +162,7 @@ class TFElectraModelTester:
): ):
model = TFElectraForQuestionAnswering(config=config) model = TFElectraForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
@@ -190,10 +172,7 @@ class TFElectraModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFElectraForTokenClassification(config=config) model = TFElectraForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):

View File

@@ -113,6 +113,7 @@ class TFFlaubertModelTester:
summary_type=self.summary_type, summary_type=self.summary_type,
use_proj=self.use_proj, use_proj=self.use_proj,
bos_token_id=self.bos_token_id, bos_token_id=self.bos_token_id,
return_dict=True,
) )
return ( return (
@@ -141,16 +142,12 @@ class TFFlaubertModelTester:
): ):
model = TFFlaubertModel(config=config) model = TFFlaubertModel(config=config)
inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
outputs = model(inputs) result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
outputs = model(inputs) result = model(inputs)
sequence_output = outputs[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_flaubert_lm_head( def create_and_check_flaubert_lm_head(
@@ -168,13 +165,7 @@ class TFFlaubertModelTester:
model = TFFlaubertWithLMHeadModel(config) model = TFFlaubertWithLMHeadModel(config)
inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
outputs = model(inputs) result = model(inputs)
logits = outputs[0]
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
@@ -194,12 +185,7 @@ class TFFlaubertModelTester:
inputs = {"input_ids": input_ids, "lengths": input_lengths} inputs = {"input_ids": input_ids, "lengths": input_lengths}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
@@ -220,11 +206,7 @@ class TFFlaubertModelTester:
inputs = {"input_ids": input_ids, "lengths": input_lengths} inputs = {"input_ids": input_ids, "lengths": input_lengths}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
@@ -243,10 +225,7 @@ class TFFlaubertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFFlaubertForTokenClassification(config=config) model = TFFlaubertForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def create_and_check_flaubert_for_multiple_choice( def create_and_check_flaubert_for_multiple_choice(
@@ -271,8 +250,7 @@ class TFFlaubertModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):

View File

@@ -102,6 +102,7 @@ class TFGPT2ModelTester:
# initializer_range=self.initializer_range # initializer_range=self.initializer_range
bos_token_id=self.bos_token_id, bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id, eos_token_id=self.eos_token_id,
return_dict=True,
) )
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -125,18 +126,15 @@ class TFGPT2ModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
sequence_output = model(inputs)[0] result = model(inputs)
inputs = [input_ids, None, input_mask] # None is the input for 'past' inputs = [input_ids, None, input_mask] # None is the input for 'past'
sequence_output = model(inputs)[0] result = model(inputs)
sequence_output = model(input_ids)[0] result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size], list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size],
) )
def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -150,7 +148,7 @@ class TFGPT2ModelTester:
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
output, past = outputs output, past = outputs.to_tuple()
# create hypothetical next token and extent to next_input_ids # create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -160,8 +158,8 @@ class TFGPT2ModelTester:
next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1) next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids) output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past) output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
# select random slice # select random slice
random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
@@ -183,7 +181,7 @@ class TFGPT2ModelTester:
attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1) attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
# first forward pass # first forward pass
output, past = model(input_ids, attention_mask=attn_mask) output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
# create hypothetical next token and extent to next_input_ids # create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -202,8 +200,8 @@ class TFGPT2ModelTester:
attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1) attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)
# get two different outputs # get two different outputs
output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask) output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask) output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)["last_hidden_state"]
# select random slice # select random slice
random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
@@ -220,12 +218,9 @@ class TFGPT2ModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
prediction_scores = model(inputs)[0] result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size], list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size],
) )
def create_and_check_gpt2_double_head( def create_and_check_gpt2_double_head(
@@ -243,8 +238,7 @@ class TFGPT2ModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
lm_logits, mc_logits = model(inputs)[:2] result = model(inputs)
result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size], list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
) )

View File

@@ -138,6 +138,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
embedding_size=self.embedding_size, embedding_size=self.embedding_size,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -147,33 +148,26 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
): ):
model = TFMobileBertModel(config=config) model = TFMobileBertModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output, pooled_output = model(inputs) result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs) result = model(inputs)
sequence_output, pooled_output = model(input_ids) result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_mobilebert_for_masked_lm( def create_and_check_mobilebert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFMobileBertForMaskedLM(config=config) model = TFMobileBertForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(prediction_scores,) = model(inputs) result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
def create_and_check_mobilebert_for_next_sequence_prediction( def create_and_check_mobilebert_for_next_sequence_prediction(
@@ -181,26 +175,19 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
): ):
model = TFMobileBertForNextSentencePrediction(config=config) model = TFMobileBertForNextSentencePrediction(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(seq_relationship_score,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, 2])
"seq_relationship_score": seq_relationship_score.numpy(),
}
self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
def create_and_check_mobilebert_for_pretraining( def create_and_check_mobilebert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFMobileBertForPreTraining(config=config) model = TFMobileBertForPreTraining(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores, seq_relationship_score = model(inputs) result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
"seq_relationship_score": seq_relationship_score.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) self.parent.assertListEqual(list(result["seq_relationship_logits"].shape), [self.batch_size, 2])
def create_and_check_mobilebert_for_sequence_classification( def create_and_check_mobilebert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -208,10 +195,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFMobileBertForSequenceClassification(config=config) model = TFMobileBertForSequenceClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_mobilebert_for_multiple_choice( def create_and_check_mobilebert_for_multiple_choice(
@@ -227,10 +211,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_mobilebert_for_token_classification( def create_and_check_mobilebert_for_token_classification(
@@ -239,10 +220,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFMobileBertForTokenClassification(config=config) model = TFMobileBertForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
) )
@@ -252,11 +230,7 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
): ):
model = TFMobileBertForQuestionAnswering(config=config) model = TFMobileBertForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

View File

@@ -94,9 +94,10 @@ class TFOpenAIGPTModelTester:
# hidden_dropout_prob=self.hidden_dropout_prob, # hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings, n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings n_ctx=self.max_position_embeddings,
# type_vocab_size=self.type_vocab_size, # type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range # initializer_range=self.initializer_range,
return_dict=True,
) )
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -116,30 +117,22 @@ class TFOpenAIGPTModelTester:
def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFOpenAIGPTModel(config=config) model = TFOpenAIGPTModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output = model(inputs)[0] result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output = model(inputs)[0] result = model(inputs)
sequence_output = model(input_ids)[0] result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFOpenAIGPTLMHeadModel(config=config) model = TFOpenAIGPTLMHeadModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores = model(inputs)[0] result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_openai_gpt_double_head( def create_and_check_openai_gpt_double_head(
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
@@ -156,8 +149,7 @@ class TFOpenAIGPTModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
lm_logits, mc_logits = model(inputs)[:2] result = model(inputs)
result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
) )

View File

@@ -95,6 +95,7 @@ class TFRobertaModelTester:
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -104,31 +105,23 @@ class TFRobertaModelTester:
): ):
model = TFRobertaModel(config=config) model = TFRobertaModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output = model(inputs)[0] result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output = model(inputs)[0] result = model(inputs)
sequence_output = model(input_ids)[0] result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_roberta_for_masked_lm( def create_and_check_roberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFRobertaForMaskedLM(config=config) model = TFRobertaForMaskedLM(config=config)
prediction_scores = model([input_ids, input_mask, token_type_ids])[0] result = model([input_ids, input_mask, token_type_ids])
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_roberta_for_token_classification( def create_and_check_roberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -136,10 +129,7 @@ class TFRobertaModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFRobertaForTokenClassification(config=config) model = TFRobertaForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def create_and_check_roberta_for_question_answering( def create_and_check_roberta_for_question_answering(
@@ -147,11 +137,7 @@ class TFRobertaModelTester:
): ):
model = TFRobertaForQuestionAnswering(config=config) model = TFRobertaForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
@@ -168,10 +154,7 @@ class TFRobertaModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):

View File

@@ -78,6 +78,7 @@ class TFT5ModelTester:
bos_token_id=self.pad_token_id, bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id, pad_token_id=self.pad_token_id,
decoder_start_token_id=self.pad_token_id, decoder_start_token_id=self.pad_token_id,
return_dict=True,
) )
return (config, input_ids, input_mask, token_labels) return (config, input_ids, input_mask, token_labels)
@@ -89,22 +90,14 @@ class TFT5ModelTester:
"decoder_input_ids": input_ids, "decoder_input_ids": input_ids,
"decoder_attention_mask": input_mask, "decoder_attention_mask": input_mask,
} }
decoder_output, decoder_past, encoder_output = model(inputs) result = model(inputs)
decoder_output, decoder_past, encoder_output = model( result = model(input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids)
input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids decoder_output = result["last_hidden_state"]
) decoder_past = result["decoder_past_key_values"]
result = { encoder_output = result["encoder_last_hidden_state"]
"encoder_output": encoder_output.numpy(), self.parent.assertListEqual(list(encoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size])
"decoder_past": decoder_past, self.parent.assertListEqual(list(decoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size])
"decoder_output": decoder_output.numpy(),
}
self.parent.assertListEqual(
list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
)
self.parent.assertListEqual(
list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
)
self.parent.assertEqual(len(decoder_past), 2) self.parent.assertEqual(len(decoder_past), 2)
# decoder_past[0] should correspond to encoder output # decoder_past[0] should correspond to encoder output
self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output))) self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output)))
@@ -121,14 +114,9 @@ class TFT5ModelTester:
"decoder_attention_mask": input_mask, "decoder_attention_mask": input_mask,
} }
prediction_scores, _, _ = model(inputs_dict) result = model(inputs_dict)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask): def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask):
model = TFT5Model(config=config).get_decoder() model = TFT5Model(config=config).get_decoder()

View File

@@ -79,6 +79,7 @@ class TFTransfoXLModelTester:
div_val=self.div_val, div_val=self.div_val,
n_layer=self.num_hidden_layers, n_layer=self.num_hidden_layers,
eos_token_id=self.eos_token_id, eos_token_id=self.eos_token_id,
return_dict=True,
) )
return (config, input_ids_1, input_ids_2, lm_labels) return (config, input_ids_1, input_ids_2, lm_labels)
@@ -90,11 +91,11 @@ class TFTransfoXLModelTester:
def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
model = TFTransfoXLModel(config) model = TFTransfoXLModel(config)
hidden_states_1, mems_1 = model(input_ids_1) hidden_states_1, mems_1 = model(input_ids_1).to_tuple()
inputs = {"input_ids": input_ids_2, "mems": mems_1} inputs = {"input_ids": input_ids_2, "mems": mems_1}
hidden_states_2, mems_2 = model(inputs) hidden_states_2, mems_2 = model(inputs).to_tuple()
result = { result = {
"hidden_states_1": hidden_states_1.numpy(), "hidden_states_1": hidden_states_1.numpy(),
@@ -121,16 +122,16 @@ class TFTransfoXLModelTester:
def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
model = TFTransfoXLLMHeadModel(config) model = TFTransfoXLLMHeadModel(config)
lm_logits_1, mems_1 = model(input_ids_1) lm_logits_1, mems_1 = model(input_ids_1).to_tuple()
inputs = {"input_ids": input_ids_1, "labels": lm_labels} inputs = {"input_ids": input_ids_1, "labels": lm_labels}
_, mems_1 = model(inputs) _, mems_1 = model(inputs).to_tuple()
lm_logits_2, mems_2 = model([input_ids_2, mems_1]) lm_logits_2, mems_2 = model([input_ids_2, mems_1]).to_tuple()
inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels} inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}
_, mems_2 = model(inputs) _, mems_2 = model(inputs).to_tuple()
result = { result = {
"mems_1": [mem.numpy() for mem in mems_1], "mems_1": [mem.numpy() for mem in mems_1],

View File

@@ -112,6 +112,7 @@ class TFXLMModelTester:
summary_type=self.summary_type, summary_type=self.summary_type,
use_proj=self.use_proj, use_proj=self.use_proj,
bos_token_id=self.bos_token_id, bos_token_id=self.bos_token_id,
return_dict=True,
) )
return ( return (
@@ -140,16 +141,12 @@ class TFXLMModelTester:
): ):
model = TFXLMModel(config=config) model = TFXLMModel(config=config)
inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
outputs = model(inputs) result = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
outputs = model(inputs) result = model(inputs)
sequence_output = outputs[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_xlm_lm_head( def create_and_check_xlm_lm_head(
@@ -169,11 +166,7 @@ class TFXLMModelTester:
inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
outputs = model(inputs) outputs = model(inputs)
logits = outputs[0] result = outputs
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
@@ -193,12 +186,7 @@ class TFXLMModelTester:
inputs = {"input_ids": input_ids, "lengths": input_lengths} inputs = {"input_ids": input_ids, "lengths": input_lengths}
start_logits, end_logits = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
@@ -219,11 +207,7 @@ class TFXLMModelTester:
inputs = {"input_ids": input_ids, "lengths": input_lengths} inputs = {"input_ids": input_ids, "lengths": input_lengths}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
@@ -242,10 +226,7 @@ class TFXLMModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFXLMForTokenClassification(config=config) model = TFXLMForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def create_and_check_xlm_for_multiple_choice( def create_and_check_xlm_for_multiple_choice(
@@ -270,8 +251,7 @@ class TFXLMModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):

View File

@@ -36,7 +36,7 @@ class TFFlaubertModelIntegrationTest(unittest.TestCase):
"attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32),
} }
output = model(features)[0] output = model(features)["last_hidden_state"]
expected_shape = tf.TensorShape((1, 6, 768)) expected_shape = tf.TensorShape((1, 6, 768))
self.assertEqual(output.shape, expected_shape) self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice. # compare the actual values for a slice.

View File

@@ -110,6 +110,7 @@ class TFXLNetModelTester:
bos_token_id=self.bos_token_id, bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id, pad_token_id=self.pad_token_id,
eos_token_id=self.eos_token_id, eos_token_id=self.eos_token_id,
return_dict=True,
) )
return ( return (
@@ -147,17 +148,10 @@ class TFXLNetModelTester:
model = TFXLNetModel(config) model = TFXLNetModel(config)
inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids} inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids}
result = model(inputs)
_, _ = model(inputs)
inputs = [input_ids_1, input_mask] inputs = [input_ids_1, input_mask]
result = model(inputs)
outputs, mems_1 = model(inputs)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"outputs": outputs.numpy(),
}
config.mem_len = 0 config.mem_len = 0
model = TFXLNetModel(config) model = TFXLNetModel(config)
@@ -165,10 +159,10 @@ class TFXLNetModelTester:
self.parent.assertEqual(len(no_mems_outputs), 1) self.parent.assertEqual(len(no_mems_outputs), 1)
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
self.parent.assertListEqual( self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]), list(list(mem.shape) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
) )
@@ -189,16 +183,13 @@ class TFXLNetModelTester:
model = TFXLNetLMHeadModel(config) model = TFXLNetLMHeadModel(config)
inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids} inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids}
all_logits_1, mems_1 = model(inputs_1).to_tuple()
all_logits_1, mems_1 = model(inputs_1)
inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids} inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids}
all_logits_2, mems_2 = model(inputs_2).to_tuple()
all_logits_2, mems_2 = model(inputs_2)
inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping} inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping}
logits, _ = model(inputs_3).to_tuple()
logits, _ = model(inputs_3)
result = { result = {
"mems_1": [mem.numpy() for mem in mems_1], "mems_1": [mem.numpy() for mem in mems_1],
@@ -240,13 +231,7 @@ class TFXLNetModelTester:
model = TFXLNetForQuestionAnsweringSimple(config) model = TFXLNetForQuestionAnsweringSimple(config)
inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids} inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids}
start_logits, end_logits, mems = model(inputs) result = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
"mems": [m.numpy() for m in mems],
}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
@@ -271,16 +256,11 @@ class TFXLNetModelTester:
): ):
model = TFXLNetForSequenceClassification(config) model = TFXLNetForSequenceClassification(config)
logits, mems_1 = model(input_ids_1) result = model(input_ids_1)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
self.parent.assertListEqual( self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]), list(list(mem.shape) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
) )
@@ -305,16 +285,12 @@ class TFXLNetModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
# 'token_type_ids': token_type_ids # 'token_type_ids': token_type_ids
} }
logits, mems_1 = model(inputs) result = model(inputs)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"logits": logits.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels] list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels]
) )
self.parent.assertListEqual( self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]), list(list(mem.shape) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
) )
@@ -342,15 +318,11 @@ class TFXLNetModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits, mems_1) = model(inputs) result = model(inputs)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
self.parent.assertListEqual( self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]), list(list(mem.shape) for mem in result["mems"]),
[[self.seq_length, self.batch_size * self.num_choices, self.hidden_size]] * self.num_hidden_layers, [[self.seq_length, self.batch_size * self.num_choices, self.hidden_size]] * self.num_hidden_layers,
) )